Skip to content

Commit f0112f4

Browse files
authored
Merge pull request #376 from JohT/feature/add-community-detection-with-hierarchical-density-based-spation-clustering-hdbscan-
Add Hierarchical Density-Based Spatial Clustering (HDBSCAN) Community Detection
2 parents 2aaf22e + 8812822 commit f0112f4

13 files changed

+188
-18
lines changed

.github/workflows/internal-java-code-analysis.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,4 +119,5 @@ jobs:
119119
with:
120120
analysis-name: ${{ needs.prepare-code-to-analyze.outputs.analysis-name }}
121121
artifacts-upload-name: ${{ needs.prepare-code-to-analyze.outputs.artifacts-upload-name }}
122-
sources-upload-name: ${{ needs.prepare-code-to-analyze.outputs.sources-upload-name }}
122+
sources-upload-name: ${{ needs.prepare-code-to-analyze.outputs.sources-upload-name }}
123+
jupyter-pdf: "false"

.github/workflows/internal-typescript-code-analysis.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,4 +117,5 @@ jobs:
117117
uses: ./.github/workflows/public-analyze-code-graph.yml
118118
with:
119119
analysis-name: ${{ needs.prepare-code-to-analyze.outputs.analysis-name }}
120-
sources-upload-name: ${{ needs.prepare-code-to-analyze.outputs.sources-upload-name }}
120+
sources-upload-name: ${{ needs.prepare-code-to-analyze.outputs.sources-upload-name }}
121+
jupyter-pdf: "false"

.github/workflows/public-analyze-code-graph.yml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,12 @@ on:
5555
required: false
5656
type: number
5757
default: 5
58+
jupyter-pdf:
59+
description: >
60+
Enable PDF generation for Jupyter Notebooks ("true") or disable it ("false").
61+
required: false
62+
type: string
63+
default: 'true'
5864
outputs:
5965
uploaded-analysis-results:
6066
description: >
@@ -159,7 +165,7 @@ jobs:
159165
shell: bash -el {0}
160166
env:
161167
NEO4J_INITIAL_PASSWORD: ${{ steps.generate-neo4j-initial-password.outputs.neo4j-initial-password }}
162-
ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION: "true"
168+
ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION: ${{ inputs.jupyter-pdf }}
163169
IMPORT_GIT_LOG_DATA_IF_SOURCE_IS_PRESENT: "" # Options: "none", "aggregated", "full". default = "plugin" or ""
164170
PREPARE_CONDA_ENVIRONMENT: "false" # Had already been done in step with id "prepare-conda-environment".
165171
run: |
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - Estimate
2+
3+
CALL gds.hdbscan.write.estimate(
4+
$dependencies_projection + '-cleaned', {
5+
nodeProperty: $dependencies_projection_node_embeddings_property,
6+
writeProperty: $dependencies_projection_write_property,
7+
samples: 3
8+
})
9+
YIELD requiredMemory
10+
,nodeCount
11+
,relationshipCount
12+
,bytesMin
13+
,bytesMax
14+
,heapPercentageMin
15+
,heapPercentageMax
16+
,treeView
17+
,mapView
18+
RETURN requiredMemory
19+
,nodeCount
20+
,relationshipCount
21+
,bytesMin
22+
,bytesMax
23+
,heapPercentageMin
24+
,heapPercentageMax
25+
,treeView
26+
//,mapView //doesn't work on Windows with git bash jq version jq-1.7-dirty
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - Statistics
2+
3+
CALL gds.hdbscan.stats(
4+
$dependencies_projection + '-cleaned', {
5+
nodeProperty: $dependencies_projection_node_embeddings_property,
6+
samples: 3
7+
})
8+
YIELD nodeCount, numberOfClusters, numberOfNoisePoints, preProcessingMillis, computeMillis, postProcessingMillis
9+
RETURN nodeCount, numberOfClusters, numberOfNoisePoints, preProcessingMillis, computeMillis, postProcessingMillis
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - Mutate
2+
3+
CALL gds.hdbscan.mutate(
4+
$dependencies_projection + '-cleaned', {
5+
nodeProperty: $dependencies_projection_node_embeddings_property,
6+
mutateProperty: $dependencies_projection_write_property,
7+
samples: 3
8+
})
9+
YIELD nodeCount, numberOfClusters, numberOfNoisePoints, nodePropertiesWritten, preProcessingMillis, computeMillis, postProcessingMillis, mutateMillis
10+
RETURN nodeCount, numberOfClusters, numberOfNoisePoints, nodePropertiesWritten, preProcessingMillis, computeMillis, postProcessingMillis, mutateMillis
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - Stream
2+
3+
CALL gds.hdbscan.stream(
4+
$dependencies_projection + '-cleaned', {
5+
nodeProperty: $dependencies_projection_node_embeddings_property,
6+
samples: 3
7+
})
8+
YIELD nodeId, label
9+
WITH gds.util.asNode(nodeId) AS member
10+
,label
11+
WITH member
12+
,coalesce(member.fqn, member.fileName, member.name) AS memberName
13+
,label
14+
WITH count(DISTINCT member) AS memberCount
15+
,collect(DISTINCT memberName) AS memberNames
16+
,label
17+
RETURN memberCount
18+
,label
19+
,memberNames
20+
ORDER BY memberCount DESC, label ASC
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - write node property e.g. communityHdbscanLabel
2+
3+
CALL gds.hdbscan.write(
4+
$dependencies_projection + '-cleaned', {
5+
nodeProperty: $dependencies_projection_node_embeddings_property,
6+
writeProperty: $dependencies_projection_write_property,
7+
samples: 3
8+
})
9+
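// Samples = 3 turned out to be needed here. TODO: complete this note (the original sentence ends after "needed for" without stating the reason).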
10+
YIELD nodeCount
11+
,numberOfClusters
12+
,numberOfNoisePoints
13+
,preProcessingMillis
14+
,computeMillis
15+
,writeMillis
16+
,postProcessingMillis
17+
,nodePropertiesWritten
18+
RETURN nodeCount
19+
,numberOfClusters
20+
,numberOfNoisePoints
21+
,preProcessingMillis
22+
,computeMillis
23+
,writeMillis
24+
,postProcessingMillis
25+
,nodePropertiesWritten

scripts/executeJupyterNotebook.sh

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,10 @@
2020
# Fail on any error ("-e" = exit on first error, "-o pipefail" exits on errors within piped commands)
2121
set -o errexit -o pipefail
2222

23-
ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION=${ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION:-""} # Enable PDF generation for Jupyter Notebooks if set to any non empty value e.g. "true"
23+
ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION=${ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION:-""} # Enable PDF generation for Jupyter Notebooks if set to any non empty value like "true" or disable it with "" or "false".
24+
if [ "${ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION}" == "false" ]; then
25+
ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION="" # Reset PDF generation if explicitly set to false
26+
fi
2427

2528
## Get this "scripts" directory if not already set
2629
# Even though $BASH_SOURCE was made for Bourne-like shells, it is also supported by others and is therefore the preferred solution here.

scripts/reports/CommunityCsv.sh

Lines changed: 73 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,72 @@ detectCommunitiesWithKCoreDecomposition() {
242242
calculateCommunityMetrics "${@}" "${writePropertyName}"
243243
}
244244

245+
# Node Embeddings using Fast Random Projection
246+
#
247+
# Required Parameters:
248+
# - dependencies_projection=...
249+
# Name prefix for the in-memory projection name for dependencies. Example: "package"
250+
# - dependencies_projection_node=...
251+
# Label of the nodes that will be used for the projection. Example: "Package"
252+
# - dependencies_projection_weight_property=...
253+
# Name of the node property that contains the dependency weight. Example: "weight"
254+
# - dependencies_projection_node_embeddings_property=...
255+
# Name of the node property that will contain the node embeddings. Example: "embeddingsFastRandomProjection2dHDBSCAN"
256+
nodeEmbeddingsWithFastRandomProjectionForHDBSCAN() {
257+
local embeddingProperty
258+
embeddingProperty=$( extractQueryParameter "dependencies_projection_node_embeddings_property" "${@}")
259+
260+
local NODE_EMBEDDINGS_CYPHER_DIR="${CYPHER_DIR}/Node_Embeddings"
261+
local mutatePropertyName="dependencies_projection_write_property=${embeddingProperty}"
262+
local embeddingsDimension="dependencies_projection_embedding_dimension=2"
263+
264+
# Run the algorithm and write the result into the in-memory projection ("mutate")
265+
execute_cypher "${NODE_EMBEDDINGS_CYPHER_DIR}/Node_Embeddings_1c_Fast_Random_Projection_Mutate.cypher" "${@}" "${mutatePropertyName}" ${embeddingsDimension}
266+
}
267+
268+
# Community Detection using Hierarchical Density-Based Spatial Clustering (HDBSCAN) Algorithm
269+
#
270+
# Required Parameters:
271+
# - dependencies_projection=...
272+
# Name prefix for the in-memory projection name for dependencies. Example: "package"
273+
# - dependencies_projection_node=...
274+
# Label of the nodes that will be used for the projection. Example: "Package"
275+
# - dependencies_projection_weight_property=...
276+
# Name of the node property that contains the dependency weight. Example: "weight"
277+
#
278+
# Special Requirements:
279+
# - This algorithm needs a node property with an array of floats to compute clusters.
280+
# One possible way is to use node embeddings for that (like FastRP).
281+
detectCommunitiesWithHDBSCAN() {
282+
local COMMUNITY_DETECTION_CYPHER_DIR="${CYPHER_DIR}/Community_Detection"
283+
local PROJECTION_CYPHER_DIR="${CYPHER_DIR}/Dependencies_Projection"
284+
285+
local writePropertyName="dependencies_projection_write_property=communityFastRpHdbscanLabel"
286+
local writeLabelName="dependencies_projection_write_label=HDBSCAN"
287+
local embeddingProperty="dependencies_projection_node_embeddings_property=embeddingsFastRandomProjection2dHDBSCAN"
288+
289+
nodeEmbeddingsWithFastRandomProjectionForHDBSCAN "${@}" ${embeddingProperty}
290+
291+
# Memory estimation and statistics
292+
execute_cypher "${COMMUNITY_DETECTION_CYPHER_DIR}/Community_Detection_11a_HDBSCAN_Estimate.cypher" "${@}" ${embeddingProperty} "${writePropertyName}"
293+
execute_cypher "${COMMUNITY_DETECTION_CYPHER_DIR}/Community_Detection_11b_HDBSCAN_Statistics.cypher" "${@}" ${embeddingProperty}
294+
295+
# Run the algorithm and write the result into the in-memory projection ("mutate")
296+
execute_cypher "${COMMUNITY_DETECTION_CYPHER_DIR}/Community_Detection_11c_HDBSCAN_Mutate.cypher" "${@}" ${embeddingProperty} "${writePropertyName}"
297+
298+
# Stream to CSV
299+
local nodeLabel
300+
nodeLabel=$( extractQueryParameter "dependencies_projection_node" "${@}")
301+
execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_8_Stream_Mutated_Grouped.cypher" "${@}" "${writePropertyName}" > "${FULL_REPORT_DIRECTORY}/${nodeLabel}_Communities_HDBSCAN.csv"
302+
303+
# Update Graph (node properties and labels) using the already mutated property projection
304+
execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_9_Write_Mutated.cypher" "${@}" "${writePropertyName}"
305+
execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_10_Delete_Label.cypher" "${@}" "${writePropertyName}" "${writeLabelName}"
306+
execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_11_Add_Label.cypher" "${@}" "${writePropertyName}" "${writeLabelName}"
307+
308+
calculateCommunityMetrics "${@}" "${writePropertyName}"
309+
}
310+
245311
# Community Detection using the Approximate Maximum k-cut Algorithm
246312
#
247313
# Required Parameters:
@@ -402,6 +468,7 @@ detectCommunities() {
402468
time detectCommunitiesWithKCoreDecomposition "${@}"
403469
time detectCommunitiesWithApproximateMaximumKCut "${@}"
404470
time calculateLocalClusteringCoefficient "${@}"
471+
405472
compareCommunityDetectionResults "${@}"
406473
listAllResults "${@}"
407474
}
@@ -415,7 +482,7 @@ ARTIFACT_GAMMA="dependencies_leiden_gamma=1.11" # default = 1.00
415482
ARTIFACT_KCUT="dependencies_maxkcut=5" # default = 2
416483

417484
if createUndirectedDependencyProjection "${ARTIFACT_PROJECTION}" "${ARTIFACT_NODE}" "${ARTIFACT_WEIGHT}"; then
418-
detectCommunities "${ARTIFACT_PROJECTION}" "${ARTIFACT_NODE}" "${ARTIFACT_WEIGHT}" "${ARTIFACT_GAMMA}" "${ARTIFACT_KCUT}"
485+
detectCommunities "${ARTIFACT_PROJECTION}" "${ARTIFACT_NODE}" "${ARTIFACT_WEIGHT}" "${ARTIFACT_GAMMA}" "${ARTIFACT_KCUT}" # "${ARTIFACT_NODE_EMBEDDINGS}"
419486
writeLeidenModularity "${ARTIFACT_PROJECTION}" "${ARTIFACT_NODE}" "${ARTIFACT_WEIGHT}"
420487
fi
421488

@@ -430,7 +497,9 @@ PACKAGE_KCUT="dependencies_maxkcut=20" # default = 2
430497
if createUndirectedDependencyProjection "${PACKAGE_PROJECTION}" "${PACKAGE_NODE}" "${PACKAGE_WEIGHT}"; then
431498
detectCommunities "${PACKAGE_PROJECTION}" "${PACKAGE_NODE}" "${PACKAGE_WEIGHT}" "${PACKAGE_GAMMA}" "${PACKAGE_KCUT}"
432499
writeLeidenModularity "${PACKAGE_PROJECTION}" "${PACKAGE_NODE}" "${PACKAGE_WEIGHT}"
433-
500+
501+
detectCommunitiesWithHDBSCAN "${PACKAGE_PROJECTION}" "${PACKAGE_NODE}" "${PACKAGE_WEIGHT}"
502+
434503
# Package Community Detection - Special CSV Queries after update
435504
execute_cypher "${CYPHER_DIR}/Community_Detection/Which_package_community_spans_several_artifacts_and_how_are_the_packages_distributed.cypher" > "${FULL_REPORT_DIRECTORY}/Package_Communities_Leiden_That_Span_Multiple_Artifacts.csv"
436505
fi
@@ -444,8 +513,8 @@ TYPE_GAMMA="dependencies_leiden_gamma=5.00" # default = 1.00
444513
TYPE_KCUT="dependencies_maxkcut=100" # default = 2
445514

446515
if createUndirectedJavaTypeDependencyProjection "${TYPE_PROJECTION}"; then
447-
detectCommunities "${TYPE_PROJECTION}" "${TYPE_NODE}" "${TYPE_WEIGHT}" "${TYPE_GAMMA}" "${TYPE_KCUT}"
448-
516+
detectCommunities "${TYPE_PROJECTION}" "${TYPE_NODE}" "${TYPE_WEIGHT}" "${TYPE_GAMMA}" "${TYPE_KCUT}" "${TYPE_NODE_EMBEDDINGS}"
517+
detectCommunitiesWithHDBSCAN "${TYPE_PROJECTION}" "${TYPE_NODE}" "${TYPE_WEIGHT}"
449518
# Type Community Detection - Special CSV Queries after update
450519
execute_cypher "${CYPHER_DIR}/Community_Detection/Which_type_community_spans_several_artifacts_and_how_are_the_types_distributed.cypher" > "${FULL_REPORT_DIRECTORY}/Type_Communities_Leiden_That_Span_Multiple_Artifacts.csv"
451520
execute_cypher "${CYPHER_DIR}/Community_Detection/Type_communities_with_few_members_in_foreign_packages.cypher" > "${FULL_REPORT_DIRECTORY}/Type_communities_with_few_members_in_foreign_packages.csv"

0 commit comments

Comments
 (0)