3 changes: 2 additions & 1 deletion .github/workflows/internal-java-code-analysis.yml
@@ -119,4 +119,5 @@ jobs:
    with:
      analysis-name: ${{ needs.prepare-code-to-analyze.outputs.analysis-name }}
      artifacts-upload-name: ${{ needs.prepare-code-to-analyze.outputs.artifacts-upload-name }}
-      sources-upload-name: ${{ needs.prepare-code-to-analyze.outputs.sources-upload-name }}
+      sources-upload-name: ${{ needs.prepare-code-to-analyze.outputs.sources-upload-name }}
+      jupyter-pdf: "false"
3 changes: 2 additions & 1 deletion .github/workflows/internal-typescript-code-analysis.yml
@@ -117,4 +117,5 @@ jobs:
    uses: ./.github/workflows/public-analyze-code-graph.yml
    with:
      analysis-name: ${{ needs.prepare-code-to-analyze.outputs.analysis-name }}
-      sources-upload-name: ${{ needs.prepare-code-to-analyze.outputs.sources-upload-name }}
+      sources-upload-name: ${{ needs.prepare-code-to-analyze.outputs.sources-upload-name }}
+      jupyter-pdf: "false"
8 changes: 7 additions & 1 deletion .github/workflows/public-analyze-code-graph.yml
@@ -55,6 +55,12 @@ on:
        required: false
        type: number
        default: 5
+      jupyter-pdf:
+        description: >
+          Enable PDF generation for Jupyter Notebooks ("true") or disable it ("false").
+        required: false
+        type: string
+        default: 'true'
    outputs:
      uploaded-analysis-results:
        description: >
@@ -159,7 +165,7 @@ jobs:
        shell: bash -el {0}
        env:
          NEO4J_INITIAL_PASSWORD: ${{ steps.generate-neo4j-initial-password.outputs.neo4j-initial-password }}
-          ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION: "true"
+          ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION: ${{ inputs.jupyter-pdf }}
          IMPORT_GIT_LOG_DATA_IF_SOURCE_IS_PRESENT: "" # Options: "none", "aggregated", "full". default = "plugin" or ""
          PREPARE_CONDA_ENVIRONMENT: "false" # Had already been done in step with id "prepare-conda-environment".
        run: |
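Callers of the reusable workflow can now opt out of PDF generation per invocation, exactly as the two internal workflows above do. For reference, a minimal caller job could look like this sketch (job name and analysis name are illustrative):

    jobs:
      analyze-code-graph:
        uses: ./.github/workflows/public-analyze-code-graph.yml
        with:
          analysis-name: my-analysis # illustrative value
          jupyter-pdf: "false"       # skip PDF rendering of the Jupyter Notebook reports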
26 changes: 26 additions & 0 deletions cypher/Community_Detection/Community_Detection_11a_HDBSCAN_Estimate.cypher
@@ -0,0 +1,26 @@
// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - Estimate

CALL gds.hdbscan.write.estimate(
$dependencies_projection + '-cleaned', {
nodeProperty: $dependencies_projection_node_embeddings_property,
writeProperty: $dependencies_projection_write_property,
samples: 3
})
YIELD requiredMemory
,nodeCount
,relationshipCount
,bytesMin
,bytesMax
,heapPercentageMin
,heapPercentageMax
,treeView
,mapView
RETURN requiredMemory
,nodeCount
,relationshipCount
,bytesMin
,bytesMax
,heapPercentageMin
,heapPercentageMax
,treeView
//,mapView //doesn't work on Windows with git bash jq version jq-1.7-dirty
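These queries receive their inputs as Cypher query parameters. Outside of the pipeline's execute_cypher helper they could be run directly with cypher-shell, roughly as in this sketch (connection settings and parameter values are illustrative):

    cypher-shell -a neo4j://localhost:7687 -u neo4j -p "${NEO4J_INITIAL_PASSWORD}" \
      --param "dependencies_projection => 'package'" \
      --param "dependencies_projection_node_embeddings_property => 'embeddingsFastRandomProjection2dHDBSCAN'" \
      --param "dependencies_projection_write_property => 'communityFastRpHdbscanLabel'" \
      --file cypher/Community_Detection/Community_Detection_11a_HDBSCAN_Estimate.cypher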
9 changes: 9 additions & 0 deletions cypher/Community_Detection/Community_Detection_11b_HDBSCAN_Statistics.cypher
@@ -0,0 +1,9 @@
// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - Statistics

CALL gds.hdbscan.stats(
$dependencies_projection + '-cleaned', {
nodeProperty: $dependencies_projection_node_embeddings_property,
samples: 3
})
YIELD nodeCount, numberOfClusters, numberOfNoisePoints, preProcessingMillis, computeMillis, postProcessingMillis
RETURN nodeCount, numberOfClusters, numberOfNoisePoints, preProcessingMillis, computeMillis, postProcessingMillis
10 changes: 10 additions & 0 deletions cypher/Community_Detection/Community_Detection_11c_HDBSCAN_Mutate.cypher
@@ -0,0 +1,10 @@
// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - Mutate

CALL gds.hdbscan.mutate(
$dependencies_projection + '-cleaned', {
nodeProperty: $dependencies_projection_node_embeddings_property,
mutateProperty: $dependencies_projection_write_property,
samples: 3
})
YIELD nodeCount, numberOfClusters, numberOfNoisePoints, nodePropertiesWritten, preProcessingMillis, computeMillis, postProcessingMillis, mutateMillis
RETURN nodeCount, numberOfClusters, numberOfNoisePoints, nodePropertiesWritten, preProcessingMillis, computeMillis, postProcessingMillis, mutateMillis
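The mutate variant only updates the in-memory projection. CommunityCsv.sh below persists the mutated property afterwards with its generic Dependencies_9_Write_Mutated.cypher query; conceptually, that step boils down to the standard GDS call in this sketch (projection and property names are illustrative):

    // Sketch: persist a mutated node property from the projection to the database
    CALL gds.graph.nodeProperties.write(
      'package-cleaned',
      ['communityFastRpHdbscanLabel']
    )
    YIELD propertiesWritten
    RETURN propertiesWritten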
20 changes: 20 additions & 0 deletions cypher/Community_Detection/…
@@ -0,0 +1,20 @@
// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - Stream

CALL gds.hdbscan.stream(
$dependencies_projection + '-cleaned', {
nodeProperty: $dependencies_projection_node_embeddings_property,
samples: 3
})
YIELD nodeId, label
WITH gds.util.asNode(nodeId) AS member
,label
WITH member
,coalesce(member.fqn, member.fileName, member.name) AS memberName
,label
WITH count(DISTINCT member) AS memberCount
,collect(DISTINCT memberName) AS memberNames
,label
RETURN memberCount
,label
,memberNames
ORDER BY memberCount DESC, label ASC
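Since Cypher aggregations group implicitly by every non-aggregated expression in the same clause, the final WITH above computes the member count and name list once per label. A condensed equivalent with literal values in place of parameters (the projection name is illustrative; the embeddings property matches the one configured in CommunityCsv.sh below):

    CALL gds.hdbscan.stream('package-cleaned', {
        nodeProperty: 'embeddingsFastRandomProjection2dHDBSCAN',
        samples: 3
    })
    YIELD nodeId, label
    WITH label, gds.util.asNode(nodeId) AS member
    RETURN label
          ,count(DISTINCT member) AS memberCount
          ,collect(DISTINCT coalesce(member.fqn, member.fileName, member.name)) AS memberNames
    ORDER BY memberCount DESC, label ASC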
25 changes: 25 additions & 0 deletions cypher/Community_Detection/…
@@ -0,0 +1,25 @@
// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - write node property e.g. communityHdbscanLabel

CALL gds.hdbscan.write(
$dependencies_projection + '-cleaned', {
nodeProperty: $dependencies_projection_node_embeddings_property,
writeProperty: $dependencies_projection_write_property,
samples: 3
})
// Samples = 3 turned out to be needed for
YIELD nodeCount
,numberOfClusters
,numberOfNoisePoints
,preProcessingMillis
,computeMillis
,writeMillis
,postProcessingMillis
,nodePropertiesWritten
RETURN nodeCount
,numberOfClusters
,numberOfNoisePoints
,preProcessingMillis
,computeMillis
,writeMillis
,postProcessingMillis
,nodePropertiesWritten
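After the write step, every clustered node carries the configured community label property. A hedged spot-check (the property name is the one set in CommunityCsv.sh below; Package is one of the documented example node labels):

    // Sketch: size distribution of the written HDBSCAN communities
    MATCH (package:Package)
    WHERE package.communityFastRpHdbscanLabel IS NOT NULL
    RETURN package.communityFastRpHdbscanLabel AS communityLabel
          ,count(*) AS memberCount
    ORDER BY memberCount DESC
    LIMIT 10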
5 changes: 4 additions & 1 deletion scripts/executeJupyterNotebook.sh
@@ -20,7 +20,10 @@
# Fail on any error ("-e" = exit on first error, "-o pipefail" = exit on errors within piped commands)
set -o errexit -o pipefail

-ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION=${ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION:-""} # Enable PDF generation for Jupyter Notebooks if set to any non empty value e.g. "true"
+ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION=${ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION:-""} # Enable PDF generation for Jupyter Notebooks if set to any non-empty value like "true", or disable it with "" or "false".
+if [ "${ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION}" == "false" ]; then
+    ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION="" # Reset PDF generation if it is explicitly set to "false"
+fi

## Get this "scripts" directory if not already set
# Even if $BASH_SOURCE is made for Bourne-like shells, it is also supported by other shells and is therefore the preferred solution here.
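With this guard, setting the variable to "false" behaves exactly like leaving it empty. A hypothetical single run without PDF generation (the notebook path argument is illustrative):

    ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION="false" ./scripts/executeJupyterNotebook.sh ./jupyter/ExampleNotebook.ipynb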
77 changes: 73 additions & 4 deletions scripts/reports/CommunityCsv.sh
@@ -242,6 +242,72 @@ detectCommunitiesWithKCoreDecomposition() {
    calculateCommunityMetrics "${@}" "${writePropertyName}"
}

# Node Embeddings using Fast Random Projection
#
# Required Parameters:
# - dependencies_projection=...
#   Name prefix for the in-memory projection name for dependencies. Example: "package"
# - dependencies_projection_node=...
#   Label of the nodes that will be used for the projection. Example: "Package"
# - dependencies_projection_weight_property=...
#   Name of the node property that contains the dependency weight. Example: "weight"
# - dependencies_projection_node_embeddings_property=...
#   Name of the node property that will contain the node embeddings. Example: "embeddingsFastRandomProjectionForHDBSCAN"
nodeEmbeddingsWithFastRandomProjectionForHDBSCAN() {
    local embeddingProperty
    embeddingProperty=$( extractQueryParameter "dependencies_projection_node_embeddings_property" "${@}")

    local NODE_EMBEDDINGS_CYPHER_DIR="${CYPHER_DIR}/Node_Embeddings"
    local mutatePropertyName="dependencies_projection_write_property=${embeddingProperty}"
    local embeddingsDimension="dependencies_projection_embedding_dimension=2"

    # Run the algorithm and write the result into the in-memory projection ("mutate")
    execute_cypher "${NODE_EMBEDDINGS_CYPHER_DIR}/Node_Embeddings_1c_Fast_Random_Projection_Mutate.cypher" "${@}" "${mutatePropertyName}" ${embeddingsDimension}
}

# Community Detection using the Hierarchical Density-Based Spatial Clustering (HDBSCAN) Algorithm
#
# Required Parameters:
# - dependencies_projection=...
#   Name prefix for the in-memory projection name for dependencies. Example: "package"
# - dependencies_projection_node=...
#   Label of the nodes that will be used for the projection. Example: "Package"
# - dependencies_projection_weight_property=...
#   Name of the node property that contains the dependency weight. Example: "weight"
#
# Special Requirements:
# - This algorithm needs a node property with an array of floats to compute clusters.
#   One possible way is to use node embeddings for that (like FastRP).
detectCommunitiesWithHDBSCAN() {
    local COMMUNITY_DETECTION_CYPHER_DIR="${CYPHER_DIR}/Community_Detection"
    local PROJECTION_CYPHER_DIR="${CYPHER_DIR}/Dependencies_Projection"

    local writePropertyName="dependencies_projection_write_property=communityFastRpHdbscanLabel"
    local writeLabelName="dependencies_projection_write_label=HDBSCAN"
    local embeddingProperty="dependencies_projection_node_embeddings_property=embeddingsFastRandomProjection2dHDBSCAN"

    nodeEmbeddingsWithFastRandomProjectionForHDBSCAN "${@}" ${embeddingProperty}

    # Estimate the required memory and calculate statistics
    execute_cypher "${COMMUNITY_DETECTION_CYPHER_DIR}/Community_Detection_11a_HDBSCAN_Estimate.cypher" "${@}" ${embeddingProperty} "${writePropertyName}"
    execute_cypher "${COMMUNITY_DETECTION_CYPHER_DIR}/Community_Detection_11b_HDBSCAN_Statistics.cypher" "${@}" ${embeddingProperty}

    # Run the algorithm and write the result into the in-memory projection ("mutate")
    execute_cypher "${COMMUNITY_DETECTION_CYPHER_DIR}/Community_Detection_11c_HDBSCAN_Mutate.cypher" "${@}" ${embeddingProperty} "${writePropertyName}"

    # Stream to CSV
    local nodeLabel
    nodeLabel=$( extractQueryParameter "dependencies_projection_node" "${@}")
    execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_8_Stream_Mutated_Grouped.cypher" "${@}" "${writePropertyName}" > "${FULL_REPORT_DIRECTORY}/${nodeLabel}_Communities_HDBSCAN.csv"

    # Update the graph (node properties and labels) using the already mutated property projection
    execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_9_Write_Mutated.cypher" "${@}" "${writePropertyName}"
    execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_10_Delete_Label.cypher" "${@}" "${writePropertyName}" "${writeLabelName}"
    execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_11_Add_Label.cypher" "${@}" "${writePropertyName}" "${writeLabelName}"

    calculateCommunityMetrics "${@}" "${writePropertyName}"
}

# Community Detection using the Approximate Maximum k-cut Algorithm
#
# Required Parameters:
@@ -402,6 +468,7 @@ detectCommunities() {
    time detectCommunitiesWithKCoreDecomposition "${@}"
    time detectCommunitiesWithApproximateMaximumKCut "${@}"
    time calculateLocalClusteringCoefficient "${@}"

    compareCommunityDetectionResults "${@}"
    listAllResults "${@}"
}
@@ -415,7 +482,7 @@ ARTIFACT_GAMMA="dependencies_leiden_gamma=1.11" # default = 1.00
ARTIFACT_KCUT="dependencies_maxkcut=5" # default = 2

if createUndirectedDependencyProjection "${ARTIFACT_PROJECTION}" "${ARTIFACT_NODE}" "${ARTIFACT_WEIGHT}"; then
-    detectCommunities "${ARTIFACT_PROJECTION}" "${ARTIFACT_NODE}" "${ARTIFACT_WEIGHT}" "${ARTIFACT_GAMMA}" "${ARTIFACT_KCUT}"
+    detectCommunities "${ARTIFACT_PROJECTION}" "${ARTIFACT_NODE}" "${ARTIFACT_WEIGHT}" "${ARTIFACT_GAMMA}" "${ARTIFACT_KCUT}" # "${ARTIFACT_NODE_EMBEDDINGS}"
    writeLeidenModularity "${ARTIFACT_PROJECTION}" "${ARTIFACT_NODE}" "${ARTIFACT_WEIGHT}"
fi

@@ -430,7 +497,9 @@ PACKAGE_KCUT="dependencies_maxkcut=20" # default = 2
if createUndirectedDependencyProjection "${PACKAGE_PROJECTION}" "${PACKAGE_NODE}" "${PACKAGE_WEIGHT}"; then
    detectCommunities "${PACKAGE_PROJECTION}" "${PACKAGE_NODE}" "${PACKAGE_WEIGHT}" "${PACKAGE_GAMMA}" "${PACKAGE_KCUT}"
    writeLeidenModularity "${PACKAGE_PROJECTION}" "${PACKAGE_NODE}" "${PACKAGE_WEIGHT}"

+    detectCommunitiesWithHDBSCAN "${PACKAGE_PROJECTION}" "${PACKAGE_NODE}" "${PACKAGE_WEIGHT}"

    # Package Community Detection - Special CSV Queries after update
    execute_cypher "${CYPHER_DIR}/Community_Detection/Which_package_community_spans_several_artifacts_and_how_are_the_packages_distributed.cypher" > "${FULL_REPORT_DIRECTORY}/Package_Communities_Leiden_That_Span_Multiple_Artifacts.csv"
fi
@@ -444,8 +513,8 @@ TYPE_GAMMA="dependencies_leiden_gamma=5.00" # default = 1.00
TYPE_KCUT="dependencies_maxkcut=100" # default = 2

if createUndirectedJavaTypeDependencyProjection "${TYPE_PROJECTION}"; then
-    detectCommunities "${TYPE_PROJECTION}" "${TYPE_NODE}" "${TYPE_WEIGHT}" "${TYPE_GAMMA}" "${TYPE_KCUT}"
+    detectCommunities "${TYPE_PROJECTION}" "${TYPE_NODE}" "${TYPE_WEIGHT}" "${TYPE_GAMMA}" "${TYPE_KCUT}" "${TYPE_NODE_EMBEDDINGS}"
+    detectCommunitiesWithHDBSCAN "${TYPE_PROJECTION}" "${TYPE_NODE}" "${TYPE_WEIGHT}"
    # Type Community Detection - Special CSV Queries after update
    execute_cypher "${CYPHER_DIR}/Community_Detection/Which_type_community_spans_several_artifacts_and_how_are_the_types_distributed.cypher" > "${FULL_REPORT_DIRECTORY}/Type_Communities_Leiden_That_Span_Multiple_Artifacts.csv"
    execute_cypher "${CYPHER_DIR}/Community_Detection/Type_communities_with_few_members_in_foreign_packages.cypher" > "${FULL_REPORT_DIRECTORY}/Type_communities_with_few_members_in_foreign_packages.csv"
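The detection functions above take their settings as positional "name=value" arguments and forward them to execute_cypher. A hypothetical direct call, assuming the script's helper functions and environment (CYPHER_DIR, FULL_REPORT_DIRECTORY) are loaded and using the example values from the parameter documentation:

    detectCommunitiesWithHDBSCAN \
        "dependencies_projection=package" \
        "dependencies_projection_node=Package" \
        "dependencies_projection_weight_property=weight"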
2 changes: 0 additions & 2 deletions scripts/reports/compilations/CsvReports.sh
@@ -17,8 +17,6 @@ LOG_GROUP_END=${LOG_GROUP_END:-"::endgroup::"} # Prefix to end a log group. Defa
# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
# This way non-standard tools like readlink aren't needed.
REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )}
-echo "CsvReports: REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR}"
-
REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR:-$(dirname -- "${REPORT_COMPILATIONS_SCRIPT_DIR}")}
echo "CsvReports: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}"
12 changes: 6 additions & 6 deletions scripts/reports/compilations/JupyterReports.sh
@@ -20,18 +20,18 @@ LOG_GROUP_END=${LOG_GROUP_END:-"::endgroup::"} # Prefix to end a log group. Defa
# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
# This way non-standard tools like readlink aren't needed.
REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )}
-echo "JupyterReports: REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR}"

REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR:-$(dirname -- "${REPORT_COMPILATIONS_SCRIPT_DIR}")}
-echo "JupyterReports: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}"

# Get the "scripts" directory by taking the scripts report path and going one directory up.
SCRIPTS_DIR=${SCRIPTS_DIR:-$(dirname -- "${REPORTS_SCRIPT_DIR}")}
-echo "JupyterReports: SCRIPTS_DIR=${SCRIPTS_DIR}"

# Get the "jupyter" directory by taking the path of the scripts directory, going up one directory and then changing into "jupyter".
JUPYTER_NOTEBOOK_DIRECTORY=${JUPYTER_NOTEBOOK_DIRECTORY:-"${SCRIPTS_DIR}/../jupyter"} # Repository directory containing the Jupyter Notebooks

+echo "${LOG_GROUP_START}Initialize Jupyter Notebook Reports";
+echo "JupyterReports: REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR}"
+echo "JupyterReports: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}"
+echo "JupyterReports: SCRIPTS_DIR=${SCRIPTS_DIR}"
+echo "JupyterReports: JUPYTER_NOTEBOOK_DIRECTORY=${JUPYTER_NOTEBOOK_DIRECTORY}"
+echo "${LOG_GROUP_END}";

# Run all Jupyter notebooks
for jupyter_notebook_file in "${JUPYTER_NOTEBOOK_DIRECTORY}"/*.ipynb; do
6 changes: 4 additions & 2 deletions scripts/reports/compilations/VisualizationReports.sh
@@ -20,10 +20,12 @@ LOG_GROUP_END=${LOG_GROUP_END:-"::endgroup::"} # Prefix to end a log group. Defa
# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
# This way non-standard tools like readlink aren't needed.
REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )}
-echo "VisualizationReports: REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR}"

REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR:-$(dirname -- "${REPORT_COMPILATIONS_SCRIPT_DIR}")}

+echo "${LOG_GROUP_START}Initialize Visualization Reports";
+echo "VisualizationReports: REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR}"
+echo "VisualizationReports: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}"
+echo "${LOG_GROUP_END}";

# Run all visualization scripts
for visualization_script_file in "${REPORTS_SCRIPT_DIR}"/*Visualization.sh; do