From 6d98bea73d31f3eb685a76394c7e81ca38453275 Mon Sep 17 00:00:00 2001 From: JohT Date: Wed, 9 Aug 2023 20:51:47 +0200 Subject: [PATCH 01/15] Prepare incoming & outgoing artifacts dependencies --- .../Incoming_Artifact_Dependencies.cypher | 13 +++++++++++++ .../Outgoing_Artifact_Dependencies.cypher | 13 +++++++++++++ scripts/prepareAnalysis.sh | 5 +++++ 3 files changed, 31 insertions(+) create mode 100644 cypher/Artifact_Dependencies/Incoming_Artifact_Dependencies.cypher create mode 100644 cypher/Artifact_Dependencies/Outgoing_Artifact_Dependencies.cypher diff --git a/cypher/Artifact_Dependencies/Incoming_Artifact_Dependencies.cypher b/cypher/Artifact_Dependencies/Incoming_Artifact_Dependencies.cypher new file mode 100644 index 000000000..a13e2d33a --- /dev/null +++ b/cypher/Artifact_Dependencies/Incoming_Artifact_Dependencies.cypher @@ -0,0 +1,13 @@ +// Incoming Artifact Dependencies + + MATCH (a:Artifact:Archive) +OPTIONAL MATCH (a)<-[r:DEPENDS_ON]-(ea:Artifact:Archive) + WHERE a.fileName <> ea.fileName + WITH a + ,COUNT(ea) AS incomingDependencies + ,SUM(r.weight) AS incomingDependenciesWeight + SET a.incomingDependencies = incomingDependencies + ,a.incomingDependenciesWeight = incomingDependenciesWeight + RETURN a.fileName + ,incomingDependencies + ,incomingDependenciesWeight \ No newline at end of file diff --git a/cypher/Artifact_Dependencies/Outgoing_Artifact_Dependencies.cypher b/cypher/Artifact_Dependencies/Outgoing_Artifact_Dependencies.cypher new file mode 100644 index 000000000..f0a46e979 --- /dev/null +++ b/cypher/Artifact_Dependencies/Outgoing_Artifact_Dependencies.cypher @@ -0,0 +1,13 @@ +// Outgoing Artifact Dependencies + + MATCH (a:Artifact:Archive) +OPTIONAL MATCH (a)-[r:DEPENDS_ON]->(ea:Artifact:Archive) + WHERE a.fileName <> ea.fileName + WITH a + ,COUNT(ea) AS outgoingDependencies + ,SUM(r.weight) AS outgoingDependenciesWeight + SET a.outgoingDependencies = outgoingDependencies + ,a.outgoingDependenciesWeight = outgoingDependenciesWeight + 
RETURN a.fileName + ,outgoingDependencies + ,outgoingDependenciesWeight \ No newline at end of file diff --git a/scripts/prepareAnalysis.sh b/scripts/prepareAnalysis.sh index 676a10313..d7be07744 100644 --- a/scripts/prepareAnalysis.sh +++ b/scripts/prepareAnalysis.sh @@ -28,6 +28,7 @@ source "${SCRIPTS_DIR}/executeQueryFunctions.sh" PACKAGE_WEIGHTS_CYPHER_DIR="$CYPHER_DIR/Package_Relationship_Weights" PACKAGE_METRICS_CYPHER_DIR="$CYPHER_DIR/Metrics" EXTERNAL_DEPENDENCIES_CYPHER_DIR="$CYPHER_DIR/External_Dependencies" +ARTIFACT_DEPENDENCIES_CYPHER_DIR="$CYPHER_DIR/Artifact_Dependencies" # Preparation - Create indizes execute_cypher "${CYPHER_DIR}/Create_index_for_full_qualified_type_name.cypher" || exit 1 @@ -51,3 +52,7 @@ execute_cypher_expect_results "${PACKAGE_METRICS_CYPHER_DIR}/Set_Outgoing_Packag # "annoatation" means that there is a ANNOTATED_BY to that external type execute_cypher "${EXTERNAL_DEPENDENCIES_CYPHER_DIR}/Remove_external_type_and_annotation_labels.cypher" || exit 1 execute_cypher "${EXTERNAL_DEPENDENCIES_CYPHER_DIR}/Label_external_types_and_annotations.cypher" || exit 1 + +# Preparation - Add Artifact node properties "incomingDependencies" and "outgoingDependencies" +execute_cypher_expect_results "${ARTIFACT_DEPENDENCIES_CYPHER_DIR}/Incoming_Artifact_Dependencies.cypher" || exit 1 +execute_cypher_expect_results "${ARTIFACT_DEPENDENCIES_CYPHER_DIR}/Outgoing_Artifact_Dependencies.cypher" || exit 1 \ No newline at end of file From b64128c3f9ae73fc0f547f8571529993a25cfc3c Mon Sep 17 00:00:00 2001 From: JohT Date: Wed, 9 Aug 2023 20:52:09 +0200 Subject: [PATCH 02/15] Add Leiden community detection for artifacts --- ...unity_Detection_0_Delete_Projection.cypher | 2 +- ...unity_Detection_0_Delete_Projection.cypher | 5 ++ ...nity_Detection_0b_Delete_Projection.cypher | 5 ++ ...tion_1_Create_undirected_Projection.cypher | 15 ++++++ ...te_subgraph_without_empty_artifacts.cypher | 10 ++++ ..._Detection_2_Leiden_Estimate_Memory.cypher | 23 ++++++++ 
...unity_Detection_3_Leiden_Statistics.cypher | 25 +++++++++ ...Community_Detection_4_Leiden_Stream.cypher | 17 ++++++ ...en_Write_property_leidenCommunityId.cypher | 36 +++++++++++++ ..._Detection_6_Delete_Existing_Labels.cypher | 9 ++++ ...idenCommunity_Id_label_to_artifacts.cypher | 12 +++++ ...tection_8_Check_Leiden_Community_Id.cypher | 3 ++ ...ts_with_a_Community_Detection_Label.cypher | 5 ++ scripts/reports/ArtifactCommunityCsv.sh | 53 +++++++++++++++++++ .../reports/ArtifactDependenciesJupyter.sh | 15 +++++- 15 files changed, 233 insertions(+), 2 deletions(-) create mode 100644 cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_0_Delete_Projection.cypher create mode 100644 cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_0b_Delete_Projection.cypher create mode 100644 cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_1_Create_undirected_Projection.cypher create mode 100644 cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_1b_Create_subgraph_without_empty_artifacts.cypher create mode 100644 cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_2_Leiden_Estimate_Memory.cypher create mode 100644 cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_3_Leiden_Statistics.cypher create mode 100644 cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_4_Leiden_Stream.cypher create mode 100644 cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_5_Leiden_Write_property_leidenCommunityId.cypher create mode 100644 cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_6_Delete_Existing_Labels.cypher create mode 100644 cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_7_Add_ArtifactLeidenCommunity_Id_label_to_artifacts.cypher create mode 100644 cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_8_Check_Leiden_Community_Id.cypher create mode 100644 
cypher/Community_Detection_Leiden_for_Artifacts/Get_all_Artifacts_with_a_Community_Detection_Label.cypher create mode 100755 scripts/reports/ArtifactCommunityCsv.sh diff --git a/cypher/Community_Detection_Leiden/Community_Detection_0_Delete_Projection.cypher b/cypher/Community_Detection_Leiden/Community_Detection_0_Delete_Projection.cypher index 2f689daae..35554ac9d 100644 --- a/cypher/Community_Detection_Leiden/Community_Detection_0_Delete_Projection.cypher +++ b/cypher/Community_Detection_Leiden/Community_Detection_0_Delete_Projection.cypher @@ -1,5 +1,5 @@ //Community Detection 0 Delete Projection - CALL gds.graph.drop('package-dependencies' + CALL gds.graph.drop('package-dependencies', false) YIELD graphName, nodeCount, relationshipCount, creationTime, modificationTime RETURN graphName, nodeCount, relationshipCount, creationTime, modificationTime \ No newline at end of file diff --git a/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_0_Delete_Projection.cypher b/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_0_Delete_Projection.cypher new file mode 100644 index 000000000..c223aaacd --- /dev/null +++ b/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_0_Delete_Projection.cypher @@ -0,0 +1,5 @@ +//Community Detection 0 Delete Projection + + CALL gds.graph.drop('artifact-dependencies', false) + YIELD graphName, nodeCount, relationshipCount, creationTime, modificationTime +RETURN graphName, nodeCount, relationshipCount, creationTime, modificationTime \ No newline at end of file diff --git a/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_0b_Delete_Projection.cypher b/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_0b_Delete_Projection.cypher new file mode 100644 index 000000000..2757e95ca --- /dev/null +++ b/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_0b_Delete_Projection.cypher @@ -0,0 +1,5 @@ +//Community Detection 0b Delete Projection + + 
CALL gds.graph.drop('artifact-dependencies-without-empty', false) + YIELD graphName, nodeCount, relationshipCount, creationTime, modificationTime +RETURN graphName, nodeCount, relationshipCount, creationTime, modificationTime \ No newline at end of file diff --git a/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_1_Create_undirected_Projection.cypher b/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_1_Create_undirected_Projection.cypher new file mode 100644 index 000000000..5db35470c --- /dev/null +++ b/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_1_Create_undirected_Projection.cypher @@ -0,0 +1,15 @@ +//Community Detection 1 Create undirected Projection + +CALL gds.graph.project('artifact-dependencies', 'Artifact', + { + DEPENDS_ON: { + orientation: 'UNDIRECTED' + } + }, + { + relationshipProperties: ['weight'], + nodeProperties: ['incomingDependencies', 'outgoingDependencies'] + } +) + YIELD graphName, nodeCount, relationshipCount +RETURN graphName, nodeCount, relationshipCount \ No newline at end of file diff --git a/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_1b_Create_subgraph_without_empty_artifacts.cypher b/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_1b_Create_subgraph_without_empty_artifacts.cypher new file mode 100644 index 000000000..47b0adcdf --- /dev/null +++ b/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_1b_Create_subgraph_without_empty_artifacts.cypher @@ -0,0 +1,10 @@ +//Community Detection 1b Create subgraph without empty artifacts + +CALL gds.beta.graph.project.subgraph( + 'artifact-dependencies-without-empty', + 'artifact-dependencies', + 'n.outgoingDependencies > 0 OR n.incomingDependencies > 0', + '*' +) + YIELD graphName, fromGraphName, nodeCount, relationshipCount, nodeFilter +RETURN graphName, fromGraphName, nodeCount, relationshipCount, nodeFilter \ No newline at end of file diff --git 
a/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_2_Leiden_Estimate_Memory.cypher b/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_2_Leiden_Estimate_Memory.cypher new file mode 100644 index 000000000..e7271a086 --- /dev/null +++ b/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_2_Leiden_Estimate_Memory.cypher @@ -0,0 +1,23 @@ +//Community Detection 2 Leiden Estimate Memory + +CALL gds.beta.leiden.write.estimate('artifact-dependencies-without-empty', { + gamma: 1.11, + theta: 0.001, + consecutiveIds: true, + relationshipWeightProperty: 'weight', + writeProperty: 'leidenCommunityId' +}) +YIELD nodeCount + ,relationshipCount + ,bytesMin + ,bytesMax + ,heapPercentageMin + ,heapPercentageMax + ,treeView +RETURN nodeCount + ,relationshipCount + ,bytesMin + ,bytesMax + ,heapPercentageMin + ,heapPercentageMax + ,treeView \ No newline at end of file diff --git a/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_3_Leiden_Statistics.cypher b/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_3_Leiden_Statistics.cypher new file mode 100644 index 000000000..7200b4b59 --- /dev/null +++ b/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_3_Leiden_Statistics.cypher @@ -0,0 +1,25 @@ +//Community Detection 3 Leiden Statistics + +CALL gds.beta.leiden.stats('artifact-dependencies-without-empty', { + gamma: 1.11, + theta: 0.001, + includeIntermediateCommunities: true, + relationshipWeightProperty: 'weight' +}) +YIELD communityCount + ,ranLevels + ,modularity + ,modularities + ,communityDistribution +RETURN communityCount + ,ranLevels + ,modularity + ,modularities + ,communityDistribution.min + ,communityDistribution.mean + ,communityDistribution.max + ,communityDistribution.p50 + ,communityDistribution.p75 + ,communityDistribution.p90 + ,communityDistribution.p95 + ,communityDistribution.p99 \ No newline at end of file diff --git 
a/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_4_Leiden_Stream.cypher b/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_4_Leiden_Stream.cypher new file mode 100644 index 000000000..09fb72892 --- /dev/null +++ b/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_4_Leiden_Stream.cypher @@ -0,0 +1,17 @@ +//Community Detection 4 Leiden Stream + +CALL gds.beta.leiden.stream('artifact-dependencies-without-empty', { + gamma: 1.11, + theta: 0.001, + includeIntermediateCommunities: true, + relationshipWeightProperty: 'weight' +}) + YIELD nodeId, communityId, intermediateCommunityIds + WITH communityId + ,intermediateCommunityIds + ,gds.util.asNode(nodeId) AS artifact +RETURN intermediateCommunityIds[0] AS firstCommunityId + ,communityId AS finalCommunityId + ,COUNT(DISTINCT artifact) AS countOfMembers + ,collect(DISTINCT replace(last(split(artifact.fileName, '/')), '.jar', '')) AS artifactNames + ORDER BY countOfMembers DESC, communityId ASC \ No newline at end of file diff --git a/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_5_Leiden_Write_property_leidenCommunityId.cypher b/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_5_Leiden_Write_property_leidenCommunityId.cypher new file mode 100644 index 000000000..095c1b2ae --- /dev/null +++ b/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_5_Leiden_Write_property_leidenCommunityId.cypher @@ -0,0 +1,36 @@ +//Community Detection 5 Leiden Write property leidenCommunityId + +CALL gds.beta.leiden.write('artifact-dependencies-without-empty', { + gamma: 1.11, + theta: 0.001, + consecutiveIds: true, + relationshipWeightProperty: 'weight', + writeProperty: 'leidenCommunityId' +}) +YIELD preProcessingMillis + ,computeMillis + ,writeMillis + ,postProcessingMillis + ,nodePropertiesWritten + ,communityCount + ,ranLevels + ,modularity + ,modularities + ,communityDistribution +RETURN preProcessingMillis + ,computeMillis + 
,writeMillis + ,postProcessingMillis + ,nodePropertiesWritten + ,communityCount + ,ranLevels + ,modularity + ,communityDistribution.min + ,communityDistribution.mean + ,communityDistribution.max + ,communityDistribution.p50 + ,communityDistribution.p75 + ,communityDistribution.p90 + ,communityDistribution.p95 + ,communityDistribution.p99 + ,modularities \ No newline at end of file diff --git a/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_6_Delete_Existing_Labels.cypher b/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_6_Delete_Existing_Labels.cypher new file mode 100644 index 000000000..2e95af7aa --- /dev/null +++ b/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_6_Delete_Existing_Labels.cypher @@ -0,0 +1,9 @@ +//Community Detection 6 Delete Existing Labels + + CALL db.labels() YIELD label + WHERE label STARTS WITH "ArtifactLeiden" + WITH collect(label) AS labels + MATCH (artifact:Artifact) + WITH collect(artifact) AS artifacts, labels + CALL apoc.create.removeLabels(artifacts, labels) YIELD node +RETURN COUNT(node) AS nodesCount; \ No newline at end of file diff --git a/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_7_Add_ArtifactLeidenCommunity_Id_label_to_artifacts.cypher b/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_7_Add_ArtifactLeidenCommunity_Id_label_to_artifacts.cypher new file mode 100644 index 000000000..1d0a4419b --- /dev/null +++ b/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_7_Add_ArtifactLeidenCommunity_Id_label_to_artifacts.cypher @@ -0,0 +1,12 @@ +//Community Detection 7 Add ArtifactLeidenCommunity+Id label to artifacts +//with more than one member + + MATCH (artifact:Artifact:Archive) + WITH artifact.leidenCommunityId AS communityId + ,collect(artifact) AS artifacts + ,COUNT(DISTINCT artifact.fileName) AS members + ,'ArtifactLeidenCommunity' + toString(artifact.leidenCommunityId) AS labelName + WHERE members > 1 
+UNWIND artifacts AS artifact + CALL apoc.create.addLabels(artifact, [labelName]) YIELD node +RETURN COUNT(node) as nodesCount \ No newline at end of file diff --git a/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_8_Check_Leiden_Community_Id.cypher b/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_8_Check_Leiden_Community_Id.cypher new file mode 100644 index 000000000..a269dbe27 --- /dev/null +++ b/cypher/Community_Detection_Leiden_for_Artifacts/Community_Detection_8_Check_Leiden_Community_Id.cypher @@ -0,0 +1,3 @@ +// Community Detection 8 Check Leiden Community Id + +MATCH (a:Artifact) WHERE a.leidenCommunityId IS NOT NULL RETURN a.leidenCommunityId LIMIT 1 \ No newline at end of file diff --git a/cypher/Community_Detection_Leiden_for_Artifacts/Get_all_Artifacts_with_a_Community_Detection_Label.cypher b/cypher/Community_Detection_Leiden_for_Artifacts/Get_all_Artifacts_with_a_Community_Detection_Label.cypher new file mode 100644 index 000000000..a3743aa8b --- /dev/null +++ b/cypher/Community_Detection_Leiden_for_Artifacts/Get_all_Artifacts_with_a_Community_Detection_Label.cypher @@ -0,0 +1,5 @@ +// Get all Artifacts with a Community Detection Label + +MATCH (artifact:Artifact) +WHERE any(label IN labels(artifact) WHERE label CONTAINS 'Community') +RETURN DISTINCT artifact; \ No newline at end of file diff --git a/scripts/reports/ArtifactCommunityCsv.sh b/scripts/reports/ArtifactCommunityCsv.sh new file mode 100755 index 000000000..1c250c596 --- /dev/null +++ b/scripts/reports/ArtifactCommunityCsv.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash + +# Detects communities using the Graph Data Science Library of Neo4j and creates CSV reports. +# It requires an already running Neo4j graph database with already scanned analyzed artifacts. +# The reports (csv files) will be written into the sub directory reports/community. +# Note that "scripts/prepareAnalysis.sh" is required to run prior to this script. 
+ +# Requires executeQueryFunctions.sh + +# Overrideable Constants (defaults also defined in sub scripts) +REPORTS_DIRECTORY=${REPORTS_DIRECTORY:-"reports"} + +## Get this "scripts/reports" directory if not already set +# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution. +# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes. +# This way non-standard tools like readlink aren't needed. +REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} +echo "artifactCommunityCsv: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}" + +# Get the "scripts" directory by taking the path of this script and going one directory up. +SCRIPTS_DIR=${SCRIPTS_DIR:-"${REPORTS_SCRIPT_DIR}/.."} # Repository directory containing the shell scripts +echo "artifactCommunityCsv: SCRIPTS_DIR=${SCRIPTS_DIR}" + +# Get the "cypher" directory by taking the path of this script and going two directory up and then to "cypher". 
+CYPHER_DIR=${CYPHER_DIR:-"${REPORTS_SCRIPT_DIR}/../../cypher"} +echo "artifactCommunityCsv: CYPHER_DIR=$CYPHER_DIR" + +# Define functions to execute a cypher query from within the given file (first and only argument) +source "${SCRIPTS_DIR}/executeQueryFunctions.sh" + +# Create report directory +REPORT_NAME="artifact-community-csv" +FULL_REPORT_DIRECTORY="${REPORTS_DIRECTORY}/${REPORT_NAME}" +mkdir -p "${FULL_REPORT_DIRECTORY}" + +# Local Constants +LEIDEN_CYPHER_DIR="$CYPHER_DIR/Community_Detection_Leiden_for_Artifacts" + +# Preparation for Community Detection - Create package dependencies projections +execute_cypher "${LEIDEN_CYPHER_DIR}/Community_Detection_0_Delete_Projection.cypher" +execute_cypher "${LEIDEN_CYPHER_DIR}/Community_Detection_0b_Delete_Projection.cypher" +execute_cypher "${LEIDEN_CYPHER_DIR}/Community_Detection_1_Create_undirected_Projection.cypher" +execute_cypher "${LEIDEN_CYPHER_DIR}/Community_Detection_1b_Create_subgraph_without_empty_artifacts.cypher" + +# Community Detection using the Leiden Algorithm - Query CSV +execute_cypher "${LEIDEN_CYPHER_DIR}/Community_Detection_2_Leiden_Estimate_Memory.cypher" +execute_cypher "${LEIDEN_CYPHER_DIR}/Community_Detection_3_Leiden_Statistics.cypher" +execute_cypher "${LEIDEN_CYPHER_DIR}/Community_Detection_4_Leiden_Stream.cypher" > "${FULL_REPORT_DIRECTORY}/Leiden_Communities.csv" + +# Community Detection using the Leiden Algorithm - Update Graph +execute_cypher "${LEIDEN_CYPHER_DIR}/Community_Detection_5_Leiden_Write_property_leidenCommunityId.cypher" +execute_cypher "${LEIDEN_CYPHER_DIR}/Community_Detection_6_Delete_Existing_Labels.cypher" +execute_cypher "${LEIDEN_CYPHER_DIR}/Community_Detection_7_Add_ArtifactLeidenCommunity_Id_label_to_artifacts.cypher" \ No newline at end of file diff --git a/scripts/reports/ArtifactDependenciesJupyter.sh b/scripts/reports/ArtifactDependenciesJupyter.sh index ab38049a2..29d10cf54 100755 --- a/scripts/reports/ArtifactDependenciesJupyter.sh +++ 
b/scripts/reports/ArtifactDependenciesJupyter.sh @@ -3,7 +3,7 @@ # Creates the "artifact-dependencies" report (ipynb, md, pdf) based on the Jupyter Notebook "ArtifactDependencies.ipynb". # It contains the hierarchical artifact dependencies graph -# Requires executeJupyterNotebook.sh +# Requires executeJupyterNotebook.sh, AritfactCommunityCsv.sh # Overrideable Constants (defaults also defined in sub scripts) REPORTS_DIRECTORY=${REPORTS_DIRECTORY:-"reports"} @@ -23,10 +23,23 @@ echo "ArtifactDependenciesJupyter: SCRIPTS_DIR=${SCRIPTS_DIR}" JUPYTER_NOTEBOOK_DIRECTORY=${JUPYTER_NOTEBOOK_DIRECTORY:-"${SCRIPTS_DIR}/../jupyter"} # Repository directory containing the Jupyter Notebooks echo "ArtifactDependenciesJupyter: JUPYTER_NOTEBOOK_DIRECTORY=$JUPYTER_NOTEBOOK_DIRECTORY" +# Get the "cypher" directory by taking the path of this script and going two directory up and then to "cypher". +CYPHER_DIR=${CYPHER_DIR:-"${REPORTS_SCRIPT_DIR}/../../cypher"} +echo "ArtifactDependenciesJupyter CYPHER_DIR=${CYPHER_DIR}" + +# Define functions to execute cypher queries from within a given file +source "${SCRIPTS_DIR}/executeQueryFunctions.sh" + +# Local Constants +LEIDEN_CYPHER_DIR="$CYPHER_DIR/Community_Detection_Leiden_for_Artifacts" + # Create report directory REPORT_NAME="artifact-dependencies" FULL_REPORT_DIRECTORY="${REPORTS_DIRECTORY}/${REPORT_NAME}" mkdir -p "${FULL_REPORT_DIRECTORY}" +# Dependency: Assure that artifacts have a Leiden Community Id (written by "AritfactCommunityCsv.sh") +execute_cypher_expect_results "${LEIDEN_CYPHER_DIR}/Community_Detection_8_Check_Leiden_Community_Id.cypher" + # Execute and convert the Jupyter Notebook "ArtifactDependencies.ipynb" within the given reports directory (cd "${FULL_REPORT_DIRECTORY}" && exec ${SCRIPTS_DIR}/executeJupyterNotebook.sh ${JUPYTER_NOTEBOOK_DIRECTORY}/ArtifactDependencies.ipynb) || exit 1 \ No newline at end of file From a25c14ca9b9bfe74f9baf9b6001d4bdf09b4b9bc Mon Sep 17 00:00:00 2001 From: JohT Date: Mon, 11 Sep 2023 
20:48:27 +0200 Subject: [PATCH 03/15] Add topological sorted artifacts report --- ..._for_directed_artifact_dependencies.cypher | 5 ++ ..._for_directed_artifact_dependencies.cypher | 5 ++ .../1_Create_directed_Projection.cypher | 9 ++++ ...ed_subgraph_without_empty_artifacts.cypher | 10 ++++ .../3_Topological_Sort_Artifacts.cypher | 16 ++++++ ..._Topological_Sort_Artifacts_as_list.cypher | 21 ++++++++ ...uery_artifacts_in_topological_order.cypher | 14 ++++++ ..._Depth_First_Search_Artifacts_Index.cypher | 46 +++++++++++++++++ scripts/reports/ArtifactTopologicalSortCsv.sh | 49 +++++++++++++++++++ 9 files changed, 175 insertions(+) create mode 100644 cypher/Topological_Sort_Artifacts/0_Delete_Projections_for_directed_artifact_dependencies.cypher create mode 100644 cypher/Topological_Sort_Artifacts/0b_Delete_Projections_for_directed_artifact_dependencies.cypher create mode 100644 cypher/Topological_Sort_Artifacts/1_Create_directed_Projection.cypher create mode 100644 cypher/Topological_Sort_Artifacts/2_Create_directed_subgraph_without_empty_artifacts.cypher create mode 100644 cypher/Topological_Sort_Artifacts/3_Topological_Sort_Artifacts.cypher create mode 100644 cypher/Topological_Sort_Artifacts/3b_Topological_Sort_Artifacts_as_list.cypher create mode 100644 cypher/Topological_Sort_Artifacts/4_Query_artifacts_in_topological_order.cypher create mode 100644 cypher/Topological_Sort_Artifacts/5_Experimental_Depth_First_Search_Artifacts_Index.cypher create mode 100755 scripts/reports/ArtifactTopologicalSortCsv.sh diff --git a/cypher/Topological_Sort_Artifacts/0_Delete_Projections_for_directed_artifact_dependencies.cypher b/cypher/Topological_Sort_Artifacts/0_Delete_Projections_for_directed_artifact_dependencies.cypher new file mode 100644 index 000000000..685f11973 --- /dev/null +++ b/cypher/Topological_Sort_Artifacts/0_Delete_Projections_for_directed_artifact_dependencies.cypher @@ -0,0 +1,5 @@ +//0 Delete Projections for directed artifact dependencies + + CALL 
gds.graph.drop('artifact-dependencies-directed', false) + YIELD graphName, nodeCount, relationshipCount, creationTime, modificationTime +RETURN graphName, nodeCount, relationshipCount, creationTime, modificationTime \ No newline at end of file diff --git a/cypher/Topological_Sort_Artifacts/0b_Delete_Projections_for_directed_artifact_dependencies.cypher b/cypher/Topological_Sort_Artifacts/0b_Delete_Projections_for_directed_artifact_dependencies.cypher new file mode 100644 index 000000000..c621d56ac --- /dev/null +++ b/cypher/Topological_Sort_Artifacts/0b_Delete_Projections_for_directed_artifact_dependencies.cypher @@ -0,0 +1,5 @@ +//0b Delete Projections for directed artifact dependencies + + CALL gds.graph.drop('artifact-dependencies-directed-without-empty', false) + YIELD graphName, nodeCount, relationshipCount, creationTime, modificationTime +RETURN graphName, nodeCount, relationshipCount, creationTime, modificationTime \ No newline at end of file diff --git a/cypher/Topological_Sort_Artifacts/1_Create_directed_Projection.cypher b/cypher/Topological_Sort_Artifacts/1_Create_directed_Projection.cypher new file mode 100644 index 000000000..2f2164e03 --- /dev/null +++ b/cypher/Topological_Sort_Artifacts/1_Create_directed_Projection.cypher @@ -0,0 +1,9 @@ +//1 Create directed Projection +CALL gds.graph.project('artifact-dependencies-directed', 'Artifact', 'DEPENDS_ON', + { + relationshipProperties: ['weight'], + nodeProperties: ['incomingDependencies', 'outgoingDependencies'] + } +) + YIELD graphName, nodeCount, relationshipCount +RETURN graphName, nodeCount, relationshipCount \ No newline at end of file diff --git a/cypher/Topological_Sort_Artifacts/2_Create_directed_subgraph_without_empty_artifacts.cypher b/cypher/Topological_Sort_Artifacts/2_Create_directed_subgraph_without_empty_artifacts.cypher new file mode 100644 index 000000000..85b5e1841 --- /dev/null +++ b/cypher/Topological_Sort_Artifacts/2_Create_directed_subgraph_without_empty_artifacts.cypher @@ -0,0 
+1,10 @@ +//2 Create directed subgraph without empty artifacts + +CALL gds.beta.graph.project.subgraph( + 'artifact-dependencies-directed-without-empty', + 'artifact-dependencies-directed', + 'n.outgoingDependencies > 0 OR n.incomingDependencies > 0', + '*' +) + YIELD graphName, fromGraphName, nodeCount, relationshipCount, nodeFilter +RETURN graphName, fromGraphName, nodeCount, relationshipCount, nodeFilter \ No newline at end of file diff --git a/cypher/Topological_Sort_Artifacts/3_Topological_Sort_Artifacts.cypher b/cypher/Topological_Sort_Artifacts/3_Topological_Sort_Artifacts.cypher new file mode 100644 index 000000000..9454a1bbb --- /dev/null +++ b/cypher/Topological_Sort_Artifacts/3_Topological_Sort_Artifacts.cypher @@ -0,0 +1,16 @@ +//3 Topological Sort Artifacts +//Needs graph-data-science plugin version >= 2.5.0 + +CALL gds.dag.topologicalSort.stream('artifact-dependencies-directed-without-empty',{ + computeMaxDistanceFromSource: true +}) YIELD nodeId, maxDistanceFromSource + WITH nodeId + ,gds.util.asNode(nodeId) AS artifact + ,toInteger(maxDistanceFromSource) AS maxDistanceFromSource + SET artifact.maxDistanceFromSource = maxDistanceFromSource + WITH COLLECT(nodeId) AS sortedNodeIds + ,COLLECT(artifact) AS sortedArtifacts + ,MAX(maxDistanceFromSource) AS overallMaxDistance +FOREACH (i IN RANGE(0, SIZE(sortedArtifacts)-1) | + SET gds.util.asNode(sortedNodeIds[i]).topologicalSortIndex = i) + RETURN size(sortedArtifacts) AS numberOfArtifacts, overallMaxDistance \ No newline at end of file diff --git a/cypher/Topological_Sort_Artifacts/3b_Topological_Sort_Artifacts_as_list.cypher b/cypher/Topological_Sort_Artifacts/3b_Topological_Sort_Artifacts_as_list.cypher new file mode 100644 index 000000000..6b0a9e729 --- /dev/null +++ b/cypher/Topological_Sort_Artifacts/3b_Topological_Sort_Artifacts_as_list.cypher @@ -0,0 +1,21 @@ +//3 Topological Sort Artifacts as list +//Needs graph-data-science plugin version >= 2.5.0 + +CALL 
gds.dag.topologicalSort.stream('artifact-dependencies-directed-without-empty',{ + computeMaxDistanceFromSource: true +}) YIELD nodeId, maxDistanceFromSource + WITH nodeId + ,gds.util.asNode(nodeId) AS artifact + ,toInteger(maxDistanceFromSource) AS maxDistanceFromSource + SET artifact.maxDistanceFromSource = maxDistanceFromSource + WITH COLLECT(nodeId) AS sortedNodeIds + ,COLLECT({artifact: artifact, maxDistanceFromSource: maxDistanceFromSource}) AS topologicalSortedArtifacts + ,MAX(maxDistanceFromSource) AS overallMaxDistance +FOREACH (i IN RANGE(0, SIZE(sortedNodeIds)-1) | + SET gds.util.asNode(sortedNodeIds[i]).topologicalSortIndex = i) + WITH topologicalSortedArtifacts + ,overallMaxDistance + UNWIND topologicalSortedArtifacts AS topologicalSortedArtifact + RETURN replace(last(split(topologicalSortedArtifact.artifact.fileName, '/')), '.jar', '') AS artifactName + ,topologicalSortedArtifact.maxDistanceFromSource AS buildLevel + ,overallMaxDistance AS overalNumberOfBuildLevels \ No newline at end of file diff --git a/cypher/Topological_Sort_Artifacts/4_Query_artifacts_in_topological_order.cypher b/cypher/Topological_Sort_Artifacts/4_Query_artifacts_in_topological_order.cypher new file mode 100644 index 000000000..abda069f9 --- /dev/null +++ b/cypher/Topological_Sort_Artifacts/4_Query_artifacts_in_topological_order.cypher @@ -0,0 +1,14 @@ +//4 Artifacts in topological order + +MATCH (artifact:Artifact) +WHERE artifact.topologicalSortIndex IS NOT NULL + WITH COLLECT(artifact) AS artifacts + ,MAX(artifact.maxDistanceFromSource) AS maxBuildLevel +UNWIND artifacts AS artifact +RETURN replace(last(split(artifact.fileName, '/')), '.jar', '') AS artifactName + ,artifact.topologicalSortIndex AS topologicalSortIndex + ,artifact.maxDistanceFromSource AS buildLevel + ,maxBuildLevel + ,artifact.incomingDependencies AS incomingDependencies + ,artifact.outgoingDependencies AS outgoingDependencies +ORDER BY artifact.topologicalSortIndex \ No newline at end of file diff --git 
a/cypher/Topological_Sort_Artifacts/5_Experimental_Depth_First_Search_Artifacts_Index.cypher b/cypher/Topological_Sort_Artifacts/5_Experimental_Depth_First_Search_Artifacts_Index.cypher new file mode 100644 index 000000000..12a355c5f --- /dev/null +++ b/cypher/Topological_Sort_Artifacts/5_Experimental_Depth_First_Search_Artifacts_Index.cypher @@ -0,0 +1,46 @@ +//5 Experimental Depth First Search Artifacts Index +//depthFirstSearchLevel is not correct + +// Depth First Search starting from a node with no incoming dependencies +MATCH (source:Artifact{fileName:'/axon-configuration-4.8.0.jar'}) + CALL gds.dfs.stream('artifact-dependencies-directed-without-empty', { + sourceNode: source +}) + YIELD nodeIds +// Generate an index to iterate through the searched nodes +UNWIND range(0, size(nodeIds)-1) AS nodeIndex + WITH nodeIndex + ,nodeIds + ,gds.util.asNodes(nodeIds) AS searchedNodes + WITH nodeIndex + ,nodeIds + ,searchedNodes + ,searchedNodes[nodeIndex] AS indexedNode + // Get the previous node to be able to detect where depth first search went back + ,CASE WHEN nodeIndex > 0 + THEN searchedNodes[nodeIndex - 1] + ELSE NULL + END AS previousNode +// Get the parent node of the indexed one + OPTIONAL MATCH (indexedNode)<-[:DEPENDS_ON]-(parent:Artifact) + WITH nodeIndex + ,nodeIds + ,searchedNodes + ,indexedNode + ,previousNode + ,COLLECT(parent.fileName) AS parentFilenames + ,(previousNode IN COLLECT(parent)) AS previousIsParent + ,COLLECT(apoc.coll.indexOf(searchedNodes[0..nodeIndex], parent)) AS previousParentIndizes + ,apoc.coll.max(COLLECT(apoc.coll.indexOf(searchedNodes[0..nodeIndex], parent))) + 1 AS topologyLevel + // Set the property 'depthFirstSearchIndex' to the index + // TODO Set 'depthFirstSearchLevel' relative to the level of the parent, not its dfs index + SET indexedNode.depthFirstSearchIndex = nodeIndex + ,indexedNode.depthFirstSearchLevel = topologyLevel +RETURN indexedNode.fileName + ,nodeIndex + ,previousNode.fileName + ,previousIsParent + 
,previousParentIndizes + ,topologyLevel + ,parentFilenames +//FOREACH (i IN RANGE(0, SIZE(nodeIds)-1) | SET gds.util.asNode(nodeIds[i]).depthFirstSearchIndex = i) \ No newline at end of file diff --git a/scripts/reports/ArtifactTopologicalSortCsv.sh b/scripts/reports/ArtifactTopologicalSortCsv.sh new file mode 100755 index 000000000..6ff4fa0ab --- /dev/null +++ b/scripts/reports/ArtifactTopologicalSortCsv.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash + +# Applies the Topological Sorting algorithm to order the artifacts by their dependencies (build order/level) using Graph Data Science Library of Neo4j and creates CSV reports. +# It requires an already running Neo4j graph database with already scanned analyzed artifacts. +# The reports (csv files) will be written into the sub directory reports/artifact-topology-csv. +# Note that "scripts/prepareAnalysis.sh" is required to run prior to this script. + +# Requires executeQueryFunctions.sh + +# Overrideable constants (defaults also defined in sub scripts) +REPORTS_DIRECTORY=${REPORTS_DIRECTORY:-"reports"} + +## Get this "scripts/reports" directory if not already set +# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution. +# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes. +# This way non-standard tools like readlink aren't needed. +REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} +echo "artifactTopologicalSortCsv: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}" + +# Get the "scripts" directory by taking the path of this script and going one directory up. +SCRIPTS_DIR=${SCRIPTS_DIR:-"${REPORTS_SCRIPT_DIR}/.."} # Repository directory containing the shell scripts +echo "artifactTopologicalSortCsv: SCRIPTS_DIR=${SCRIPTS_DIR}" + +# Get the "cypher" directory by taking the path of this script and going two directory up and then to "cypher". 
+CYPHER_DIR=${CYPHER_DIR:-"${REPORTS_SCRIPT_DIR}/../../cypher"} +echo "artifactTopologicalSortCsv: CYPHER_DIR=$CYPHER_DIR" + +# Define functions to execute a cypher query from within the given file (first and only argument) +source "${SCRIPTS_DIR}/executeQueryFunctions.sh" + +# Create report directory +REPORT_NAME="artifact-topology-csv" +FULL_REPORT_DIRECTORY="${REPORTS_DIRECTORY}/${REPORT_NAME}" +mkdir -p "${FULL_REPORT_DIRECTORY}" + +# Local Constants +TOPOLOGICAL_SORT_DIR="$CYPHER_DIR/Topological_Sort_Artifacts" + +# Preparation for Topological Sort - Create artifact dependencies projections +execute_cypher "${TOPOLOGICAL_SORT_DIR}/0_Delete_Projections_for_directed_artifact_dependencies.cypher" +execute_cypher "${TOPOLOGICAL_SORT_DIR}/0b_Delete_Projections_for_directed_artifact_dependencies.cypher" +execute_cypher "${TOPOLOGICAL_SORT_DIR}/1_Create_directed_Projection.cypher" +execute_cypher "${TOPOLOGICAL_SORT_DIR}/2_Create_directed_subgraph_without_empty_artifacts.cypher" + +# Topological Sort Artifacts +execute_cypher "${TOPOLOGICAL_SORT_DIR}/3_Topological_Sort_Artifacts.cypher" + +# Query topological sorted Artifacts (CSV) +execute_cypher "${TOPOLOGICAL_SORT_DIR}/4_Query_artifacts_in_topological_order.cypher" > "${FULL_REPORT_DIRECTORY}/TopologicalSortedArtifacts.csv" \ No newline at end of file From 0c45702a5f19660c9a9c93de508a1ee8d03a1988 Mon Sep 17 00:00:00 2001 From: JohT Date: Fri, 18 Aug 2023 09:10:06 +0200 Subject: [PATCH 04/15] Fix report output directory comments --- scripts/reports/ArtifactCommunityCsv.sh | 2 +- scripts/reports/CentralityCsv.sh | 2 +- scripts/reports/CommunityCsv.sh | 2 +- scripts/reports/ExternalDependenciesCsv.sh | 2 +- scripts/reports/InternalDependenciesCsv.sh | 2 +- scripts/reports/ObjectOrientedDesignMetricsCsv.sh | 2 +- scripts/reports/SimilarityCsv.sh | 2 +- scripts/reports/VisibilityMetricsCsv.sh | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/reports/ArtifactCommunityCsv.sh 
b/scripts/reports/ArtifactCommunityCsv.sh index 1c250c596..d6cd1dda0 100755 --- a/scripts/reports/ArtifactCommunityCsv.sh +++ b/scripts/reports/ArtifactCommunityCsv.sh @@ -2,7 +2,7 @@ # Detects communities using the Graph Data Science Library of Neo4j and creates CSV reports. # It requires an already running Neo4j graph database with already scanned analyzed artifacts. -# The reports (csv files) will be written into the sub directory reports/community. +# The reports (csv files) will be written into the sub directory reports/artifact-community-csv. # Note that "scripts/prepareAnalysis.sh" is required to run prior to this script. # Requires executeQueryFunctions.sh diff --git a/scripts/reports/CentralityCsv.sh b/scripts/reports/CentralityCsv.sh index 3885caeb7..17aa69e41 100755 --- a/scripts/reports/CentralityCsv.sh +++ b/scripts/reports/CentralityCsv.sh @@ -2,7 +2,7 @@ # Looks for centrality using the Graph Data Science Library of Neo4j and creates CSV reports. # It requires an already running Neo4j graph database with already scanned analyzed artifacts. -# The reports (csv files) will be written into the sub directory reports/community. +# The reports (csv files) will be written into the sub directory reports/centrality-csv. # Note that "scripts/prepareAnalysis.sh" is required to run prior to this script. # Requires executeQueryFunctions.sh diff --git a/scripts/reports/CommunityCsv.sh b/scripts/reports/CommunityCsv.sh index c4130a429..be948bbca 100755 --- a/scripts/reports/CommunityCsv.sh +++ b/scripts/reports/CommunityCsv.sh @@ -2,7 +2,7 @@ # Detects communities using the Graph Data Science Library of Neo4j and creates CSV reports. # It requires an already running Neo4j graph database with already scanned analyzed artifacts. -# The reports (csv files) will be written into the sub directory reports/community. +# The reports (csv files) will be written into the sub directory reports/community-csv. 
# Note that "scripts/prepareAnalysis.sh" is required to run prior to this script. # Requires executeQueryFunctions.sh diff --git a/scripts/reports/ExternalDependenciesCsv.sh b/scripts/reports/ExternalDependenciesCsv.sh index c75e53afc..d567c79eb 100755 --- a/scripts/reports/ExternalDependenciesCsv.sh +++ b/scripts/reports/ExternalDependenciesCsv.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -# Executes "Package_Usage" Cypher queries to get the "external-dependencies-csv" CSV reports. +# Executes "External_Dependencies" Cypher queries to get the "external-dependencies-csv" CSV reports. # They list external library package usage like how often a external package is called. # Requires executeQueryFunctions.sh diff --git a/scripts/reports/InternalDependenciesCsv.sh b/scripts/reports/InternalDependenciesCsv.sh index 696aef399..5490179b8 100755 --- a/scripts/reports/InternalDependenciesCsv.sh +++ b/scripts/reports/InternalDependenciesCsv.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -# Executes "Package_Usage" Cypher queries to get the "internal-dependencies" CSV reports. +# Executes "Package_Usage" Cypher queries to get the "internal-dependencies-csv" CSV reports. # It contains lists of e.g. incoming and outgoing package dependencies, # abstractness, instability and the distance to the so called "main sequence". diff --git a/scripts/reports/ObjectOrientedDesignMetricsCsv.sh b/scripts/reports/ObjectOrientedDesignMetricsCsv.sh index 074431739..e6b1f7dad 100755 --- a/scripts/reports/ObjectOrientedDesignMetricsCsv.sh +++ b/scripts/reports/ObjectOrientedDesignMetricsCsv.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -# Executes "Metrics" Cypher queries to get the "object-oriented-design-metrics" CSV reports. +# Executes "Metrics" Cypher queries to get the "object-oriented-design-metrics-csv" CSV reports. # It contains lists of e.g. incoming and outgoing package dependencies, # abstractness, instability and the distance to the so called "main sequence". 
diff --git a/scripts/reports/SimilarityCsv.sh b/scripts/reports/SimilarityCsv.sh index e62b72cfb..fdaa44431 100755 --- a/scripts/reports/SimilarityCsv.sh +++ b/scripts/reports/SimilarityCsv.sh @@ -2,7 +2,7 @@ # Looks for similarity using the Graph Data Science Library of Neo4j and creates CSV reports. # It requires an already running Neo4j graph database with already scanned analyzed artifacts. -# The reports (csv files) will be written into the sub directory reports/community. +# The reports (csv files) will be written into the sub directory reports/similarity-csv. # Note that "scripts/prepareAnalysis.sh" is required to run prior to this script. # Requires executeQueryFunctions.sh diff --git a/scripts/reports/VisibilityMetricsCsv.sh b/scripts/reports/VisibilityMetricsCsv.sh index ee424211a..26173cfdd 100755 --- a/scripts/reports/VisibilityMetricsCsv.sh +++ b/scripts/reports/VisibilityMetricsCsv.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -# Executes "Visibility" Cypher queries to get the "visibility-metrics" CSV reports. +# Executes "Visibility" Cypher queries to get the "visibility-metrics-csv" CSV reports. # It contains lists of packages with their relative visibility (public types divided by all types) # as well as the global statistics for every artifact. 
From 893cfedf218807a0691a0ac0bfbf6a44fc581a4c Mon Sep 17 00:00:00 2001 From: JohT Date: Sat, 19 Aug 2023 08:53:07 +0200 Subject: [PATCH 05/15] Rename Package_Usage to Internal_Dependencies --- ...ependent_packages_across_different_artifacts.cypher | 0 ...all_existing_are_used_by_dependent_artifacts.cypher | 0 ...pes_that_are_used_by_many_different_packages.cypher | 0 jupyter/InternalDependencies.ipynb | 6 +++--- scripts/reports/InternalDependenciesCsv.sh | 10 +++++----- 5 files changed, 8 insertions(+), 8 deletions(-) rename cypher/{Package_Usage => Internal_Dependencies}/How_many_classes_compared_to_all_existing_in_the_same_package_are_used_by_dependent_packages_across_different_artifacts.cypher (100%) rename cypher/{Package_Usage => Internal_Dependencies}/How_many_packages_compared_to_all_existing_are_used_by_dependent_artifacts.cypher (100%) rename cypher/{Package_Usage => Internal_Dependencies}/List_types_that_are_used_by_many_different_packages.cypher (100%) diff --git a/cypher/Package_Usage/How_many_classes_compared_to_all_existing_in_the_same_package_are_used_by_dependent_packages_across_different_artifacts.cypher b/cypher/Internal_Dependencies/How_many_classes_compared_to_all_existing_in_the_same_package_are_used_by_dependent_packages_across_different_artifacts.cypher similarity index 100% rename from cypher/Package_Usage/How_many_classes_compared_to_all_existing_in_the_same_package_are_used_by_dependent_packages_across_different_artifacts.cypher rename to cypher/Internal_Dependencies/How_many_classes_compared_to_all_existing_in_the_same_package_are_used_by_dependent_packages_across_different_artifacts.cypher diff --git a/cypher/Package_Usage/How_many_packages_compared_to_all_existing_are_used_by_dependent_artifacts.cypher b/cypher/Internal_Dependencies/How_many_packages_compared_to_all_existing_are_used_by_dependent_artifacts.cypher similarity index 100% rename from 
cypher/Package_Usage/How_many_packages_compared_to_all_existing_are_used_by_dependent_artifacts.cypher rename to cypher/Internal_Dependencies/How_many_packages_compared_to_all_existing_are_used_by_dependent_artifacts.cypher diff --git a/cypher/Package_Usage/List_types_that_are_used_by_many_different_packages.cypher b/cypher/Internal_Dependencies/List_types_that_are_used_by_many_different_packages.cypher similarity index 100% rename from cypher/Package_Usage/List_types_that_are_used_by_many_different_packages.cypher rename to cypher/Internal_Dependencies/List_types_that_are_used_by_many_different_packages.cypher diff --git a/jupyter/InternalDependencies.ipynb b/jupyter/InternalDependencies.ipynb index 9e42fd3fa..157079f29 100644 --- a/jupyter/InternalDependencies.ipynb +++ b/jupyter/InternalDependencies.ipynb @@ -239,7 +239,7 @@ }, "outputs": [], "source": [ - "query_cypher_to_data_frame(\"../cypher/Package_Usage/List_types_that_are_used_by_many_different_packages.cypher\").head(20)" + "query_cypher_to_data_frame(\"../cypher/Internal_Dependencies/List_types_that_are_used_by_many_different_packages.cypher\").head(20)" ] }, { @@ -263,7 +263,7 @@ }, "outputs": [], "source": [ - "query_cypher_to_data_frame(\"../cypher/Package_Usage/How_many_packages_compared_to_all_existing_are_used_by_dependent_artifacts.cypher\").head(20)" + "query_cypher_to_data_frame(\"../cypher/Internal_Dependencies/How_many_packages_compared_to_all_existing_are_used_by_dependent_artifacts.cypher\").head(20)" ] }, { @@ -287,7 +287,7 @@ }, "outputs": [], "source": [ - "query_cypher_to_data_frame(\"../cypher/Package_Usage/How_many_classes_compared_to_all_existing_in_the_same_package_are_used_by_dependent_packages_across_different_artifacts.cypher\").head(20)" + "query_cypher_to_data_frame(\"../cypher/Internal_Dependencies/How_many_classes_compared_to_all_existing_in_the_same_package_are_used_by_dependent_packages_across_different_artifacts.cypher\").head(20)" ] } ], diff --git 
a/scripts/reports/InternalDependenciesCsv.sh b/scripts/reports/InternalDependenciesCsv.sh index 5490179b8..e6ebe41b7 100755 --- a/scripts/reports/InternalDependenciesCsv.sh +++ b/scripts/reports/InternalDependenciesCsv.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -# Executes "Package_Usage" Cypher queries to get the "internal-dependencies-csv" CSV reports. +# Executes "Internal_Dependencies" Cypher queries to get the "internal-dependencies-csv" CSV reports. # It contains lists of e.g. incoming and outgoing package dependencies, # abstractness, instability and the distance to the so called "main sequence". @@ -34,12 +34,12 @@ mkdir -p "${FULL_REPORT_DIRECTORY}" # Local Constants CYCLIC_DEPENDENCIES_CYPHER_DIR="${CYPHER_DIR}/Cyclic_Dependencies" -PACKAGE_USAGE_CYPHER_DIR="${CYPHER_DIR}/Package_Usage" +INTERNAL_DEPENDENCIES_CYPHER_DIR="${CYPHER_DIR}/Internal_Dependencies" execute_cypher "${CYCLIC_DEPENDENCIES_CYPHER_DIR}/Cyclic_Dependencies_as_List.cypher" > "${FULL_REPORT_DIRECTORY}/CyclicDependencies.csv" execute_cypher "${CYCLIC_DEPENDENCIES_CYPHER_DIR}/Cyclic_Dependencies_as_unwinded_List.cypher" > "${FULL_REPORT_DIRECTORY}/CyclicDependenciesUnwinded.csv" execute_cypher "${CYCLIC_DEPENDENCIES_CYPHER_DIR}/Cyclic_Dependencies_between_Artrifacts_as_unwinded_List.cypher" > "${FULL_REPORT_DIRECTORY}/CyclicArtifactDependenciesUnwinded.csv" execute_cypher "${CYPHER_DIR}/Candidates_for_Interface_Segregation.cypher" > "${FULL_REPORT_DIRECTORY}/InterfaceSegregationCandidates.csv" -execute_cypher "${PACKAGE_USAGE_CYPHER_DIR}/List_types_that_are_used_by_many_different_packages.cypher" > "${FULL_REPORT_DIRECTORY}/WidelyUsedTypes.csv" -execute_cypher "${PACKAGE_USAGE_CYPHER_DIR}/How_many_packages_compared_to_all_existing_are_used_by_dependent_artifacts.cypher" > "${FULL_REPORT_DIRECTORY}/ArtifactPackageUsage.csv" -execute_cypher "${PACKAGE_USAGE_CYPHER_DIR}/How_many_classes_compared_to_all_existing_in_the_same_package_are_used_by_dependent_packages_across_different_artifacts.cypher" 
> "${FULL_REPORT_DIRECTORY}/ClassesPerPackageUsageAcrossArtifacts.csv" \ No newline at end of file +execute_cypher "${INTERNAL_DEPENDENCIES_CYPHER_DIR}/List_types_that_are_used_by_many_different_packages.cypher" > "${FULL_REPORT_DIRECTORY}/WidelyUsedTypes.csv" +execute_cypher "${INTERNAL_DEPENDENCIES_CYPHER_DIR}/How_many_packages_compared_to_all_existing_are_used_by_dependent_artifacts.cypher" > "${FULL_REPORT_DIRECTORY}/ArtifactPackageUsage.csv" +execute_cypher "${INTERNAL_DEPENDENCIES_CYPHER_DIR}/How_many_classes_compared_to_all_existing_in_the_same_package_are_used_by_dependent_packages_across_different_artifacts.cypher" > "${FULL_REPORT_DIRECTORY}/ClassesPerPackageUsageAcrossArtifacts.csv" \ No newline at end of file From 5a074637e003bb1c801a73a3dd1263f71a195743 Mon Sep 17 00:00:00 2001 From: JohT Date: Sat, 19 Aug 2023 10:44:33 +0200 Subject: [PATCH 06/15] Add internal dependencies across artifacts reports --- ...ith_dependencies_to_other_artifacts.cypher | 47 ++++++++++++++++ ...rnal_dependencies_acreoss_artifacts.cypher | 24 ++++++++ ..._of_packages_and_types_on_artifacts.cypher | 13 +++++ ...d_of_internal_artifact_dependencies.cypher | 54 ++++++++++++++++++ ...ead_of_internal_artifact_dependents.cypher | 55 +++++++++++++++++++ scripts/reports/ArtifactDependenciesCsv.sh | 44 +++++++++++++++ scripts/reports/InternalDependenciesCsv.sh | 2 + 7 files changed, 239 insertions(+) create mode 100644 cypher/Artifact_Dependencies/Artifacts_with_dependencies_to_other_artifacts.cypher create mode 100644 cypher/Artifact_Dependencies/Most_used_internal_dependencies_acreoss_artifacts.cypher create mode 100644 cypher/Artifact_Dependencies/Set_number_of_packages_and_types_on_artifacts.cypher create mode 100644 cypher/Artifact_Dependencies/Usage_and_spread_of_internal_artifact_dependencies.cypher create mode 100644 cypher/Artifact_Dependencies/Usage_and_spread_of_internal_artifact_dependents.cypher create mode 100755 scripts/reports/ArtifactDependenciesCsv.sh diff --git 
a/cypher/Artifact_Dependencies/Artifacts_with_dependencies_to_other_artifacts.cypher b/cypher/Artifact_Dependencies/Artifacts_with_dependencies_to_other_artifacts.cypher new file mode 100644 index 000000000..0afbd5139 --- /dev/null +++ b/cypher/Artifact_Dependencies/Artifacts_with_dependencies_to_other_artifacts.cypher @@ -0,0 +1,47 @@ +// Artifacts with dependencies to other artifacts + +MATCH (artifact:Artifact)-[:CONTAINS]->(packageInArtifact:Package) +MATCH (packageInArtifact)-[:CONTAINS]->(typeInPackage:Type) +MATCH (typeInPackage)-[:DEPENDS_ON]->(dependencyType:Type) +MATCH (dependencyPackage:Package)-[:CONTAINS]->(dependencyType) +MATCH (dependencyArtifact:Artifact)-[:CONTAINS]->(dependencyPackage) +WHERE artifact.fileName <> dependencyArtifact.fileName + WITH replace(last(split(artifact.fileName, '/')), '.jar', '') AS artifactName + ,artifact.numberOfPackages AS packagesInArtifactCount + ,artifact.numberOfTypes AS typesInArtifactCount + ,collect(DISTINCT packageInArtifact.fqn) AS packages + ,count(DISTINCT packageInArtifact.fqn) AS packagesCount + ,round(100.0 / artifact.numberOfPackages + * count(DISTINCT packageInArtifact.fqn) + , 2) AS packageSpread + ,collect(DISTINCT typeInPackage.name) AS types + ,count(DISTINCT typeInPackage.fqn) AS typesCount + ,round(100.0 / artifact.numberOfTypes + * count(DISTINCT typeInPackage.fqn) + , 2) AS typesSpread + ,replace(last(split(dependencyArtifact.fileName, '/')), '.jar', '') AS dependencyArtifactName +// additionally group by if the dependency is an interface or not + ,dependencyType:Interface AS dependencyTypeIsInterface + ,collect(DISTINCT dependencyPackage.fqn) AS dependencyPackages + ,count(DISTINCT dependencyPackage.fqn) AS dependencyPackagesCount + ,collect(DISTINCT dependencyType.name) AS dependencyTypes + ,count(DISTINCT dependencyType.fqn) AS dependencyTypesCount +// Filter out empty dependency sets +WHERE dependencyPackagesCount > 0 + AND packagesCount > 1 +RETURN artifactName + ,packagesInArtifactCount + 
,packagesCount + ,packageSpread + ,typesInArtifactCount + ,typesCount + ,typesSpread + ,dependencyArtifactName + ,dependencyTypeIsInterface + ,dependencyPackagesCount + ,dependencyTypesCount + ,dependencyPackages[0..2] AS someDependencyPackages + ,dependencyTypes[0..4] AS someDependencyTypes + ,packages[0..2] AS someCallingPackages + ,types[0..4] AS someCallingTypes +ORDER BY packagesCount DESC \ No newline at end of file diff --git a/cypher/Artifact_Dependencies/Most_used_internal_dependencies_acreoss_artifacts.cypher b/cypher/Artifact_Dependencies/Most_used_internal_dependencies_acreoss_artifacts.cypher new file mode 100644 index 000000000..b056ca378 --- /dev/null +++ b/cypher/Artifact_Dependencies/Most_used_internal_dependencies_acreoss_artifacts.cypher @@ -0,0 +1,24 @@ +// Most used internal dependencies across artifacts + +MATCH (type:Type)-[:DEPENDS_ON]->(dependencyType:Type) +MATCH (artifact:Artifact)-[:CONTAINS]->(package:Package)-[:CONTAINS]->(type:Type) +MATCH (dependencyArtifact:Artifact)-[:CONTAINS]->(dependencyPackage:Package)-[:CONTAINS]->(dependencyType) +WHERE artifact.fileName <> dependencyArtifact.fileName + WITH replace(last(split(dependencyArtifact.fileName, '/')), '.jar', '') AS dependencyArtifactName + ,COLLECT(DISTINCT dependencyPackage.fqn) AS dependencyPackageNames + ,COLLECT(DISTINCT dependencyType.name) AS dependencyTypeNames + ,COLLECT(DISTINCT replace(last(split(artifact.fileName, '/')), '.jar', '')) AS artifactNames + ,COUNT(DISTINCT package.fqn) AS numberOfPackages + ,COUNT(DISTINCT type.fqn) AS numberOfTypes + ,COUNT(DISTINCT dependencyType) AS numberOfDependencyTypes + ,REDUCE(interfaces=0, depType IN COLLECT(DISTINCT dependencyType) | + CASE WHEN depType:Interface THEN interfaces + 1 ELSE interfaces END ) AS numberOfDependencyInterfaces + ORDER BY numberOfPackages DESC +RETURN dependencyArtifactName AS dependency + ,numberOfPackages AS usedByPackages + ,numberOfTypes AS usedByTypes + ,SIZE(dependencyPackageNames) AS 
providesPackages + ,SIZE(dependencyTypeNames) AS providesTypes + ,ROUND(100.0 / numberOfDependencyTypes * numberOfDependencyInterfaces, 2) AS interfaceRate + ,dependencyPackageNames[0..5] AS someProvidedPackages + ,dependencyTypeNames[0..5] AS someProvidedTypes \ No newline at end of file diff --git a/cypher/Artifact_Dependencies/Set_number_of_packages_and_types_on_artifacts.cypher b/cypher/Artifact_Dependencies/Set_number_of_packages_and_types_on_artifacts.cypher new file mode 100644 index 000000000..491a27894 --- /dev/null +++ b/cypher/Artifact_Dependencies/Set_number_of_packages_and_types_on_artifacts.cypher @@ -0,0 +1,13 @@ +// Set number of packages and types on artifacts + + MATCH (artifact:Artifact)-[:CONTAINS]->(package:Package) + MATCH (package)-[:CONTAINS]->(type:Type) + WITH artifact + ,COUNT(DISTINCT package.fqn) AS numberOfPackages + ,COUNT(DISTINCT type.fqn) AS numberOfTypes + SET artifact.numberOfPackages = numberOfPackages + ,artifact.numberOfTypes = numberOfTypes +RETURN artifact.fileName + ,numberOfPackages + ,numberOfTypes + ORDER BY artifact.fileName \ No newline at end of file diff --git a/cypher/Artifact_Dependencies/Usage_and_spread_of_internal_artifact_dependencies.cypher b/cypher/Artifact_Dependencies/Usage_and_spread_of_internal_artifact_dependencies.cypher new file mode 100644 index 000000000..07b67f5e7 --- /dev/null +++ b/cypher/Artifact_Dependencies/Usage_and_spread_of_internal_artifact_dependencies.cypher @@ -0,0 +1,54 @@ +// Usage and spread of internal artifact dependencies + +MATCH (artifact:Artifact)-[:CONTAINS]->(packageInArtifact:Package) +MATCH (packageInArtifact)-[:CONTAINS]->(typeInPackage:Type) +MATCH (typeInPackage)-[:DEPENDS_ON]->(dependencyType:Type) +MATCH (dependencyPackage:Package)-[:CONTAINS]->(dependencyType) +MATCH (dependencyArtifact:Artifact)-[:CONTAINS]->(dependencyPackage) +WHERE artifact.fileName <> dependencyArtifact.fileName + WITH replace(last(split(artifact.fileName, '/')), '.jar', '') AS artifactName + 
,artifact.numberOfPackages AS packagesInArtifactCount + ,artifact.numberOfTypes AS typesInArtifactCount + ,collect(DISTINCT packageInArtifact.fqn) AS packages + ,count(DISTINCT packageInArtifact.fqn) AS packagesCount + ,(100.0 + / artifact.numberOfPackages + * count(DISTINCT packageInArtifact.fqn)) AS packageSpread + ,collect(DISTINCT typeInPackage.name) AS types + ,count(DISTINCT typeInPackage.fqn) AS typesCount + ,(100.0 + / artifact.numberOfTypes + * count(DISTINCT typeInPackage.fqn)) AS typesSpread + ,replace(last(split(dependencyArtifact.fileName, '/')), '.jar', '') AS dependencyArtifactName +// additionally group by if the dependency is an interface or not + ,dependencyType:Interface AS dependencyTypeIsInterface + ,collect(DISTINCT dependencyPackage.fqn) AS dependencyPackages + ,count(DISTINCT dependencyPackage.fqn) AS dependencyPackagesCount + ,collect(DISTINCT dependencyType.name) AS dependencyTypes + ,count(DISTINCT dependencyType.fqn) AS dependencyTypesCount +// Filter out empty dependency sets +WHERE dependencyPackagesCount > 0 + AND packagesCount > 1 +RETURN dependencyArtifactName + ,dependencyTypeIsInterface + ,COUNT(DISTINCT artifactName) AS usedInArtifacts + ,SUM(packagesCount) AS usedInPackages + + ,MIN(packageSpread) AS minPackageSpread + ,MAX(packageSpread) AS maxPackageSpread + ,AVG(packageSpread) AS avgPackageSpread + ,stDev(packageSpread) AS stdPackageSpread + ,percentileDisc(packageSpread, 0.5) AS per5PackageSpread + + ,MIN(packagesCount) AS minPackageCount + ,MAX(packagesCount) AS maxPackageCount + ,AVG(packagesCount) AS avgPackageCount + ,stDev(packagesCount) AS stdPackageCount + ,percentileDisc(packagesCount, 0.5) AS per5PackageCount + + ,MIN(typesSpread) AS minTypeSpread + ,MAX(typesSpread) AS maxTypeSpread + ,AVG(typesSpread) AS avgTypeSpread + ,stDev(typesSpread) AS stdTypeSpread + ,percentileDisc(typesSpread, 0.5) AS per5TypeSpread +ORDER BY toLower(dependencyArtifactName) ASC \ No newline at end of file diff --git 
a/cypher/Artifact_Dependencies/Usage_and_spread_of_internal_artifact_dependents.cypher b/cypher/Artifact_Dependencies/Usage_and_spread_of_internal_artifact_dependents.cypher new file mode 100644 index 000000000..769d3aaa3 --- /dev/null +++ b/cypher/Artifact_Dependencies/Usage_and_spread_of_internal_artifact_dependents.cypher @@ -0,0 +1,55 @@ +// Usage and spread of internal artifact dependents + +MATCH (artifact:Artifact)-[:CONTAINS]->(packageInArtifact:Package) +MATCH (packageInArtifact)-[:CONTAINS]->(typeInPackage:Type) +MATCH (typeInPackage)-[:DEPENDS_ON]->(dependencyType:Type) +MATCH (dependencyPackage:Package)-[:CONTAINS]->(dependencyType) +MATCH (dependencyArtifact:Artifact)-[:CONTAINS]->(dependencyPackage) +WHERE artifact.fileName <> dependencyArtifact.fileName + WITH replace(last(split(artifact.fileName, '/')), '.jar', '') AS artifactName + ,artifact.numberOfPackages AS packagesInArtifactCount + ,artifact.numberOfTypes AS typesInArtifactCount + ,collect(DISTINCT packageInArtifact.fqn) AS packages + ,count(DISTINCT packageInArtifact.fqn) AS packagesCount + ,(100.0 + / artifact.numberOfPackages + * count(DISTINCT packageInArtifact.fqn)) AS packageSpread + ,collect(DISTINCT typeInPackage.name) AS types + ,count(DISTINCT typeInPackage.fqn) AS typesCount + ,(100.0 + / artifact.numberOfTypes + * count(DISTINCT typeInPackage.fqn)) AS typesSpread + ,replace(last(split(dependencyArtifact.fileName, '/')), '.jar', '') AS dependencyArtifactName +// additionally group by if the dependency is an interface or not + ,dependencyType:Interface AS dependencyTypeIsInterface + ,collect(DISTINCT dependencyPackage.fqn) AS dependencyPackages + ,count(DISTINCT dependencyPackage.fqn) AS dependencyPackagesCount + ,collect(DISTINCT dependencyType.name) AS dependencyTypes + ,count(DISTINCT dependencyType.fqn) AS dependencyTypesCount +// Filter out empty dependency sets +WHERE dependencyPackagesCount > 0 + AND packagesCount > 1 +RETURN artifactName + ,dependencyTypeIsInterface + 
,COUNT(DISTINCT dependencyArtifactName) AS artifactDependencies + ,SUM(dependencyPackagesCount) AS artifactDependencyPackages + ,100.0 / SUM(packagesInArtifactCount) * SUM(packagesCount) AS dependentPackagesRate + + ,MIN(packageSpread) AS minPackageSpread + ,MAX(packageSpread) AS maxPackageSpread + ,AVG(packageSpread) AS avgPackageSpread + ,stDev(packageSpread) AS stdPackageSpread + ,percentileDisc(packageSpread, 0.5) AS per5PackageSpread + + ,MIN(packagesCount) AS minPackageCount + ,MAX(packagesCount) AS maxPackageCount + ,AVG(packagesCount) AS avgPackageCount + ,stDev(packagesCount) AS stdPackageCount + ,percentileDisc(packagesCount, 0.5) AS per5PackageCount + + ,MIN(typesSpread) AS minTypeSpread + ,MAX(typesSpread) AS maxTypeSpread + ,AVG(typesSpread) AS avgTypeSpread + ,stDev(typesSpread) AS stdTypeSpread + ,percentileDisc(typesSpread, 0.5) AS per5TypeSpread +ORDER BY toLower(artifactName) ASC \ No newline at end of file diff --git a/scripts/reports/ArtifactDependenciesCsv.sh b/scripts/reports/ArtifactDependenciesCsv.sh new file mode 100755 index 000000000..65ee94835 --- /dev/null +++ b/scripts/reports/ArtifactDependenciesCsv.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash + +# Executes "Artifact_Dependencies" Cypher queries to get the "artifact-dependencies-csv" CSV reports. +# It contains lists of dependencies across artifacts and by how many packages/types they are used. + +# Requires executeQueryFunctions.sh + +# Overrideable Constants (defaults also defined in sub scripts) +REPORTS_DIRECTORY=${REPORTS_DIRECTORY:-"reports"} + +## Get this "scripts/reports" directory if not already set +# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution. +# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes. +# This way non-standard tools like readlink aren't needed. +REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR:-$( CDPATH=. 
cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} +echo "ArtifactDependenciesCsv: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}" + +# Get the "scripts" directory by taking the path of this script and going one directory up. +SCRIPTS_DIR=${SCRIPTS_DIR:-"${REPORTS_SCRIPT_DIR}/.."} # Repository directory containing the shell scripts +echo "ArtifactDependenciesCsv SCRIPTS_DIR=${SCRIPTS_DIR}" + +# Get the "cypher" directory by taking the path of this script and going two directory up and then to "cypher". +CYPHER_DIR=${CYPHER_DIR:-"${REPORTS_SCRIPT_DIR}/../../cypher"} +echo "ArtifactDependenciesCsv CYPHER_DIR=${CYPHER_DIR}" + +# Define functions to execute cypher queries from within a given file +source "${SCRIPTS_DIR}/executeQueryFunctions.sh" + +# Create report directory +REPORT_NAME="artifact-dependencies-csv" +FULL_REPORT_DIRECTORY="${REPORTS_DIRECTORY}/${REPORT_NAME}" +mkdir -p "${FULL_REPORT_DIRECTORY}" + +# Local Constants +ARTIFACT_DEPENDENCIES_CYPHER_DIR="${CYPHER_DIR}/Artifact_Dependencies" + +# Preparation: Set number of packages and types per artifact +execute_cypher_expect_results "${ARTIFACT_DEPENDENCIES_CYPHER_DIR}/Set_number_of_packages_and_types_on_artifacts.cypher" + +execute_cypher "${ARTIFACT_DEPENDENCIES_CYPHER_DIR}/Most_used_internal_dependencies_acreoss_artifacts.cypher" > "${FULL_REPORT_DIRECTORY}/MostUsedDependenciesAcrossArtifacts.csv" +execute_cypher "${ARTIFACT_DEPENDENCIES_CYPHER_DIR}/Artifacts_with_dependencies_to_other_artifacts.cypher" > "${FULL_REPORT_DIRECTORY}/DependenciesAcrossArtifacts.csv" + +execute_cypher "${ARTIFACT_DEPENDENCIES_CYPHER_DIR}/Usage_and_spread_of_internal_artifact_dependencies.cypher" > "${FULL_REPORT_DIRECTORY}/InternalArtifactUsageSpreadPerDependency.csv" +execute_cypher "${ARTIFACT_DEPENDENCIES_CYPHER_DIR}/Usage_and_spread_of_internal_artifact_dependents.cypher" > "${FULL_REPORT_DIRECTORY}/InternalArtifactUsageSpreadPerDependent.csv" \ No newline at end of file diff --git 
a/scripts/reports/InternalDependenciesCsv.sh b/scripts/reports/InternalDependenciesCsv.sh index e6ebe41b7..b933d6e3a 100755 --- a/scripts/reports/InternalDependenciesCsv.sh +++ b/scripts/reports/InternalDependenciesCsv.sh @@ -39,7 +39,9 @@ INTERNAL_DEPENDENCIES_CYPHER_DIR="${CYPHER_DIR}/Internal_Dependencies" execute_cypher "${CYCLIC_DEPENDENCIES_CYPHER_DIR}/Cyclic_Dependencies_as_List.cypher" > "${FULL_REPORT_DIRECTORY}/CyclicDependencies.csv" execute_cypher "${CYCLIC_DEPENDENCIES_CYPHER_DIR}/Cyclic_Dependencies_as_unwinded_List.cypher" > "${FULL_REPORT_DIRECTORY}/CyclicDependenciesUnwinded.csv" execute_cypher "${CYCLIC_DEPENDENCIES_CYPHER_DIR}/Cyclic_Dependencies_between_Artrifacts_as_unwinded_List.cypher" > "${FULL_REPORT_DIRECTORY}/CyclicArtifactDependenciesUnwinded.csv" + execute_cypher "${CYPHER_DIR}/Candidates_for_Interface_Segregation.cypher" > "${FULL_REPORT_DIRECTORY}/InterfaceSegregationCandidates.csv" + execute_cypher "${INTERNAL_DEPENDENCIES_CYPHER_DIR}/List_types_that_are_used_by_many_different_packages.cypher" > "${FULL_REPORT_DIRECTORY}/WidelyUsedTypes.csv" execute_cypher "${INTERNAL_DEPENDENCIES_CYPHER_DIR}/How_many_packages_compared_to_all_existing_are_used_by_dependent_artifacts.cypher" > "${FULL_REPORT_DIRECTORY}/ArtifactPackageUsage.csv" execute_cypher "${INTERNAL_DEPENDENCIES_CYPHER_DIR}/How_many_classes_compared_to_all_existing_in_the_same_package_are_used_by_dependent_packages_across_different_artifacts.cypher" > "${FULL_REPORT_DIRECTORY}/ClassesPerPackageUsageAcrossArtifacts.csv" \ No newline at end of file From 080394134c3f14ba7a8663bc970d08b8b7f753bb Mon Sep 17 00:00:00 2001 From: JohT Date: Tue, 8 Aug 2023 21:05:35 +0200 Subject: [PATCH 07/15] Add pagination to split large graph visualizations --- .../artifactDependenciesGraph.html | 4 +- .../artifactDependenciesGraph.js | 73 +- graph-visualization/index.css | 32 +- graph-visualization/package-lock.json | 761 ++++++++++++++++++ graph-visualization/package.json | 1 + 
graph-visualization/renderVisualizations.js | 47 +- .../vis-configuration-presets.js | 45 ++ .../visualization-pagination.js | 72 ++ 8 files changed, 969 insertions(+), 66 deletions(-) create mode 100644 graph-visualization/vis-configuration-presets.js create mode 100644 graph-visualization/visualization-pagination.js diff --git a/graph-visualization/artifactDependenciesGraph/artifactDependenciesGraph.html b/graph-visualization/artifactDependenciesGraph/artifactDependenciesGraph.html index 434c1ef3a..edd0c2f7c 100644 --- a/graph-visualization/artifactDependenciesGraph/artifactDependenciesGraph.html +++ b/graph-visualization/artifactDependenciesGraph/artifactDependenciesGraph.html @@ -10,6 +10,8 @@ + + @@ -19,7 +21,7 @@ -
+
\ No newline at end of file diff --git a/graph-visualization/artifactDependenciesGraph/artifactDependenciesGraph.js b/graph-visualization/artifactDependenciesGraph/artifactDependenciesGraph.js index 3a8481af0..6160a47b9 100644 --- a/graph-visualization/artifactDependenciesGraph/artifactDependenciesGraph.js +++ b/graph-visualization/artifactDependenciesGraph/artifactDependenciesGraph.js @@ -1,59 +1,31 @@ -function draw() { - const config = { - containerId: "viz", - neo4j: { - serverUrl: "bolt://localhost:7687", - serverUser: "neo4j", - serverPassword: document.getElementById("neo4j-server-password").value || "neo4jinitial", - }, - visConfig: { - nodes: { - shape: "hexagon", - shadow: false, - font: { - strokeWidth: 40, - strokeColor: "#F2F2FF", - }, - size: 60, - }, - edges: { - arrows: { - to: { enabled: true }, - }, - scaling: { - max: 15, - }, - }, - physics: { - hierarchicalRepulsion: { - nodeDistance: 300, // 100 - centralGravity: 0.5, // 0.2 - springLength: 180, // 200 - springConstant: 0.06, // 0.05 - damping: 0.09, // 0.09 - avoidOverlap: 0.1, // 0 - }, - solver: "hierarchicalRepulsion", // barnesHut - }, - layout: { - hierarchical: { - enabled: true, - sortMethod: "directed", - }, - }, - }, +function getNeo4jCredentials() { + return { + serverUrl: "bolt://localhost:7687", + serverUser: "neo4j", + serverPassword: document.getElementById("neo4j-server-password").value, + }; +} + +function getConfiguration(containerId = "viz", credentials, visConfiguration) { + return { + containerId: containerId, + neo4j: credentials, + visConfig: visConfiguration, labels: { Artifact: { [NeoVis.NEOVIS_ADVANCED_CONFIG]: { function: { // Print all properties for the title (when nodes are clicked) title: NeoVis.objectToTitleHtml, - // Use "fileName" as label. Remove leading slash, trailing ".jar" and version number. + // Use "fileName" as label. Remove leading slash, trailing ".jar", version number and a trailing word like "Final". 
label: (node) => node.properties.fileName .replace("/", "") .replace(".jar", "") - .replace(/-[\d\\.]+/, ""), + .replace(/[\d\.\-\_v]+\w+$/gm, "") + + "(" + + node.properties.maxDistanceFromSource + + ")", }, }, }, @@ -64,9 +36,12 @@ function draw() { value: "weight", }, }, - initialCypher: "MATCH (s:Artifact)-[r:DEPENDS_ON]->(d:Artifact) RETURN s,r,d", + initialCypher: + "MATCH (a1:Artifact)-[r1:DEPENDS_ON*0..1]->(a2:Artifact) WHERE a1.topologicalSortIndex >= 0 AND a2.topologicalSortIndex >= 0 AND a1 <> a2 RETURN a1,r1,a2 ORDER BY a2.topologicalSortIndex, a1.topologicalSortIndex SKIP toInteger($startIndex) LIMIT toInteger($blockSize)", }; +} - const neoViz = new NeoVis.default(config); - neoViz.render(); +function draw() { + const config = getConfiguration("viz", getNeo4jCredentials(), hierarchicalHexagons()); + paginatedGraphVisualization({containerElementId: "visualizations", neoVizConfiguration: config}); } \ No newline at end of file diff --git a/graph-visualization/index.css b/graph-visualization/index.css index 3afc70a7a..ee82a4b50 100644 --- a/graph-visualization/index.css +++ b/graph-visualization/index.css @@ -1,5 +1,31 @@ div { - width: 75vw; + width: 100vw; height: 100vh; - /*border: 1px solid lightgray;*/ -} \ No newline at end of file +} + +.indexedVisualization { + margin-top: 10px; + margin-left: 10px; + width: 400vw; + height: 400vh; + display:inline-block; +} + +.indexedVisualization.visualization-finished { + border-width: 1px; + border-style: solid; + border-color: lightgreen; +} + +.indexedVisualization:not(.visualization-finished) { + border-width: 2px; + border-style: dotted; + border-color: orange; +} + +.indexedVisualization.visualization-finished.visualization-failed { + height: 6em; + border-width: 2px; + border-style: solid; + border-color: red; +} diff --git a/graph-visualization/package-lock.json b/graph-visualization/package-lock.json index d38ef9a6a..08cee762a 100644 --- a/graph-visualization/package-lock.json +++ 
b/graph-visualization/package-lock.json @@ -10,6 +10,7 @@ "license": "MIT", "dependencies": { "glob": "^10.3.3", + "jimp": "^0.22.10", "neovis.js": "^2.1.0", "puppeteer": "^21.0.1" } @@ -170,6 +171,394 @@ "url": "https://github.com/chalk/wrap-ansi?sponsor=1" } }, + "node_modules/@jimp/bmp": { + "version": "0.22.10", + "resolved": "https://registry.npmjs.org/@jimp/bmp/-/bmp-0.22.10.tgz", + "integrity": "sha512-1UXRl1Nw1KptZ1r0ANqtXOst9vGH51dq7keVKQzyyTO2lz4dOaezS9StuSTNh+RmiHg/SVPaFRpPfB0S/ln4Kg==", + "dependencies": { + "@jimp/utils": "^0.22.10", + "bmp-js": "^0.1.0" + }, + "peerDependencies": { + "@jimp/custom": ">=0.3.5" + } + }, + "node_modules/@jimp/core": { + "version": "0.22.10", + "resolved": "https://registry.npmjs.org/@jimp/core/-/core-0.22.10.tgz", + "integrity": "sha512-ZKyrehVy6wu1PnBXIUpn/fXmyMRQiVSbvHDubgXz4bfTOao3GiOurKHjByutQIgozuAN6ZHWiSge1dKA+dex3w==", + "dependencies": { + "@jimp/utils": "^0.22.10", + "any-base": "^1.1.0", + "buffer": "^5.2.0", + "exif-parser": "^0.1.12", + "file-type": "^16.5.4", + "isomorphic-fetch": "^3.0.0", + "pixelmatch": "^4.0.2", + "tinycolor2": "^1.6.0" + } + }, + "node_modules/@jimp/custom": { + "version": "0.22.10", + "resolved": "https://registry.npmjs.org/@jimp/custom/-/custom-0.22.10.tgz", + "integrity": "sha512-sPZkUYe1hu0iIgNisjizxPJqq2vaaKvkCkPoXq2U6UV3ZA1si/WVdrg25da3IcGIEV+83AoHgM8TvqlLgrCJsg==", + "dependencies": { + "@jimp/core": "^0.22.10" + } + }, + "node_modules/@jimp/gif": { + "version": "0.22.10", + "resolved": "https://registry.npmjs.org/@jimp/gif/-/gif-0.22.10.tgz", + "integrity": "sha512-yEX2dSpamvkSx1PPDWGnKeWDrBz0vrCKjVG/cn4Zr68MRRT75tbZIeOrBa+RiUpY3ho5ix7d36LkYvt3qfUIhQ==", + "dependencies": { + "@jimp/utils": "^0.22.10", + "gifwrap": "^0.10.1", + "omggif": "^1.0.9" + }, + "peerDependencies": { + "@jimp/custom": ">=0.3.5" + } + }, + "node_modules/@jimp/jpeg": { + "version": "0.22.10", + "resolved": "https://registry.npmjs.org/@jimp/jpeg/-/jpeg-0.22.10.tgz", + "integrity": 
"sha512-6bu98pAcVN4DY2oiDLC4TOgieX/lZrLd1tombWZOFCN5PBmqaHQxm7IUmT+Wj4faEvh8QSHgVLSA+2JQQRJWVA==", + "dependencies": { + "@jimp/utils": "^0.22.10", + "jpeg-js": "^0.4.4" + }, + "peerDependencies": { + "@jimp/custom": ">=0.3.5" + } + }, + "node_modules/@jimp/plugin-blit": { + "version": "0.22.10", + "resolved": "https://registry.npmjs.org/@jimp/plugin-blit/-/plugin-blit-0.22.10.tgz", + "integrity": "sha512-6EI8Sl+mxYHEIy6Yteh6eknD+EZguKpNdr3sCKxNezmLR0+vK99vHcllo6uGSjXXiwtwS67Xqxn8SsoatL+UJQ==", + "dependencies": { + "@jimp/utils": "^0.22.10" + }, + "peerDependencies": { + "@jimp/custom": ">=0.3.5" + } + }, + "node_modules/@jimp/plugin-blur": { + "version": "0.22.10", + "resolved": "https://registry.npmjs.org/@jimp/plugin-blur/-/plugin-blur-0.22.10.tgz", + "integrity": "sha512-4XRTWuPVdMXJeclJMisXPGizeHtTryVaVV5HnuQXpKqIZtzXReCCpNGH8q/i0kBQOQMXhGWS3mpqOEwtpPePKw==", + "dependencies": { + "@jimp/utils": "^0.22.10" + }, + "peerDependencies": { + "@jimp/custom": ">=0.3.5" + } + }, + "node_modules/@jimp/plugin-circle": { + "version": "0.22.10", + "resolved": "https://registry.npmjs.org/@jimp/plugin-circle/-/plugin-circle-0.22.10.tgz", + "integrity": "sha512-mhcwTO1ywRxiCgtLGge6tDDIDPlX6qkI3CY+BjgGG/XhVHccCddXgOGLdlf+5OuKIEF2Nqs0V01LQEQIJFTmEw==", + "dependencies": { + "@jimp/utils": "^0.22.10" + }, + "peerDependencies": { + "@jimp/custom": ">=0.3.5" + } + }, + "node_modules/@jimp/plugin-color": { + "version": "0.22.10", + "resolved": "https://registry.npmjs.org/@jimp/plugin-color/-/plugin-color-0.22.10.tgz", + "integrity": "sha512-e4t3L7Kedd96E0x1XjsTM6NcgulKUU66HdFTao7Tc9FYJRFSlttARZ/C6LEryGDm/i69R6bJEpo7BkNz0YL55Q==", + "dependencies": { + "@jimp/utils": "^0.22.10", + "tinycolor2": "^1.6.0" + }, + "peerDependencies": { + "@jimp/custom": ">=0.3.5" + } + }, + "node_modules/@jimp/plugin-contain": { + "version": "0.22.10", + "resolved": "https://registry.npmjs.org/@jimp/plugin-contain/-/plugin-contain-0.22.10.tgz", + "integrity": 
"sha512-eP8KrzctuEoqibQAxi9WhbnoRosydhiwg+IYya3dKuKDBTrD9UHt+ERlPQ/lTNWHzV/l4S1ntV3r9s9saJgsXA==", + "dependencies": { + "@jimp/utils": "^0.22.10" + }, + "peerDependencies": { + "@jimp/custom": ">=0.3.5", + "@jimp/plugin-blit": ">=0.3.5", + "@jimp/plugin-resize": ">=0.3.5", + "@jimp/plugin-scale": ">=0.3.5" + } + }, + "node_modules/@jimp/plugin-cover": { + "version": "0.22.10", + "resolved": "https://registry.npmjs.org/@jimp/plugin-cover/-/plugin-cover-0.22.10.tgz", + "integrity": "sha512-kJCwL5T1igfa0InCfkE7bBeqg26m46aoRt10ug+rvm11P6RrvRMGrgINFyIKB+mnB7CiyBN/MOula1CvLhSInQ==", + "dependencies": { + "@jimp/utils": "^0.22.10" + }, + "peerDependencies": { + "@jimp/custom": ">=0.3.5", + "@jimp/plugin-crop": ">=0.3.5", + "@jimp/plugin-resize": ">=0.3.5", + "@jimp/plugin-scale": ">=0.3.5" + } + }, + "node_modules/@jimp/plugin-crop": { + "version": "0.22.10", + "resolved": "https://registry.npmjs.org/@jimp/plugin-crop/-/plugin-crop-0.22.10.tgz", + "integrity": "sha512-BOZ+YGaZlhU7c5ye65RxikicXH0Ki0It6/XHISvipR5WZrfjLjL2Ke20G+AGnwBQc76gKenVcMXVUCnEjtZV+Q==", + "dependencies": { + "@jimp/utils": "^0.22.10" + }, + "peerDependencies": { + "@jimp/custom": ">=0.3.5" + } + }, + "node_modules/@jimp/plugin-displace": { + "version": "0.22.10", + "resolved": "https://registry.npmjs.org/@jimp/plugin-displace/-/plugin-displace-0.22.10.tgz", + "integrity": "sha512-llNiWWMTKISDXt5+cXI0GaFmZWAjlT+4fFLYf4eXquuL/9wZoQsEBhv2GdGd48mkiS8jZq1Nnb2Q4ehEPTvrzw==", + "dependencies": { + "@jimp/utils": "^0.22.10" + }, + "peerDependencies": { + "@jimp/custom": ">=0.3.5" + } + }, + "node_modules/@jimp/plugin-dither": { + "version": "0.22.10", + "resolved": "https://registry.npmjs.org/@jimp/plugin-dither/-/plugin-dither-0.22.10.tgz", + "integrity": "sha512-05WLmeV5M+P/0FS+bWf13hMew2X0oa8w9AtmevL2UyA/5GqiyvP2Xm5WfGQ8oFiiMvpnL6RFomJQOZtWca0C2w==", + "dependencies": { + "@jimp/utils": "^0.22.10" + }, + "peerDependencies": { + "@jimp/custom": ">=0.3.5" + } + }, + "node_modules/@jimp/plugin-fisheye": { + 
"version": "0.22.10", + "resolved": "https://registry.npmjs.org/@jimp/plugin-fisheye/-/plugin-fisheye-0.22.10.tgz", + "integrity": "sha512-InjiXvc7Gkzrx8VWtU97kDqV7ENnhHGPULymJWeZaF2aicud9Fpk4iCtd/DcZIrk7Cbe60A8RwNXN00HXIbSCg==", + "dependencies": { + "@jimp/utils": "^0.22.10" + }, + "peerDependencies": { + "@jimp/custom": ">=0.3.5" + } + }, + "node_modules/@jimp/plugin-flip": { + "version": "0.22.10", + "resolved": "https://registry.npmjs.org/@jimp/plugin-flip/-/plugin-flip-0.22.10.tgz", + "integrity": "sha512-42GkGtTHWnhnwTMPVK/kXObZbkYIpQWfuIfy5EMEMk6zRj05zpv4vsjkKWfuemweZINwfvD7wDJF7FVFNNcZZg==", + "dependencies": { + "@jimp/utils": "^0.22.10" + }, + "peerDependencies": { + "@jimp/custom": ">=0.3.5", + "@jimp/plugin-rotate": ">=0.3.5" + } + }, + "node_modules/@jimp/plugin-gaussian": { + "version": "0.22.10", + "resolved": "https://registry.npmjs.org/@jimp/plugin-gaussian/-/plugin-gaussian-0.22.10.tgz", + "integrity": "sha512-ykrG/6lTp9Q5YA8jS5XzwMHtRxb9HOFMgtmnrUZ8kU+BK8REecfy9Ic5BUEOjCYvS1a/xLsnrZQU07iiYxBxFg==", + "dependencies": { + "@jimp/utils": "^0.22.10" + }, + "peerDependencies": { + "@jimp/custom": ">=0.3.5" + } + }, + "node_modules/@jimp/plugin-invert": { + "version": "0.22.10", + "resolved": "https://registry.npmjs.org/@jimp/plugin-invert/-/plugin-invert-0.22.10.tgz", + "integrity": "sha512-d8j9BlUJYs/c994t4azUWSWmQq4LLPG4ecm8m6SSNqap+S/HlVQGqjYhJEBbY9EXkOTYB9vBL9bqwSM1Rr6paA==", + "dependencies": { + "@jimp/utils": "^0.22.10" + }, + "peerDependencies": { + "@jimp/custom": ">=0.3.5" + } + }, + "node_modules/@jimp/plugin-mask": { + "version": "0.22.10", + "resolved": "https://registry.npmjs.org/@jimp/plugin-mask/-/plugin-mask-0.22.10.tgz", + "integrity": "sha512-yRBs1230XZkz24uFTdTcSlZ0HXZpIWzM3iFQN56MzZ7USgdVZjPPDCQ8I9RpqfZ36nDflQkUO0wV7ucsi4ogow==", + "dependencies": { + "@jimp/utils": "^0.22.10" + }, + "peerDependencies": { + "@jimp/custom": ">=0.3.5" + } + }, + "node_modules/@jimp/plugin-normalize": { + "version": "0.22.10", + "resolved": 
"https://registry.npmjs.org/@jimp/plugin-normalize/-/plugin-normalize-0.22.10.tgz", + "integrity": "sha512-Wk9GX6eJMchX/ZAazVa70Fagu+OXMvHiPY+HrcEwcclL+p1wo8xAHEsf9iKno7Ja4EU9lLhbBRY5hYJyiKMEkg==", + "dependencies": { + "@jimp/utils": "^0.22.10" + }, + "peerDependencies": { + "@jimp/custom": ">=0.3.5" + } + }, + "node_modules/@jimp/plugin-print": { + "version": "0.22.10", + "resolved": "https://registry.npmjs.org/@jimp/plugin-print/-/plugin-print-0.22.10.tgz", + "integrity": "sha512-1U3VloIR+beE1kWPdGEJMiE2h1Do29iv3w8sBbvPyRP4qXxRFcDpmCGtctsrKmb1krlBFlj8ubyAY90xL+5n9w==", + "dependencies": { + "@jimp/utils": "^0.22.10", + "load-bmfont": "^1.4.1" + }, + "peerDependencies": { + "@jimp/custom": ">=0.3.5", + "@jimp/plugin-blit": ">=0.3.5" + } + }, + "node_modules/@jimp/plugin-resize": { + "version": "0.22.10", + "resolved": "https://registry.npmjs.org/@jimp/plugin-resize/-/plugin-resize-0.22.10.tgz", + "integrity": "sha512-ixomxVcnAONXDgaq0opvAx4UAOiEhOA/tipuhFFOvPKFd4yf1BAnEviB5maB0SBHHkJXPUSzDp/73xVTMGSe7g==", + "dependencies": { + "@jimp/utils": "^0.22.10" + }, + "peerDependencies": { + "@jimp/custom": ">=0.3.5" + } + }, + "node_modules/@jimp/plugin-rotate": { + "version": "0.22.10", + "resolved": "https://registry.npmjs.org/@jimp/plugin-rotate/-/plugin-rotate-0.22.10.tgz", + "integrity": "sha512-eeFX8dnRyf3LAdsdXWKWuN18hLRg8zy1cP0cP9rHzQVWRK7ck/QsLxK1vHq7MADGwQalNaNTJ9SQxH6c8mz6jw==", + "dependencies": { + "@jimp/utils": "^0.22.10" + }, + "peerDependencies": { + "@jimp/custom": ">=0.3.5", + "@jimp/plugin-blit": ">=0.3.5", + "@jimp/plugin-crop": ">=0.3.5", + "@jimp/plugin-resize": ">=0.3.5" + } + }, + "node_modules/@jimp/plugin-scale": { + "version": "0.22.10", + "resolved": "https://registry.npmjs.org/@jimp/plugin-scale/-/plugin-scale-0.22.10.tgz", + "integrity": "sha512-TG/H0oUN69C9ArBCZg4PmuoixFVKIiru8282KzSB/Tp1I0xwX0XLTv3dJ5pobPlIgPcB+TmD4xAIdkCT4rtWxg==", + "dependencies": { + "@jimp/utils": "^0.22.10" + }, + "peerDependencies": { + "@jimp/custom": ">=0.3.5", 
+ "@jimp/plugin-resize": ">=0.3.5" + } + }, + "node_modules/@jimp/plugin-shadow": { + "version": "0.22.10", + "resolved": "https://registry.npmjs.org/@jimp/plugin-shadow/-/plugin-shadow-0.22.10.tgz", + "integrity": "sha512-TN9xm6fI7XfxbMUQqFPZjv59Xdpf0tSiAQdINB4g6pJMWiVANR/74OtDONoy3KKpenu5Y38s+FkrtID/KcQAhw==", + "dependencies": { + "@jimp/utils": "^0.22.10" + }, + "peerDependencies": { + "@jimp/custom": ">=0.3.5", + "@jimp/plugin-blur": ">=0.3.5", + "@jimp/plugin-resize": ">=0.3.5" + } + }, + "node_modules/@jimp/plugin-threshold": { + "version": "0.22.10", + "resolved": "https://registry.npmjs.org/@jimp/plugin-threshold/-/plugin-threshold-0.22.10.tgz", + "integrity": "sha512-DA2lSnU0TgIRbAgmXaxroYw3Ad6J2DOFEoJp0NleSm2h3GWbZEE5yW9U2B6hD3iqn4AenG4E2b2WzHXZyzSutw==", + "dependencies": { + "@jimp/utils": "^0.22.10" + }, + "peerDependencies": { + "@jimp/custom": ">=0.3.5", + "@jimp/plugin-color": ">=0.8.0", + "@jimp/plugin-resize": ">=0.8.0" + } + }, + "node_modules/@jimp/plugins": { + "version": "0.22.10", + "resolved": "https://registry.npmjs.org/@jimp/plugins/-/plugins-0.22.10.tgz", + "integrity": "sha512-KDMZyM6pmvS8freB+UBLko1TO/k4D7URS/nphCozuH+P7i3UMe7NdckXKJ8u+WD6sqN0YFYvBehpkpnUiw/91w==", + "dependencies": { + "@jimp/plugin-blit": "^0.22.10", + "@jimp/plugin-blur": "^0.22.10", + "@jimp/plugin-circle": "^0.22.10", + "@jimp/plugin-color": "^0.22.10", + "@jimp/plugin-contain": "^0.22.10", + "@jimp/plugin-cover": "^0.22.10", + "@jimp/plugin-crop": "^0.22.10", + "@jimp/plugin-displace": "^0.22.10", + "@jimp/plugin-dither": "^0.22.10", + "@jimp/plugin-fisheye": "^0.22.10", + "@jimp/plugin-flip": "^0.22.10", + "@jimp/plugin-gaussian": "^0.22.10", + "@jimp/plugin-invert": "^0.22.10", + "@jimp/plugin-mask": "^0.22.10", + "@jimp/plugin-normalize": "^0.22.10", + "@jimp/plugin-print": "^0.22.10", + "@jimp/plugin-resize": "^0.22.10", + "@jimp/plugin-rotate": "^0.22.10", + "@jimp/plugin-scale": "^0.22.10", + "@jimp/plugin-shadow": "^0.22.10", + "@jimp/plugin-threshold": 
"^0.22.10", + "timm": "^1.6.1" + }, + "peerDependencies": { + "@jimp/custom": ">=0.3.5" + } + }, + "node_modules/@jimp/png": { + "version": "0.22.10", + "resolved": "https://registry.npmjs.org/@jimp/png/-/png-0.22.10.tgz", + "integrity": "sha512-RYinU7tZToeeR2g2qAMn42AU+8OUHjXPKZZ9RkmoL4bguA1xyZWaSdr22/FBkmnHhOERRlr02KPDN1OTOYHLDQ==", + "dependencies": { + "@jimp/utils": "^0.22.10", + "pngjs": "^6.0.0" + }, + "peerDependencies": { + "@jimp/custom": ">=0.3.5" + } + }, + "node_modules/@jimp/tiff": { + "version": "0.22.10", + "resolved": "https://registry.npmjs.org/@jimp/tiff/-/tiff-0.22.10.tgz", + "integrity": "sha512-OaivlSYzpNTHyH/h7pEtl3A7F7TbsgytZs52GLX/xITW92ffgDgT6PkldIrMrET6ERh/hdijNQiew7IoEEr2og==", + "dependencies": { + "utif2": "^4.0.1" + }, + "peerDependencies": { + "@jimp/custom": ">=0.3.5" + } + }, + "node_modules/@jimp/types": { + "version": "0.22.10", + "resolved": "https://registry.npmjs.org/@jimp/types/-/types-0.22.10.tgz", + "integrity": "sha512-u/r+XYzbCx4zZukDmxx8S0er3Yq3iDPI6+31WKX0N18i2qPPJYcn8qwIFurfupRumGvJ8SlGLCgt/T+Y8zzUIw==", + "dependencies": { + "@jimp/bmp": "^0.22.10", + "@jimp/gif": "^0.22.10", + "@jimp/jpeg": "^0.22.10", + "@jimp/png": "^0.22.10", + "@jimp/tiff": "^0.22.10", + "timm": "^1.6.1" + }, + "peerDependencies": { + "@jimp/custom": ">=0.3.5" + } + }, + "node_modules/@jimp/utils": { + "version": "0.22.10", + "resolved": "https://registry.npmjs.org/@jimp/utils/-/utils-0.22.10.tgz", + "integrity": "sha512-ztlOK9Mm2iLG2AMoabzM4i3WZ/FtshcgsJCbZCRUs/DKoeS2tySRJTnQZ1b7Roq0M4Ce+FUAxnCAcBV0q7PH9w==", + "dependencies": { + "regenerator-runtime": "^0.13.3" + } + }, "node_modules/@pkgjs/parseargs": { "version": "0.11.0", "resolved": "https://registry.npmjs.org/@pkgjs/parseargs/-/parseargs-0.11.0.tgz", @@ -199,6 +588,11 @@ "node": ">=16.3.0" } }, + "node_modules/@tokenizer/token": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/@tokenizer/token/-/token-0.3.0.tgz", + "integrity": 
"sha512-OvjF+z51L3ov0OyAU0duzsYuvO01PH7x4t6DJx+guahgTnBHkhJdG7soQeTSFLWN3efnHyibZ4Z8l2EuWwJN3A==" + }, "node_modules/@tootallnate/quickjs-emscripten": { "version": "0.23.0", "resolved": "https://registry.npmjs.org/@tootallnate/quickjs-emscripten/-/quickjs-emscripten-0.23.0.tgz", @@ -255,6 +649,11 @@ "node": ">=4" } }, + "node_modules/any-base": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/any-base/-/any-base-1.1.0.tgz", + "integrity": "sha512-uMgjozySS8adZZYePpaWs8cxB9/kdzmpX6SgJZ+wbz1K5eYk5QMYDVJaZKhxyIHUdnnJkfR7SVgStgH7LkGUyg==" + }, "node_modules/argparse": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz", @@ -308,6 +707,11 @@ "node": ">=10.0.0" } }, + "node_modules/bmp-js": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/bmp-js/-/bmp-js-0.1.0.tgz", + "integrity": "sha512-vHdS19CnY3hwiNdkaqk93DvjVLfbEcI8mys4UjuWrlX1haDmroo8o4xCzh4wD6DGV6HxRCyauwhHRqMTfERtjw==" + }, "node_modules/brace-expansion": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz", @@ -347,6 +751,14 @@ "node": "*" } }, + "node_modules/buffer-equal": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/buffer-equal/-/buffer-equal-0.0.1.tgz", + "integrity": "sha512-RgSV6InVQ9ODPdLWJ5UAqBqJBOg370Nz6ZQtRzpt6nUjc8v0St97uJ4PYC6NztqIScrAXafKM3mZPMygSe1ggA==", + "engines": { + "node": ">=0.4.0" + } + }, "node_modules/callsites": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/callsites/-/callsites-3.1.0.tgz", @@ -509,6 +921,11 @@ "resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.1159816.tgz", "integrity": "sha512-2cZlHxC5IlgkIWe2pSDmCrDiTzbSJWywjbDDnupOImEBcG31CQgBLV8wWE+5t+C4rimcjHsbzy7CBzf9oFjboA==" }, + "node_modules/dom-walk": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/dom-walk/-/dom-walk-0.1.2.tgz", + "integrity": 
"sha512-6QvTW9mrGeIegrFXdtQi9pk7O/nSK6lSdXW2eqUspN5LWD7UTji2Fqw5V2YLjBpHEoU9Xl/eUWNpDeZvoyOv2w==" + }, "node_modules/eastasianwidth": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/eastasianwidth/-/eastasianwidth-0.2.0.tgz", @@ -599,6 +1016,11 @@ "node": ">=0.10.0" } }, + "node_modules/exif-parser": { + "version": "0.1.12", + "resolved": "https://registry.npmjs.org/exif-parser/-/exif-parser-0.1.12.tgz", + "integrity": "sha512-c2bQfLNbMzLPmzQuOr8fy0csy84WmwnER81W88DzTp9CYNPJ6yzOj2EZAh9pywYpqHnshVLHQJ8WzldAyfY+Iw==" + }, "node_modules/extract-zip": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/extract-zip/-/extract-zip-2.0.1.tgz", @@ -631,6 +1053,22 @@ "pend": "~1.2.0" } }, + "node_modules/file-type": { + "version": "16.5.4", + "resolved": "https://registry.npmjs.org/file-type/-/file-type-16.5.4.tgz", + "integrity": "sha512-/yFHK0aGjFEgDJjEKP0pWCplsPFPhwyfwevf/pVxiN0tmE4L9LmwWxWukdJSHdoCli4VgQLehjJtwQBnqmsKcw==", + "dependencies": { + "readable-web-to-node-stream": "^3.0.0", + "strtok3": "^6.2.4", + "token-types": "^4.1.1" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sindresorhus/file-type?sponsor=1" + } + }, "node_modules/foreground-child": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/foreground-child/-/foreground-child-3.1.1.tgz", @@ -695,6 +1133,15 @@ "node": ">= 14" } }, + "node_modules/gifwrap": { + "version": "0.10.1", + "resolved": "https://registry.npmjs.org/gifwrap/-/gifwrap-0.10.1.tgz", + "integrity": "sha512-2760b1vpJHNmLzZ/ubTtNnEx5WApN/PYWJvXvgS+tL1egTTthayFYIQQNi136FLEDcN/IyEY2EcGpIITD6eYUw==", + "dependencies": { + "image-q": "^4.0.0", + "omggif": "^1.0.10" + } + }, "node_modules/glob": { "version": "10.3.4", "resolved": "https://registry.npmjs.org/glob/-/glob-10.3.4.tgz", @@ -716,6 +1163,15 @@ "url": "https://github.com/sponsors/isaacs" } }, + "node_modules/global": { + "version": "4.4.0", + "resolved": "https://registry.npmjs.org/global/-/global-4.4.0.tgz", + 
"integrity": "sha512-wv/LAoHdRE3BeTGz53FAamhGlPLhlssK45usmGFThIi4XqnBmjKQ16u+RNbP7WvigRZDxUsM0J3gcQ5yicaL0w==", + "dependencies": { + "min-document": "^2.19.0", + "process": "^0.11.10" + } + }, "node_modules/graceful-fs": { "version": "4.2.11", "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.11.tgz", @@ -772,6 +1228,19 @@ } ] }, + "node_modules/image-q": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/image-q/-/image-q-4.0.0.tgz", + "integrity": "sha512-PfJGVgIfKQJuq3s0tTDOKtztksibuUEbJQIYT3by6wctQo+Rdlh7ef4evJ5NCdxY4CfMbvFkocEwbl4BF8RlJw==", + "dependencies": { + "@types/node": "16.9.1" + } + }, + "node_modules/image-q/node_modules/@types/node": { + "version": "16.9.1", + "resolved": "https://registry.npmjs.org/@types/node/-/node-16.9.1.tgz", + "integrity": "sha512-QpLcX9ZSsq3YYUUnD3nFDY8H7wctAhQj/TFKL8Ya8v5fMm3CFXxo8zStsLAl780ltoYoo1WvKUVGBQK+1ifr7g==" + }, "node_modules/import-fresh": { "version": "3.3.0", "resolved": "https://registry.npmjs.org/import-fresh/-/import-fresh-3.3.0.tgz", @@ -787,6 +1256,11 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/inherits": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", + "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==" + }, "node_modules/ip": { "version": "1.1.8", "resolved": "https://registry.npmjs.org/ip/-/ip-1.1.8.tgz", @@ -805,11 +1279,25 @@ "node": ">=8" } }, + "node_modules/is-function": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/is-function/-/is-function-1.0.2.tgz", + "integrity": "sha512-lw7DUp0aWXYg+CBCN+JKkcE0Q2RayZnSvnZBlwgxHBQhqt5pZNVy4Ri7H9GmmXkdu7LUthszM+Tor1u/2iBcpQ==" + }, "node_modules/isexe": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz", "integrity": "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==" }, + 
"node_modules/isomorphic-fetch": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/isomorphic-fetch/-/isomorphic-fetch-3.0.0.tgz", + "integrity": "sha512-qvUtwJ3j6qwsF3jLxkZ72qCgjMysPzDfeV240JHiGZsANBYd+EEuu35v7dfrJ9Up0Ak07D7GGSkGhCHTqg/5wA==", + "dependencies": { + "node-fetch": "^2.6.1", + "whatwg-fetch": "^3.4.1" + } + }, "node_modules/jackspeak": { "version": "2.2.2", "resolved": "https://registry.npmjs.org/jackspeak/-/jackspeak-2.2.2.tgz", @@ -827,6 +1315,22 @@ "@pkgjs/parseargs": "^0.11.0" } }, + "node_modules/jimp": { + "version": "0.22.10", + "resolved": "https://registry.npmjs.org/jimp/-/jimp-0.22.10.tgz", + "integrity": "sha512-lCaHIJAgTOsplyJzC1w/laxSxrbSsEBw4byKwXgUdMmh+ayPsnidTblenQm+IvhIs44Gcuvlb6pd2LQ0wcKaKg==", + "dependencies": { + "@jimp/custom": "^0.22.10", + "@jimp/plugins": "^0.22.10", + "@jimp/types": "^0.22.10", + "regenerator-runtime": "^0.13.3" + } + }, + "node_modules/jpeg-js": { + "version": "0.4.4", + "resolved": "https://registry.npmjs.org/jpeg-js/-/jpeg-js-0.4.4.tgz", + "integrity": "sha512-WZzeDOEtTOBK4Mdsar0IqEU5sMr3vSV2RqkAIzUEV2BHnUfKGyswWFPFwK5EeDo93K3FohSHbLAjj0s1Wzd+dg==" + }, "node_modules/js-tokens": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", @@ -867,6 +1371,21 @@ "resolved": "https://registry.npmjs.org/lines-and-columns/-/lines-and-columns-1.2.4.tgz", "integrity": "sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==" }, + "node_modules/load-bmfont": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/load-bmfont/-/load-bmfont-1.4.1.tgz", + "integrity": "sha512-8UyQoYmdRDy81Brz6aLAUhfZLwr5zV0L3taTQ4hju7m6biuwiWiJXjPhBJxbUQJA8PrkvJ/7Enqmwk2sM14soA==", + "dependencies": { + "buffer-equal": "0.0.1", + "mime": "^1.3.4", + "parse-bmfont-ascii": "^1.0.3", + "parse-bmfont-binary": "^1.0.5", + "parse-bmfont-xml": "^1.1.4", + "phin": "^2.9.1", + "xhr": "^2.0.1", + "xtend": "^4.0.0" + } + }, 
"node_modules/lru-cache": { "version": "7.18.3", "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-7.18.3.tgz", @@ -875,6 +1394,25 @@ "node": ">=12" } }, + "node_modules/mime": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/mime/-/mime-1.6.0.tgz", + "integrity": "sha512-x0Vn8spI+wuJ1O6S7gnbaQg8Pxh4NNHb7KSINmEWKiPE4RKOplvijn+NkmYmmRgP68mc70j2EbeTFRsrswaQeg==", + "bin": { + "mime": "cli.js" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/min-document": { + "version": "2.19.0", + "resolved": "https://registry.npmjs.org/min-document/-/min-document-2.19.0.tgz", + "integrity": "sha512-9Wy1B3m3f66bPPmU5hdA4DR4PB2OfDU/+GS3yAB7IQozE3tqXaVv2zOjgla7MEGSRv95+ILmOuvhLkOK6wJtCQ==", + "dependencies": { + "dom-walk": "^0.1.0" + } + }, "node_modules/minimatch": { "version": "9.0.3", "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.3.tgz", @@ -1000,6 +1538,11 @@ } } }, + "node_modules/omggif": { + "version": "1.0.10", + "resolved": "https://registry.npmjs.org/omggif/-/omggif-1.0.10.tgz", + "integrity": "sha512-LMJTtvgc/nugXj0Vcrrs68Mn2D1r0zf630VNtqtpI1FEO7e+O9FP4gqs9AcnBaSEeoHIPm28u6qgPR0oyEpGSw==" + }, "node_modules/once": { "version": "1.4.0", "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", @@ -1039,6 +1582,11 @@ "node": ">= 14" } }, + "node_modules/pako": { + "version": "1.0.11", + "resolved": "https://registry.npmjs.org/pako/-/pako-1.0.11.tgz", + "integrity": "sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==" + }, "node_modules/parent-module": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz", @@ -1050,6 +1598,30 @@ "node": ">=6" } }, + "node_modules/parse-bmfont-ascii": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/parse-bmfont-ascii/-/parse-bmfont-ascii-1.0.6.tgz", + "integrity": "sha512-U4RrVsUFCleIOBsIGYOMKjn9PavsGOXxbvYGtMOEfnId0SVNsgehXh1DxUdVPLoxd5mvcEtvmKs2Mmf0Mpa1ZA==" + }, + 
"node_modules/parse-bmfont-binary": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/parse-bmfont-binary/-/parse-bmfont-binary-1.0.6.tgz", + "integrity": "sha512-GxmsRea0wdGdYthjuUeWTMWPqm2+FAd4GI8vCvhgJsFnoGhTrLhXDDupwTo7rXVAgaLIGoVHDZS9p/5XbSqeWA==" + }, + "node_modules/parse-bmfont-xml": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/parse-bmfont-xml/-/parse-bmfont-xml-1.1.4.tgz", + "integrity": "sha512-bjnliEOmGv3y1aMEfREMBJ9tfL3WR0i0CKPj61DnSLaoxWR3nLrsQrEbCId/8rF4NyRF0cCqisSVXyQYWM+mCQ==", + "dependencies": { + "xml-parse-from-string": "^1.0.0", + "xml2js": "^0.4.5" + } + }, + "node_modules/parse-headers": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/parse-headers/-/parse-headers-2.0.5.tgz", + "integrity": "sha512-ft3iAoLOB/MlwbNXgzy43SWGP6sQki2jQvAyBg/zDFAgr9bfNWZIUj42Kw2eJIl8kEi4PbgE6U1Zau/HwI75HA==" + }, "node_modules/parse-json": { "version": "5.2.0", "resolved": "https://registry.npmjs.org/parse-json/-/parse-json-5.2.0.tgz", @@ -1106,11 +1678,63 @@ "node": ">=8" } }, + "node_modules/peek-readable": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/peek-readable/-/peek-readable-4.1.0.tgz", + "integrity": "sha512-ZI3LnwUv5nOGbQzD9c2iDG6toheuXSZP5esSHBjopsXH4dg19soufvpUGA3uohi5anFtGb2lhAVdHzH6R/Evvg==", + "engines": { + "node": ">=8" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Borewit" + } + }, "node_modules/pend": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz", "integrity": "sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==" }, + "node_modules/phin": { + "version": "2.9.3", + "resolved": "https://registry.npmjs.org/phin/-/phin-2.9.3.tgz", + "integrity": "sha512-CzFr90qM24ju5f88quFC/6qohjC144rehe5n6DH900lgXmUe86+xCKc10ev56gRKC4/BkHUoG4uSiQgBiIXwDA==" + }, + "node_modules/pixelmatch": { + "version": "4.0.2", + "resolved": 
"https://registry.npmjs.org/pixelmatch/-/pixelmatch-4.0.2.tgz", + "integrity": "sha512-J8B6xqiO37sU/gkcMglv6h5Jbd9xNER7aHzpfRdNmV4IbQBzBpe4l9XmbG+xPF/znacgu2jfEw+wHffaq/YkXA==", + "dependencies": { + "pngjs": "^3.0.0" + }, + "bin": { + "pixelmatch": "bin/pixelmatch" + } + }, + "node_modules/pixelmatch/node_modules/pngjs": { + "version": "3.4.0", + "resolved": "https://registry.npmjs.org/pngjs/-/pngjs-3.4.0.tgz", + "integrity": "sha512-NCrCHhWmnQklfH4MtJMRjZ2a8c80qXeMlQMv2uVp9ISJMTt562SbGd6n2oq0PaPgKm7Z6pL9E2UlLIhC+SHL3w==", + "engines": { + "node": ">=4.0.0" + } + }, + "node_modules/pngjs": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/pngjs/-/pngjs-6.0.0.tgz", + "integrity": "sha512-TRzzuFRRmEoSW/p1KVAmiOgPco2Irlah+bGFCeNfJXxxYGwSw7YwAOAcd7X28K/m5bjBWKsC29KyoMfHbypayg==", + "engines": { + "node": ">=12.13.0" + } + }, + "node_modules/process": { + "version": "0.11.10", + "resolved": "https://registry.npmjs.org/process/-/process-0.11.10.tgz", + "integrity": "sha512-cdGef/drWFoydD1JsMzuFf8100nZl+GT+yacc2bEced5f9Rjk4z+WtFUTBu9PhOi9j/jfmBPu0mMEY4wIdAF8A==", + "engines": { + "node": ">= 0.6.0" + } + }, "node_modules/progress": { "version": "2.0.3", "resolved": "https://registry.npmjs.org/progress/-/progress-2.0.3.tgz", @@ -1186,6 +1810,34 @@ "resolved": "https://registry.npmjs.org/queue-tick/-/queue-tick-1.0.1.tgz", "integrity": "sha512-kJt5qhMxoszgU/62PLP1CJytzd2NKetjSRnyuj31fDd3Rlcz3fzlFdFLD1SItunPwyqEOkca6GbV612BWfaBag==" }, + "node_modules/readable-stream": { + "version": "3.6.2", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz", + "integrity": "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==", + "dependencies": { + "inherits": "^2.0.3", + "string_decoder": "^1.1.1", + "util-deprecate": "^1.0.1" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/readable-web-to-node-stream": { + "version": "3.0.2", + "resolved": 
"https://registry.npmjs.org/readable-web-to-node-stream/-/readable-web-to-node-stream-3.0.2.tgz", + "integrity": "sha512-ePeK6cc1EcKLEhJFt/AebMCLL+GgSKhuygrZ/GLaKZYEecIgIECf4UaUuaByiGtzckwR4ain9VzUh95T1exYGw==", + "dependencies": { + "readable-stream": "^3.6.0" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Borewit" + } + }, "node_modules/regenerator-runtime": { "version": "0.13.11", "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.11.tgz", @@ -1242,6 +1894,11 @@ } ] }, + "node_modules/sax": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/sax/-/sax-1.2.4.tgz", + "integrity": "sha512-NqVDv9TpANUjFm0N8uM5GxL36UgKi9/atZw+x7YFnQ8ckwFGKrl4xX4yWtrey3UJm5nP1kUbnYgLopqWNSRhWw==" + }, "node_modules/shebang-command": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz", @@ -1388,6 +2045,22 @@ "node": ">=8" } }, + "node_modules/strtok3": { + "version": "6.3.0", + "resolved": "https://registry.npmjs.org/strtok3/-/strtok3-6.3.0.tgz", + "integrity": "sha512-fZtbhtvI9I48xDSywd/somNqgUHl2L2cstmXCCif0itOf96jeW18MBSyrLuNicYQVkvpOxkZtkzujiTJ9LW5Jw==", + "dependencies": { + "@tokenizer/token": "^0.3.0", + "peek-readable": "^4.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Borewit" + } + }, "node_modules/supports-color": { "version": "5.5.0", "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-5.5.0.tgz", @@ -1424,12 +2097,38 @@ "resolved": "https://registry.npmjs.org/through/-/through-2.3.8.tgz", "integrity": "sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg==" }, + "node_modules/timm": { + "version": "1.7.1", + "resolved": "https://registry.npmjs.org/timm/-/timm-1.7.1.tgz", + "integrity": 
"sha512-IjZc9KIotudix8bMaBW6QvMuq64BrJWFs1+4V0lXwWGQZwH+LnX87doAYhem4caOEusRP9/g6jVDQmZ8XOk1nw==" + }, "node_modules/timsort": { "version": "0.3.0", "resolved": "https://registry.npmjs.org/timsort/-/timsort-0.3.0.tgz", "integrity": "sha512-qsdtZH+vMoCARQtyod4imc2nIJwg9Cc7lPRrw9CzF8ZKR0khdr8+2nX80PBhET3tcyTtJDxAffGh2rXH4tyU8A==", "peer": true }, + "node_modules/tinycolor2": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/tinycolor2/-/tinycolor2-1.6.0.tgz", + "integrity": "sha512-XPaBkWQJdsf3pLKJV9p4qN/S+fm2Oj8AIPo1BTUhg5oxkvm9+SVEGFdhyOz7tTdUTfvxMiAs4sp6/eZO2Ew+pw==" + }, + "node_modules/token-types": { + "version": "4.2.1", + "resolved": "https://registry.npmjs.org/token-types/-/token-types-4.2.1.tgz", + "integrity": "sha512-6udB24Q737UD/SDsKAHI9FCRP7Bqc9D/MQUV02ORQg5iskjtLJlZJNdN4kKtcdtwCeWIwIHDGaUsTsCCAa8sFQ==", + "dependencies": { + "@tokenizer/token": "^0.3.0", + "ieee754": "^1.2.1" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Borewit" + } + }, "node_modules/tr46": { "version": "0.0.3", "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz", @@ -1457,6 +2156,19 @@ "node": ">= 4.0.0" } }, + "node_modules/utif2": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/utif2/-/utif2-4.1.0.tgz", + "integrity": "sha512-+oknB9FHrJ7oW7A2WZYajOcv4FcDR4CfoGB0dPNfxbi4GO05RRnFmt5oa23+9w32EanrYcSJWspUiJkLMs+37w==", + "dependencies": { + "pako": "^1.0.11" + } + }, + "node_modules/util-deprecate": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", + "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==" + }, "node_modules/uuid": { "version": "9.0.0", "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.0.tgz", @@ -1522,6 +2234,11 @@ "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz", "integrity": 
"sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==" }, + "node_modules/whatwg-fetch": { + "version": "3.6.17", + "resolved": "https://registry.npmjs.org/whatwg-fetch/-/whatwg-fetch-3.6.17.tgz", + "integrity": "sha512-c4ghIvG6th0eudYwKZY5keb81wtFz9/WeAHAoy8+r18kcWlitUIrmGFQ2rWEl4UCKUilD3zCLHOIPheHx5ypRQ==" + }, "node_modules/whatwg-url": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz", @@ -1663,6 +2380,50 @@ } } }, + "node_modules/xhr": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/xhr/-/xhr-2.6.0.tgz", + "integrity": "sha512-/eCGLb5rxjx5e3mF1A7s+pLlR6CGyqWN91fv1JgER5mVWg1MZmlhBvy9kjcsOdRk8RrIujotWyJamfyrp+WIcA==", + "dependencies": { + "global": "~4.4.0", + "is-function": "^1.0.1", + "parse-headers": "^2.0.0", + "xtend": "^4.0.0" + } + }, + "node_modules/xml-parse-from-string": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/xml-parse-from-string/-/xml-parse-from-string-1.0.1.tgz", + "integrity": "sha512-ErcKwJTF54uRzzNMXq2X5sMIy88zJvfN2DmdoQvy7PAFJ+tPRU6ydWuOKNMyfmOjdyBQTFREi60s0Y0SyI0G0g==" + }, + "node_modules/xml2js": { + "version": "0.4.23", + "resolved": "https://registry.npmjs.org/xml2js/-/xml2js-0.4.23.tgz", + "integrity": "sha512-ySPiMjM0+pLDftHgXY4By0uswI3SPKLDw/i3UXbnO8M/p28zqexCUoPmQFrYD+/1BzhGJSs2i1ERWKJAtiLrug==", + "dependencies": { + "sax": ">=0.6.0", + "xmlbuilder": "~11.0.0" + }, + "engines": { + "node": ">=4.0.0" + } + }, + "node_modules/xmlbuilder": { + "version": "11.0.1", + "resolved": "https://registry.npmjs.org/xmlbuilder/-/xmlbuilder-11.0.1.tgz", + "integrity": "sha512-fDlsI/kFEx7gLvbecc0/ohLG50fugQp8ryHzMTuW9vSa1GJ0XYWKnhsUx7oie3G98+r56aTQIUB4kht42R3JvA==", + "engines": { + "node": ">=4.0" + } + }, + "node_modules/xtend": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz", + "integrity": 
"sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ==", + "engines": { + "node": ">=0.4" + } + }, "node_modules/y18n": { "version": "5.0.8", "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz", diff --git a/graph-visualization/package.json b/graph-visualization/package.json index 3ee12ace2..a8eee348a 100644 --- a/graph-visualization/package.json +++ b/graph-visualization/package.json @@ -11,6 +11,7 @@ "license": "MIT", "dependencies": { "glob": "^10.3.3", + "jimp": "^0.22.10", "neovis.js": "^2.1.0", "puppeteer": "^21.0.1" } diff --git a/graph-visualization/renderVisualizations.js b/graph-visualization/renderVisualizations.js index 04924883d..c069609bd 100644 --- a/graph-visualization/renderVisualizations.js +++ b/graph-visualization/renderVisualizations.js @@ -1,7 +1,8 @@ import puppeteer, { Browser } from "puppeteer"; import { basename, dirname } from "path"; import { globSync } from "glob"; -import { existsSync, mkdirSync } from "fs"; +import { existsSync, mkdirSync, writeFileSync } from "fs"; +import jimp from "jimp"; // __filename and __dirname don't exist when using es6 modules. // So they will be derived from the nodejs command line argument index 1 (script filename). @@ -9,6 +10,18 @@ const indexOfScriptFilePathArgument = 1; const __filename = process.argv[indexOfScriptFilePathArgument]; const __dirname = dirname(__filename); +/** + * Crops the image in the buffer so that there is no empty frame around it. + * @param {Buffer} buffer + * @returns Buffer + */ +const autoCropImageBuffer = async (buffer) => { + return await jimp + .read(buffer) + .then((image) => image.autocrop()) + .then((image) => image.getBufferAsync(jimp.MIME_PNG)); +}; + /** * Converts a camel case string into an kebab case string separated with dashes. 
* Reference: {@link https://stackoverflow.com/questions/54246477/how-to-convert-camelcase-to-snake-case} @@ -25,24 +38,26 @@ const camelToKebabCase = (str) => str.replace(/[A-Z]/g, (letter) => `-${letter.t */ const takeCanvasScreenshots = async (browser, htmlFilename) => { const page = await browser.newPage(); - await page.setViewport({ width: 1600, height: 1000, isMobile: false, isLandscape: true, hasTouch: false, deviceScaleFactor: 1 }); + await page.setViewport({ width: 1500, height: 1000, isMobile: false, isLandscape: true, hasTouch: false, deviceScaleFactor: 1 }); console.log(`Loading ${htmlFilename}`); await page.goto(`file://${htmlFilename}`); - + // Login with Neo4j server password from the environment variable NEO4J_INITIAL_PASSWORD - const loginButton = await page.waitForSelector('#neo4j-server-login'); - await page.type('#neo4j-server-password', process.env.NEO4J_INITIAL_PASSWORD); + const loginButton = await page.waitForSelector("#neo4j-server-login"); + await page.type("#neo4j-server-password", process.env.NEO4J_INITIAL_PASSWORD); await loginButton.click(); // Wait for the graph visualization to be rendered onto a HTML5 canvas - await page.waitForSelector("div canvas"); + console.log(`Waiting for visualizations to be finished`); + await page.waitForSelector(".visualization-finished", { timeout: 90_000 }); // Get all HTML canvas tag elements - const canvasElements = await page.$$("div canvas"); + const canvasElements = await page.$$("canvas"); if (canvasElements.length <= 0) { - console.error(`No elements with CSS selector 'div canvas' found in ${htmlFilename}`); + console.error(`No elements with CSS selector 'canvas' found in ${htmlFilename}`); } + console.log(`Found ${canvasElements.length} visualizations`); // Take a png screenshot of every canvas element and save them with increasing indices const reportName = basename(htmlFilename, ".html"); @@ -50,12 +65,18 @@ const takeCanvasScreenshots = async (browser, htmlFilename) => { if 
(!existsSync(directoryName)) { mkdirSync(directoryName); } - let index = 1; await Promise.all( - canvasElements.map(async (canvasElement) => { - console.log(`Taking screenshot ${reportName} of canvas ${index} in ${htmlFilename} ...`); - await canvasElement.screenshot({ path: `./${directoryName}/${reportName}-${index}.png`, omitBackground: true }); - index++; + Array.from(canvasElements).map(async (canvasElement, index) => { + console.log(`Exporting image ${reportName}-${index}.png...`); + const dataUrl = await page.evaluate(async (canvasElement) => { + return canvasElement.toDataURL(); + }, canvasElement); + let data = Buffer.from(dataUrl.split(",").pop(), "base64"); + console.log(`Cropping image ${reportName}-${index}.png...`); + data = await autoCropImageBuffer(data); + writeFileSync(`./${directoryName}/${reportName}-${index}.png`, data); + // console.log(`Taking screenshot ${reportName} of canvas ${index} in ${htmlFilename} of element...`); + // await canvasElement.screenshot({ path: `./${directoryName}/${reportName}-${index}.png`, omitBackground: true }); }) ); }; diff --git a/graph-visualization/vis-configuration-presets.js b/graph-visualization/vis-configuration-presets.js new file mode 100644 index 000000000..47d48a28a --- /dev/null +++ b/graph-visualization/vis-configuration-presets.js @@ -0,0 +1,45 @@ +function hierarchicalHexagons() { + return { + nodes: { + shape: "hexagon", + shadow: false, + font: { + strokeWidth: 4, + strokeColor: "#F2F2FF", + size: 11, + }, + size: 22, + widthConstraint: { + maximum: 60, + }, + }, + edges: { + arrows: { + to: { + enabled: true, + scaleFactor: 0.5, + }, + }, + scaling: { + max: 8, + }, + }, + physics: { + hierarchicalRepulsion: { + nodeDistance: 200, // 100 + centralGravity: 0.5, // 0.2 + springLength: 200, // 200 + springConstant: 0.06, // 0.05 + damping: 0.09, // 0.09 + avoidOverlap: 1, // 0 + }, + solver: "hierarchicalRepulsion", // barnesHut + }, + layout: { + hierarchical: { + enabled: true, + sortMethod: 
"directed", + }, + }, + }; +} diff --git a/graph-visualization/visualization-pagination.js b/graph-visualization/visualization-pagination.js new file mode 100644 index 000000000..8a40f4814 --- /dev/null +++ b/graph-visualization/visualization-pagination.js @@ -0,0 +1,72 @@ +/** + * Splits a large Graph into multiple smaller ones with query pagination to be able visualize them. + * Uses {@link https://github.com/neo4j-contrib/neovis.js|neovis.js} + * on top of {@link https://visjs.github.io/vis-network/docs/network/|vis-network.js}. + * + * @param {Object} parameters - The parameters for the paginated visualization using neovis.js on top of vis-network.js. + * @param {string} parameters.containerElementId - The id of the main container element (typically a div) where all visualizations will be drawn into. + * @param {Object} parameters.neoVizConfiguration - {@link https://neo4j-contrib.github.io/neovis.js/interfaces/NeovisConfig.html|NeovisConfig} object. The containerId will be overwritten so it can be left out or filled with an empty string. + * @param {string} [parameters.maxVisualizations=100] - Maximal number of visualizations (pages) + * @param {string} [parameters.recordsPerVisualization=160] - Maximal numbers of records per visualization (page/block size) + * @param {string} [parameters.idPrefixOfIndexedVisualizationElement="vis"] - Element id prefix for every indexed visualization element. It is then followed by the index number. 
+ * @param {string} [parameters.classOfIndexedVisualizationElement="indexedVisualization"] - CSS class name for every indexed visualization element + * @param {string} [parameters.classOfFinishedVisualization="visualization-finished"] - CSS class name for finished visualizations (might be successful or failed) + * @param {string} [parameters.classOfFailedVisualization="visualization-failed"] - CSS class name for failed visualizations containing an error message + */ +function paginatedGraphVisualization({ + containerElementId, + neoVizConfiguration, + maxVisualizations = 100, + recordsPerVisualization = 160, + idPrefixOfIndexedVisualizationElement = "vis", + classOfIndexedVisualizationElement = "indexedVisualization", + classOfFinishedVisualization = "visualization-finished", + classOfFailedVisualization = "visualization-failed", +}) { + /** + * Marks the given element as finished when the visualization is completed. + * @param {Element} indexedVisualizationElement + */ + function markVisualizationAsFinished(indexedVisualizationElement) { + indexedVisualizationElement.classList.add(classOfFinishedVisualization); + const unfinishedVisualizations = document.querySelectorAll(`.${classOfIndexedVisualizationElement}:not(.${classOfFinishedVisualization})`); + if (unfinishedVisualizations.length === 0) { + indexedVisualizationElement.parentElement.classList.add(classOfFinishedVisualization); + } + } + + const containerElement = document.getElementById(containerElementId); + + for (let index = 0; index < maxVisualizations; index++) { + const indexedVisualizationContainer = document.createElement("div"); + indexedVisualizationContainer.id = `${idPrefixOfIndexedVisualizationElement}-${index}`; + indexedVisualizationContainer.classList.add(classOfIndexedVisualizationElement); + containerElement.appendChild(indexedVisualizationContainer); + + const config = { ...neoVizConfiguration, containerId: indexedVisualizationContainer.id }; + const neoViz = new NeoVis.default(config); + 
+ neoViz.registerOnEvent(NeoVis.NeoVisEvents.CompletionEvent, (event) => { + if (event.recordCount == 0) { + indexedVisualizationContainer.remove(); // remove an empty canvas + markVisualizationAsFinished(indexedVisualizationContainer); + } else { + setTimeout(() => { + neoViz.stabilize(); + markVisualizationAsFinished(indexedVisualizationContainer); + }, 5000); + } + }); + neoViz.registerOnEvent(NeoVis.NeoVisEvents.ErrorEvent, (event) => { + indexedVisualizationContainer.classList.add(classOfFailedVisualization); + indexedVisualizationContainer.textContent = event.error.message; + markVisualizationAsFinished(indexedVisualizationContainer); + }); + const parameters = { + blockSize: recordsPerVisualization, + startIndex: index * recordsPerVisualization, + endIndex: (index + 1) * recordsPerVisualization, + }; + neoViz.render(undefined, parameters); + } +} From 2e20d1124489f3d652c47d2d174c8e429e5a0dc0 Mon Sep 17 00:00:00 2001 From: JohT Date: Mon, 28 Aug 2023 12:54:30 +0200 Subject: [PATCH 08/15] Improve error handling by writing to err out --- scripts/executeQuery.sh | 12 ++++++------ scripts/executeQueryFunctions.sh | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/executeQuery.sh b/scripts/executeQuery.sh index 5b77a3301..47070f9c5 100755 --- a/scripts/executeQuery.sh +++ b/scripts/executeQuery.sh @@ -19,7 +19,7 @@ NEO4J_HTTP_TRANSACTION_ENDPOINT=${NEO4J_HTTP_TRANSACTION_ENDPOINT:-"db/neo4j/tx/ # Check if environment variable is set if [ -z "${NEO4J_INITIAL_PASSWORD}" ]; then - echo "Requires environment variable NEO4J_INITIAL_PASSWORD to be set first. Use 'export NEO4J_INITIAL_PASSWORD='." + echo "Requires environment variable NEO4J_INITIAL_PASSWORD to be set first. Use 'export NEO4J_INITIAL_PASSWORD='." 
>&2 exit 1 fi @@ -29,9 +29,9 @@ no_source_reference=false # Input Arguments: Function to print usage information print_usage() { - echo "Usage: $0 [--no-source-reference-column]" - echo "Options:" - echo " --no-source-reference-column: Exclude the source reference column" + echo "Usage: $0 [--no-source-reference-column]" >&2 + echo "Options:" >&2 + echo " --no-source-reference-column: Exclude the source reference column" >&2 } # Input Arguments: Parse the command-line arguments @@ -51,12 +51,12 @@ while [[ $# -gt 0 ]]; do # Input Arguments: Check the first input argument to be a valid file if [ ! -f "${cypher_query_file_name}" ] ; then - echo "Error: Please provide a valid filename." + echo "Error: Please provide a valid filename." >&2 print_usage exit 1 fi else - echo "Error: Unknown option: $key" + echo "Error: Unknown option: $key" >&2 print_usage exit 1 fi diff --git a/scripts/executeQueryFunctions.sh b/scripts/executeQueryFunctions.sh index ea84fcb4d..d8ac4964d 100644 --- a/scripts/executeQueryFunctions.sh +++ b/scripts/executeQueryFunctions.sh @@ -52,7 +52,7 @@ execute_cypher_http_expect_results() { results=$( execute_cypher_http ${cypherFileName} | wc -l ) results=$((results - 1)) if [[ "$results" -lt 1 ]]; then - echo "$(basename -- "${cypherFileName}") (via http) Error: Expected at least one entry but was ${results}" + echo "$(basename -- "${cypherFileName}") (via http) Error: Expected at least one entry but was ${results}" >&2 exit 1 fi } @@ -64,7 +64,7 @@ execute_cypher_shell() { # Check if NEO4J_BIN exists if [ ! -d "${NEO4J_BIN}" ] ; then - echo "executeQuery: Error: Neo4j Binary Directory <${NEO4J_BIN}> doesn't exist. Please run setupNeo4j.sh first." + echo "executeQuery: Error: Neo4j Binary Directory <${NEO4J_BIN}> doesn't exist. Please run setupNeo4j.sh first." 
>&2 exit 1 fi @@ -95,7 +95,7 @@ execute_cypher_shell_expect_results() { results=$( execute_cypher_shell ${cypherFileName} | wc -l ) results=$((results - 2)) if [[ "$results" -lt 1 ]]; then - echo "$(basename -- "${cypherFileName}") (via cypher-shell) Error: Expected at least one entry but was ${results}" + echo "$(basename -- "${cypherFileName}") (via cypher-shell) Error: Expected at least one entry but was ${results}" >&2 exit 1 fi } \ No newline at end of file From 7d704f8216b521c14aa841812a055807fc016fff Mon Sep 17 00:00:00 2001 From: JohT Date: Sat, 26 Aug 2023 18:02:34 +0200 Subject: [PATCH 09/15] Optimize external dependencies report for > scale --- .../External_package_levels.cypher | 16 + .../External_package_name_elements.cypher | 11 + .../External_package_usage_overall.cypher | 24 +- ...e_per_artifact_and_external_package.cypher | 50 + ...age_usage_per_artifact_distribution.cypher | 39 + ...age_per_artifact_package_aggregated.cypher | 81 ++ ...l_package_usage_per_artifact_sorted.cypher | 40 + ...ckage_usage_per_artifact_sorted_top.cypher | 52 + ...ge_usage_per_internal_package_count.cypher | 45 + ...package_usage_per_type_distribution.cypher | 29 - .../External_package_usage_spread.cypher | 65 + ..._second_level_package_usage_overall.cypher | 27 + ...e_per_artifact_and_external_package.cypher | 50 + ...l_second_level_package_usage_spread.cypher | 65 + jupyter/ExternalDependencies.ipynb | 1151 +++++++++++++++-- scripts/reports/ExternalDependenciesCsv.sh | 10 +- 16 files changed, 1620 insertions(+), 135 deletions(-) create mode 100644 cypher/External_Dependencies/External_package_levels.cypher create mode 100644 cypher/External_Dependencies/External_package_name_elements.cypher create mode 100644 cypher/External_Dependencies/External_package_usage_per_artifact_and_external_package.cypher create mode 100644 cypher/External_Dependencies/External_package_usage_per_artifact_distribution.cypher create mode 100644 
cypher/External_Dependencies/External_package_usage_per_artifact_package_aggregated.cypher create mode 100644 cypher/External_Dependencies/External_package_usage_per_artifact_sorted.cypher create mode 100644 cypher/External_Dependencies/External_package_usage_per_artifact_sorted_top.cypher create mode 100644 cypher/External_Dependencies/External_package_usage_per_internal_package_count.cypher delete mode 100644 cypher/External_Dependencies/External_package_usage_per_type_distribution.cypher create mode 100644 cypher/External_Dependencies/External_package_usage_spread.cypher create mode 100644 cypher/External_Dependencies/External_second_level_package_usage_overall.cypher create mode 100644 cypher/External_Dependencies/External_second_level_package_usage_per_artifact_and_external_package.cypher create mode 100644 cypher/External_Dependencies/External_second_level_package_usage_spread.cypher diff --git a/cypher/External_Dependencies/External_package_levels.cypher b/cypher/External_Dependencies/External_package_levels.cypher new file mode 100644 index 000000000..49fdd696d --- /dev/null +++ b/cypher/External_Dependencies/External_package_levels.cypher @@ -0,0 +1,16 @@ +// External package levels + +MATCH (externalType:ExternalType) + WITH replace(externalType.fqn, '.' 
+ externalType.name, '') AS externalPackageName + WITH count(DISTINCT split(externalPackageName, '.')[0]) AS externalFirstLevelPackages + ,count(DISTINCT split(externalPackageName, '.')[0..1]) AS externalSecondLevelPackages + ,count(DISTINCT split(externalPackageName, '.')[0..2]) AS externalThirdLevelPackages + ,count(DISTINCT split(externalPackageName, '.')[0..3]) AS externalForthLevelPackages + ,count(DISTINCT split(externalPackageName, '.')[0..4]) AS externalFifthLevelPackages + ,count(DISTINCT externalPackageName) AS allExternalPackages +RETURN externalFirstLevelPackages + ,externalSecondLevelPackages + ,externalThirdLevelPackages + ,externalForthLevelPackages + ,externalFifthLevelPackages + ,allExternalPackages \ No newline at end of file diff --git a/cypher/External_Dependencies/External_package_name_elements.cypher b/cypher/External_Dependencies/External_package_name_elements.cypher new file mode 100644 index 000000000..20079be7e --- /dev/null +++ b/cypher/External_Dependencies/External_package_name_elements.cypher @@ -0,0 +1,11 @@ +// External package name elements + +MATCH (externalType:ExternalType) + WITH replace(externalType.fqn, '.' 
+ externalType.name, '') AS packageName + WITH size(split(packageName,'.')) AS packageNameElements + ,count(DISTINCT packageName) AS packageCount + ,collect(DISTINCT packageName)[0..19] AS somePackageNames +RETURN packageNameElements + ,packageCount + ,somePackageNames +ORDER BY packageNameElements \ No newline at end of file diff --git a/cypher/External_Dependencies/External_package_usage_overall.cypher b/cypher/External_Dependencies/External_package_usage_overall.cypher index 35835e2c5..b026375f3 100644 --- a/cypher/External_Dependencies/External_package_usage_overall.cypher +++ b/cypher/External_Dependencies/External_package_usage_overall.cypher @@ -1,17 +1,27 @@ // External package usage overall - MATCH (type:Type) - WITH count(type) as allTypes, collect(type) as typeList + MATCH (package:Package)-[:CONTAINS]->(type:Type) + WITH count(DISTINCT type.fqn) AS allTypes + ,count(DISTINCT package.fqn) AS allPackages + ,collect(type) as typeList UNWIND typeList AS type MATCH (type)-[externalDependency:DEPENDS_ON]->(externalType:ExternalType) + MATCH (typePackage:Package)-[:CONTAINS]->(type) WITH allTypes + ,allPackages ,replace(externalType.fqn, '.' 
+ externalType.name, '') AS externalPackageName - ,count(externalDependency) AS numberOfExternalTypeCaller - ,sum(externalDependency.weight) AS numberOfExternalTypeCalls + ,count(DISTINCT typePackage.fqn) AS numberOfExternalCallerPackages + ,count(DISTINCT type.fqn) AS numberOfExternalCallerTypes + ,count(externalDependency) AS numberOfExternalTypeCalls + ,sum(externalDependency.weight) AS numberOfExternalTypeCallsWeighted ,collect(DISTINCT externalType.name) AS externalTypeNames +where numberOfExternalTypeCalls <> numberOfExternalCallerTypes RETURN externalPackageName - ,numberOfExternalTypeCaller + ,numberOfExternalCallerPackages + ,numberOfExternalCallerTypes ,numberOfExternalTypeCalls + ,numberOfExternalTypeCallsWeighted + ,allPackages ,allTypes - ,externalTypeNames - ORDER BY numberOfExternalTypeCaller DESC, externalPackageName ASC \ No newline at end of file + ,externalTypeNames[0..9] AS tenExternalTypeNames + ORDER BY numberOfExternalCallerPackages DESC, externalPackageName ASC \ No newline at end of file diff --git a/cypher/External_Dependencies/External_package_usage_per_artifact_and_external_package.cypher b/cypher/External_Dependencies/External_package_usage_per_artifact_and_external_package.cypher new file mode 100644 index 000000000..622b8f9bb --- /dev/null +++ b/cypher/External_Dependencies/External_package_usage_per_artifact_and_external_package.cypher @@ -0,0 +1,50 @@ +// External package usage per artifact and external package + +// Get the overall artifact statistics first + MATCH (artifact:Artifact)-[:CONTAINS]->(package:Package) + MATCH (package)-[:CONTAINS]->(type:Type) + OPTIONAL MATCH (packageUsingExternal:Package)-[:CONTAINS]->(type)-[:DEPENDS_ON]->(external:ExternalType) + WITH replace(last(split(artifact.fileName, '/')), '.jar', '') AS artifactName + ,count(DISTINCT package.fqn) AS artifactPackages + ,count(DISTINCT type.fqn) AS artifactTypes + ,count(DISTINCT replace(external.fqn, '.' 
+ external.name, '')) AS artifactExternalPackages + ,count(DISTINCT packageUsingExternal.fqn) AS artifactExternalCallingPackages + ,collect(type) AS typeList + WITH artifactName + ,artifactPackages + ,artifactTypes + ,artifactExternalPackages + ,artifactExternalCallingPackages + ,round((100.0 / artifactPackages * artifactExternalCallingPackages), 2) AS artifactExternalCallingPackagesRate + ,typeList +// Get the external dependencies for each internal type +UNWIND typeList AS type + MATCH (type)-[:DEPENDS_ON]->(externalType:ExternalType) + MATCH (typePackage:Package)-[:CONTAINS]->(type) +// Optionally filter out dependencies to external annotations +// WHERE NOT externalType:ExternalAnnotation + WITH artifactName + ,artifactPackages + ,artifactTypes + ,artifactExternalPackages + ,artifactExternalCallingPackages + ,artifactExternalCallingPackagesRate + ,typePackage.fqn AS packageName + ,type.fqn AS fullTypeName + ,replace(externalType.fqn, '.' + externalType.name, '') AS externalPackageName +// Group by artifact and external package +RETURN artifactName + ,artifactPackages + ,artifactTypes + ,artifactExternalPackages + ,artifactExternalCallingPackages + ,artifactExternalCallingPackagesRate + ,externalPackageName + ,count(DISTINCT packageName) AS numberOfPackages + ,count(DISTINCT fullTypeName) AS numberOfTypes + ,100.0 / artifactPackages * count(DISTINCT packageName) AS packagesCallingExternalRate + ,100.0 / artifactTypes * count(DISTINCT fullTypeName) AS typesCallingExternalRate + ,COLLECT(DISTINCT packageName) AS nameOfPackages + ,COLLECT(DISTINCT fullTypeName)[0..9] AS someTypeNames +// Order the results by number of packages that use the external package dependency descending +ORDER BY artifactExternalCallingPackagesRate DESC, artifactName ASC, numberOfPackages DESC, externalPackageName ASC \ No newline at end of file diff --git a/cypher/External_Dependencies/External_package_usage_per_artifact_distribution.cypher 
b/cypher/External_Dependencies/External_package_usage_per_artifact_distribution.cypher new file mode 100644 index 000000000..a708fdbb3 --- /dev/null +++ b/cypher/External_Dependencies/External_package_usage_per_artifact_distribution.cypher @@ -0,0 +1,39 @@ +// External package usage per artifact distribution + + MATCH (artifact:Artifact)-[:CONTAINS]->(package:Package) + MATCH (package)-[:CONTAINS]->(type:Type) + WITH replace(last(split(artifact.fileName, '/')), '.jar', '') AS artifactName + ,count(DISTINCT package.fqn) AS artifactPackages + ,count(DISTINCT type.fqn) AS artifactTypes + ,collect(type) AS typeList +UNWIND typeList AS type + MATCH (type)-[:DEPENDS_ON]->(externalType:ExternalType) + MATCH (typePackage:Package)-[:CONTAINS]->(type) + WHERE NOT externalType:ExternalAnnotation + WITH artifactName + ,artifactPackages + ,artifactTypes + ,typePackage.fqn AS packageName + ,type.fqn AS fullTypeName + ,replace(externalType.fqn, '.' + externalType.name, '') AS externalPackageName + WITH artifactName + ,artifactPackages + ,artifactTypes + ,count(DISTINCT externalPackageName) AS numberOfExternalPackages + ,COLLECT(DISTINCT externalPackageName) AS nameOfExternalPackages + ,count(DISTINCT packageName) AS numberOfPackages + ,COLLECT(DISTINCT packageName) AS nameOfPackages + ,count(DISTINCT fullTypeName) AS numberOfTypes + ,COLLECT(DISTINCT fullTypeName) AS nameOfTypes +RETURN artifactName + ,artifactPackages + ,artifactTypes + ,numberOfExternalPackages + ,numberOfPackages + ,numberOfTypes + ,100.0 / artifactTypes * numberOfTypes AS typesCallingExternalRate + ,100.0 / artifactPackages * numberOfPackages AS packagesCallingExternalRate + ,nameOfExternalPackages[0..9] AS someExternalPackageNames + ,nameOfPackages[0..9] AS someExternalCallingPackageNames + ,nameOfTypes[0..9] AS someExternalCallingTypeNames +ORDER BY numberOfPackages DESC, artifactName ASC \ No newline at end of file diff --git 
a/cypher/External_Dependencies/External_package_usage_per_artifact_package_aggregated.cypher b/cypher/External_Dependencies/External_package_usage_per_artifact_package_aggregated.cypher new file mode 100644 index 000000000..0101f1aa3 --- /dev/null +++ b/cypher/External_Dependencies/External_package_usage_per_artifact_package_aggregated.cypher @@ -0,0 +1,81 @@ +// External package usage per artifact package aggregated + +// Get the overall artifact statistics first + MATCH (artifact:Artifact)-[:CONTAINS]->(package:Package) + MATCH (package)-[:CONTAINS]->(type:Type) + WHERE NOT type:ExternalType + WITH replace(last(split(artifact.fileName, '/')), '.jar', '') AS artifactName + ,artifact.leidenCommunityId AS leidenCommunityId + ,count(DISTINCT package.fqn) AS artifactPackages + ,count(DISTINCT type.fqn) AS artifactTypes + ,collect(type) AS typeList +// Get the external dependencies for each internal type +UNWIND typeList AS type + MATCH (type)-[:DEPENDS_ON]->(externalType:ExternalType) + MATCH (typePackage:Package)-[:CONTAINS]->(type) +// Filter out dependencies to exxternal annotations + WHERE NOT externalType:ExternalAnnotation + WITH artifactName + ,leidenCommunityId + ,artifactPackages + ,artifactTypes + ,typePackage.fqn AS packageName + ,type.fqn AS fullTypeName + ,replace(externalType.fqn, '.' 
+ externalType.name, '') AS externalPackageName +// Group by artifact and external package + WITH artifactName + ,leidenCommunityId + ,artifactPackages + ,artifactTypes + ,externalPackageName + ,count(DISTINCT packageName) AS numberOfPackages + ,COLLECT(DISTINCT packageName) AS nameOfPackages + ,count(DISTINCT fullTypeName) AS numberOfTypes + ,COLLECT(DISTINCT fullTypeName) AS nameOfTypes + ,100.0 / artifactPackages * count(DISTINCT packageName) AS packagesCallingExternalRate + ,100.0 / artifactTypes * count(DISTINCT fullTypeName) AS typesCallingExternalRate +// Pre order the results by number of packages that use the external package dependency descending +ORDER BY numberOfPackages DESC, artifactName ASC +// Optionally filter out external package dependencies that are only used by one package +// WHERE numberOfPackages > 1 +// Group by artifact, aggregate statistics and return the results +RETURN artifactName + ,leidenCommunityId + ,artifactPackages + ,artifactTypes + ,count(DISTINCT externalPackageName) AS numberOfExternalPackages + + // Statistics about the packages and their external package usage count + ,min(numberOfPackages) AS minNumberOfPackages + ,max(numberOfPackages) AS maxNumberOfPackages + ,percentileCont(numberOfPackages, 0.5) AS medNumberOfPackages + ,avg(numberOfPackages) AS avgNumberOfPackages + ,stDev(numberOfPackages) AS stdNumberOfPackages + + // Statistics about the packages and their external package usage percentage + ,min(packagesCallingExternalRate) AS minNumberOfPackagesPercentage + ,max(packagesCallingExternalRate) AS maxNumberOfPackagesPercentage + ,percentileCont(packagesCallingExternalRate, 0.5) AS medNumberOfPackagesPercentage + ,avg(packagesCallingExternalRate) AS avgNumberOfPackagesPercentage + ,stDev(packagesCallingExternalRate) AS stdNumberOfPackagesPercentage + + // Statistics about the types and their external package usage count + ,min(numberOfTypes) AS minNumberOfTypes + ,max(numberOfTypes) AS maxNumberOfTypes + 
,percentileCont(numberOfTypes, 0.5) AS medNumberOfTypes + ,avg(numberOfTypes) AS avgNumberOfTypes + ,stDev(numberOfTypes) AS stdNumberOfTypes + + // Statistics about the types and their external package usage count percentage + ,min(typesCallingExternalRate) AS minNumberOfTypesPercentage + ,max(typesCallingExternalRate) AS maxNumberOfTypesPercentage + ,percentileCont(typesCallingExternalRate, 0.5) AS medNumberOfTypesPercentage + ,avg(typesCallingExternalRate) AS avgNumberOfTypesPercentage + ,stDev(typesCallingExternalRate) AS stdNumberOfTypesPercentage + + // Examples of external packages, caller packages and caller types + ,collect(externalPackageName)[0..9] AS top10ExternalPackageNamesByUsageDescending + ,COLLECT(nameOfPackages)[0][0..9] AS somePackageNames + ,COLLECT(nameOfTypes)[0][0..9] AS someTypeNames + +ORDER BY maxNumberOfPackages DESC, artifactName ASC \ No newline at end of file diff --git a/cypher/External_Dependencies/External_package_usage_per_artifact_sorted.cypher b/cypher/External_Dependencies/External_package_usage_per_artifact_sorted.cypher new file mode 100644 index 000000000..5be6fa16f --- /dev/null +++ b/cypher/External_Dependencies/External_package_usage_per_artifact_sorted.cypher @@ -0,0 +1,40 @@ +// External package usage per artifact sorted by external usage descending + + MATCH (artifact:Artifact:Archive)-[:CONTAINS]->(type:Type) + OPTIONAL MATCH (type)-[:DEPENDS_ON]->(externalType:ExternalType) + WITH replace(last(split(artifact.fileName, '/')), '.jar', '') AS artifactName + ,count(DISTINCT type.fqn) AS numberOfTypesInArtifact + ,count(DISTINCT externalType.fqn) AS numberOfExternalTypesInArtifact + ,count(DISTINCT replace(externalType.fqn, '.' 
+ externalType.name, '')) AS numberOfExternalPackagesInArtifact + ,collect(DISTINCT type) AS typeList +UNWIND typeList AS type + MATCH (type)-[externalDependency:DEPENDS_ON]->(externalType:ExternalType) + WITH numberOfTypesInArtifact + ,numberOfExternalTypesInArtifact + ,numberOfExternalPackagesInArtifact + ,100.0 / numberOfTypesInArtifact * numberOfExternalTypesInArtifact AS externalTypeRate + ,externalDependency + ,artifactName + ,type.fqn AS fullTypeName + ,type.name AS typeName + ,replace(externalType.fqn, '.' + externalType.name, '') AS externalPackageName + ,externalType.name AS externalTypeName + WITH numberOfTypesInArtifact + ,numberOfExternalTypesInArtifact + ,numberOfExternalPackagesInArtifact + ,externalTypeRate + ,artifactName + ,externalPackageName + ,count(externalDependency) AS numberOfExternalTypeCaller + ,sum(externalDependency.weight) AS numberOfExternalTypeCalls + ,collect(DISTINCT externalTypeName) AS externalTypeNames +RETURN artifactName + ,externalPackageName + ,numberOfExternalTypeCaller + ,numberOfExternalTypeCalls + ,numberOfTypesInArtifact + ,numberOfExternalTypesInArtifact + ,numberOfExternalPackagesInArtifact + ,externalTypeRate + ,externalTypeNames +ORDER BY externalTypeRate DESC, artifactName ASC, numberOfExternalTypeCaller DESC, externalPackageName ASC \ No newline at end of file diff --git a/cypher/External_Dependencies/External_package_usage_per_artifact_sorted_top.cypher b/cypher/External_Dependencies/External_package_usage_per_artifact_sorted_top.cypher new file mode 100644 index 000000000..178b24e2c --- /dev/null +++ b/cypher/External_Dependencies/External_package_usage_per_artifact_sorted_top.cypher @@ -0,0 +1,52 @@ +// External package usage per artifact top externals + + MATCH (artifact:Artifact:Archive)-[:CONTAINS]->(type:Type) + OPTIONAL MATCH (type)-[:DEPENDS_ON]->(externalType:ExternalType) + WITH replace(last(split(artifact.fileName, '/')), '.jar', '') AS artifactName + ,count(DISTINCT type.fqn) AS 
numberOfTypesInArtifact + ,count(DISTINCT externalType.fqn) AS numberOfExternalTypesInArtifact + ,count(DISTINCT replace(externalType.fqn, '.' + externalType.name, '')) AS numberOfExternalPackagesInArtifact + ,collect(DISTINCT type) AS typeList +UNWIND typeList AS type + MATCH (type)-[externalDependency:DEPENDS_ON]->(externalType:ExternalType) + WITH numberOfTypesInArtifact + ,numberOfExternalTypesInArtifact + ,numberOfExternalPackagesInArtifact + ,100.0 / numberOfTypesInArtifact * numberOfExternalTypesInArtifact AS externalTypeRate + ,externalDependency + ,artifactName + ,type.fqn AS fullTypeName + ,type.name AS typeName + ,replace(externalType.fqn, '.' + externalType.name, '') AS externalPackageName + ,externalType.name AS externalTypeName + ORDER BY externalTypeRate DESC, artifactName ASC + WITH numberOfTypesInArtifact + ,numberOfExternalTypesInArtifact + ,numberOfExternalPackagesInArtifact + ,externalTypeRate + ,artifactName + ,externalPackageName + ,count(externalDependency) AS numberOfExternalTypeCaller + ,sum(externalDependency.weight) AS numberOfExternalTypeCalls + ,collect(DISTINCT externalTypeName) AS externalTypeNames + ORDER BY externalTypeRate DESC, artifactName ASC, numberOfExternalTypeCaller DESC + WITH numberOfTypesInArtifact + ,numberOfExternalTypesInArtifact + ,numberOfExternalPackagesInArtifact + ,externalTypeRate + ,artifactName + ,COLLECT(DISTINCT externalPackageName) AS externalPackageNames + ,SUM(numberOfExternalTypeCaller) AS numberOfExternalTypeCaller + ,sum(numberOfExternalTypeCalls) AS numberOfExternalTypeCalls + ,collect(externalTypeNames) AS externalTypeNames +RETURN artifactName + ,numberOfTypesInArtifact + ,numberOfExternalTypesInArtifact + ,numberOfExternalPackagesInArtifact + ,externalTypeRate + ,numberOfExternalTypeCaller + ,numberOfExternalTypeCalls + ,size(externalPackageNames) AS numberOfExternalPackages + ,externalPackageNames[0..4] AS top5ExternalPackages + ,externalTypeNames[0..1] AS someExternalTypes +LIMIT 40 \ No newline 
at end of file diff --git a/cypher/External_Dependencies/External_package_usage_per_internal_package_count.cypher b/cypher/External_Dependencies/External_package_usage_per_internal_package_count.cypher new file mode 100644 index 000000000..4f35261d4 --- /dev/null +++ b/cypher/External_Dependencies/External_package_usage_per_internal_package_count.cypher @@ -0,0 +1,45 @@ +// External package usage per internal package count + +// Get the overall artifact statistics first + MATCH (artifact:Artifact)-[:CONTAINS]->(package:Package) + MATCH (package)-[:CONTAINS]->(type:Type) + WHERE NOT type:ExternalType + WITH replace(last(split(artifact.fileName, '/')), '.jar', '') AS artifactName + ,count(DISTINCT package.fqn) AS artifactPackages + ,count(DISTINCT type.fqn) AS artifactTypes + ,collect(type) AS typeList +// Get the external dependencies for each internal type +UNWIND typeList AS type + MATCH (type)-[:DEPENDS_ON]->(externalType:ExternalType) + MATCH (typePackage:Package)-[:CONTAINS]->(type) +// Optionally filter out dependencies to external annotations +// WHERE NOT externalType:ExternalAnnotation + WITH artifactName + ,artifactPackages + ,artifactTypes + ,typePackage.fqn AS packageName + ,type.fqn AS fullTypeName + ,replace(externalType.fqn, '.' 
+ externalType.name, '') AS externalPackageName +// Group by artifact and external package +WITH artifactName + ,artifactPackages + ,artifactTypes + ,externalPackageName + ,count(DISTINCT packageName) AS numberOfPackages + ,count(DISTINCT fullTypeName) AS numberOfTypes + ,100.0 / artifactPackages * count(DISTINCT packageName) AS packagesCallingExternalRate + ,100.0 / artifactTypes * count(DISTINCT fullTypeName) AS typesCallingExternalRate + ,COLLECT(DISTINCT packageName) AS nameOfPackages + ,COLLECT(DISTINCT fullTypeName)[0..9] AS someTypeNames +RETURN artifactName + ,artifactPackages + ,artifactTypes + ,numberOfPackages + ,count(DISTINCT externalPackageName) AS numberOfExternalPackages + ,collect(DISTINCT externalPackageName) AS externalPackageNames + ,max(packagesCallingExternalRate) AS maxPackagesCallingExternalRate + ,max(typesCallingExternalRate) AS maxTypesCallingExternalRate + ,COLLECT(nameOfPackages)[0][0..9] AS somePackageNames + ,COLLECT(someTypeNames)[0] AS someTypeNames +// Order the results by number of packages that use the external package dependency descending +ORDER BY numberOfPackages DESC, artifactName ASC \ No newline at end of file diff --git a/cypher/External_Dependencies/External_package_usage_per_type_distribution.cypher b/cypher/External_Dependencies/External_package_usage_per_type_distribution.cypher deleted file mode 100644 index 7b7445d24..000000000 --- a/cypher/External_Dependencies/External_package_usage_per_type_distribution.cypher +++ /dev/null @@ -1,29 +0,0 @@ -// External package usage per type distribution - - MATCH (artifact:Artifact)-[:CONTAINS]->(type:Type) - WITH replace(last(split(artifact.fileName, '/')), '.jar', '') AS artifactName - ,count(type) AS artifactTypes - ,collect(type) AS typeList -UNWIND typeList AS type - MATCH (type)-[:DEPENDS_ON]->(externalType:ExternalType) - WHERE externalType:ExternalAnnotation - WITH artifactName - ,artifactTypes - ,type.fqn AS fullTypeName - ,replace(externalType.fqn, '.' 
+ externalType.name, '') AS externalPackageName - WITH artifactName - ,artifactTypes - ,fullTypeName - ,count(DISTINCT externalPackageName) AS numberOfExternalPackages - WITH artifactName - ,artifactTypes - ,numberOfExternalPackages - ,count(DISTINCT fullTypeName) AS numberOfTypes - ,COLLECT(DISTINCT fullTypeName) AS nameOfTypes -RETURN artifactName - ,artifactTypes - ,numberOfExternalPackages - ,numberOfTypes - ,100.0 / artifactTypes * numberOfTypes AS numberOfTypesPercentage - ,nameOfTypes -ORDER BY artifactName ASC, numberOfExternalPackages ASC \ No newline at end of file diff --git a/cypher/External_Dependencies/External_package_usage_spread.cypher b/cypher/External_Dependencies/External_package_usage_spread.cypher new file mode 100644 index 000000000..2f9238ec6 --- /dev/null +++ b/cypher/External_Dependencies/External_package_usage_spread.cypher @@ -0,0 +1,65 @@ +// External package usage spread + +// Get the overall artifact statistics first + MATCH (artifact:Artifact)-[:CONTAINS]->(package:Package) + MATCH (package)-[:CONTAINS]->(type:Type) + WHERE NOT type:ExternalType + WITH replace(last(split(artifact.fileName, '/')), '.jar', '') AS artifactName + ,count(DISTINCT package.fqn) AS artifactPackages + ,count(DISTINCT type.fqn) AS artifactTypes + ,collect(type) AS typeList +// Get the external dependencies for each internal type +UNWIND typeList AS type + MATCH (type)-[:DEPENDS_ON]->(externalType:ExternalType) + MATCH (typePackage:Package)-[:CONTAINS]->(type) +// Filter out dependencies to external annotations + WHERE NOT externalType:ExternalAnnotation + WITH artifactName + ,artifactPackages + ,artifactTypes + ,replace(externalType.fqn, '.' 
+ externalType.name, '') AS externalPackageName + // Gathering counts and numbers for every artifact and the external packages it uses + ,count(DISTINCT typePackage.fqn) AS numberOfPackages + ,COLLECT(DISTINCT typePackage.fqn) AS nameOfPackages + ,count(DISTINCT type.fqn) AS numberOfTypes + ,COLLECT(DISTINCT type.fqn )[0..9] AS someTypeNames + ,100.0 / artifactPackages * count(DISTINCT typePackage.fqn) AS packagesCallingExternalRate + ,100.0 / artifactTypes * count(DISTINCT type.fqn) AS typesCallingExternalRate +// Group external package +RETURN externalPackageName + ,count(DISTINCT artifactName) AS numberOfArtifacts + + // Statistics about the packages and their external package usage count + ,sum(numberOfPackages) AS sumNumberOfPackages + ,min(numberOfPackages) AS minNumberOfPackages + ,max(numberOfPackages) AS maxNumberOfPackages + ,percentileCont(numberOfPackages, 0.5) AS medNumberOfPackages + ,avg(numberOfPackages) AS avgNumberOfPackages + ,stDev(numberOfPackages) AS stdNumberOfPackages + + // Statistics about the packages and their external package usage percentage + ,min(packagesCallingExternalRate) AS minNumberOfPackagesPercentage + ,max(packagesCallingExternalRate) AS maxNumberOfPackagesPercentage + ,percentileCont(packagesCallingExternalRate, 0.5) AS medNumberOfPackagesPercentage + ,avg(packagesCallingExternalRate) AS avgNumberOfPackagesPercentage + ,stDev(packagesCallingExternalRate) AS stdNumberOfPackagesPercentage + + // Statistics about the types and their external package usage count + ,sum(numberOfTypes) AS sumNumberOfTypes + ,min(numberOfTypes) AS minNumberOfTypes + ,max(numberOfTypes) AS maxNumberOfTypes + ,percentileCont(numberOfTypes, 0.5) AS medNumberOfTypes + ,avg(numberOfTypes) AS avgNumberOfTypes + ,stDev(numberOfTypes) AS stdNumberOfTypes + + // Statistics about the types and their external package usage count percentage + ,min(typesCallingExternalRate) AS minNumberOfTypesPercentage + ,max(typesCallingExternalRate) AS 
maxNumberOfTypesPercentage + ,percentileCont(typesCallingExternalRate, 0.5) AS medNumberOfTypesPercentage + ,avg(typesCallingExternalRate) AS avgNumberOfTypesPercentage + ,stDev(typesCallingExternalRate) AS stdNumberOfTypesPercentage + + ,collect(DISTINCT artifactName)[0..4] AS someArtifactNames + +// Order the results by number of artifacts that use the external package dependency descending +ORDER BY numberOfArtifacts DESC, externalPackageName ASC \ No newline at end of file diff --git a/cypher/External_Dependencies/External_second_level_package_usage_overall.cypher b/cypher/External_Dependencies/External_second_level_package_usage_overall.cypher new file mode 100644 index 000000000..8f0c27314 --- /dev/null +++ b/cypher/External_Dependencies/External_second_level_package_usage_overall.cypher @@ -0,0 +1,27 @@ +// External second level package usage overall + + MATCH (package:Package)-[:CONTAINS]->(type:Type) + WITH count(DISTINCT type.fqn) AS allTypes + ,count(DISTINCT package.fqn) AS allPackages + ,collect(type) as typeList +UNWIND typeList AS type + MATCH (type)-[externalDependency:DEPENDS_ON]->(externalType:ExternalType) + MATCH (typePackage:Package)-[:CONTAINS]->(type) + WITH allTypes + ,allPackages + ,apoc.text.join(split(externalType.fqn,'.')[0..2], '.') AS externalSecondLevelPackageName + ,count(DISTINCT typePackage.fqn) AS numberOfExternalCallerPackages + ,count(DISTINCT type.fqn) AS numberOfExternalCallerTypes + ,count(externalDependency) AS numberOfExternalTypeCalls + ,sum(externalDependency.weight) AS numberOfExternalTypeCallsWeighted + ,collect(DISTINCT externalType.name) AS externalTypeNames +where numberOfExternalTypeCalls <> numberOfExternalCallerTypes +RETURN externalSecondLevelPackageName + ,numberOfExternalCallerPackages + ,numberOfExternalCallerTypes + ,numberOfExternalTypeCalls + ,numberOfExternalTypeCallsWeighted + ,allPackages + ,allTypes + ,externalTypeNames[0..9] AS tenExternalTypeNames + ORDER BY numberOfExternalCallerPackages DESC, 
externalSecondLevelPackageName ASC \ No newline at end of file diff --git a/cypher/External_Dependencies/External_second_level_package_usage_per_artifact_and_external_package.cypher b/cypher/External_Dependencies/External_second_level_package_usage_per_artifact_and_external_package.cypher new file mode 100644 index 000000000..0db7da4ee --- /dev/null +++ b/cypher/External_Dependencies/External_second_level_package_usage_per_artifact_and_external_package.cypher @@ -0,0 +1,50 @@ +// External second level package usage per artifact and external package + +// Get the overall artifact statistics first + MATCH (artifact:Artifact)-[:CONTAINS]->(package:Package) + MATCH (package)-[:CONTAINS]->(type:Type) + OPTIONAL MATCH (packageUsingExternal:Package)-[:CONTAINS]->(type)-[:DEPENDS_ON]->(external:ExternalType) + WITH replace(last(split(artifact.fileName, '/')), '.jar', '') AS artifactName + ,count(DISTINCT package.fqn) AS artifactPackages + ,count(DISTINCT type.fqn) AS artifactTypes + ,count(DISTINCT split(external.fqn,'.')[0..2]) AS artifactExternalPackagesFirst2Levels + ,count(DISTINCT packageUsingExternal.fqn) AS artifactExternalCallingPackages + ,collect(type) AS typeList + WITH artifactName + ,artifactPackages + ,artifactTypes + ,artifactExternalPackagesFirst2Levels + ,artifactExternalCallingPackages + ,round((100.0 / artifactPackages * artifactExternalCallingPackages), 2) AS artifactExternalCallingPackagesRate + ,typeList +// Get the external dependencies for each internal type +UNWIND typeList AS type + MATCH (type)-[:DEPENDS_ON]->(externalType:ExternalType) + MATCH (typePackage:Package)-[:CONTAINS]->(type) +// Optionally filter out dependencies to external annotations +// WHERE NOT externalType:ExternalAnnotation + WITH artifactName + ,artifactPackages + ,artifactTypes + ,artifactExternalPackagesFirst2Levels + ,artifactExternalCallingPackages + ,artifactExternalCallingPackagesRate + ,typePackage.fqn AS packageName + ,type.fqn AS fullTypeName + 
,apoc.text.join(split(externalType.fqn,'.')[0..2], '.') AS externalPackageNameFirst2Levels +// Group by artifact and first two external package levels +RETURN artifactName + ,artifactPackages + ,artifactTypes + ,artifactExternalPackagesFirst2Levels + ,artifactExternalCallingPackages + ,artifactExternalCallingPackagesRate + ,externalPackageNameFirst2Levels + ,count(DISTINCT packageName) AS numberOfPackages + ,count(DISTINCT fullTypeName) AS numberOfTypes + ,100.0 / artifactPackages * count(DISTINCT packageName) AS packagesCallingExternalRate + ,100.0 / artifactTypes * count(DISTINCT fullTypeName) AS typesCallingExternalRate + ,COLLECT(DISTINCT packageName) AS nameOfPackages + ,COLLECT(DISTINCT fullTypeName)[0..9] AS someTypeNames +// Order the results by number of packages that use the external package dependency descending +ORDER BY artifactExternalCallingPackagesRate DESC, artifactName ASC, numberOfPackages DESC, externalPackageNameFirst2Levels ASC \ No newline at end of file diff --git a/cypher/External_Dependencies/External_second_level_package_usage_spread.cypher b/cypher/External_Dependencies/External_second_level_package_usage_spread.cypher new file mode 100644 index 000000000..e074d2d90 --- /dev/null +++ b/cypher/External_Dependencies/External_second_level_package_usage_spread.cypher @@ -0,0 +1,65 @@ +// External second level package usage spread + +// Get the overall artifact statistics first + MATCH (artifact:Artifact)-[:CONTAINS]->(package:Package) + MATCH (package)-[:CONTAINS]->(type:Type) + WHERE NOT type:ExternalType + WITH replace(last(split(artifact.fileName, '/')), '.jar', '') AS artifactName + ,count(DISTINCT package.fqn) AS artifactPackages + ,count(DISTINCT type.fqn) AS artifactTypes + ,collect(type) AS typeList +// Get the external dependencies for each internal type +UNWIND typeList AS type + MATCH (type)-[:DEPENDS_ON]->(externalType:ExternalType) + MATCH (typePackage:Package)-[:CONTAINS]->(type) +// Filter out dependencies to external 
annotations + WHERE NOT externalType:ExternalAnnotation + WITH artifactName + ,artifactPackages + ,artifactTypes + ,apoc.text.join(split(externalType.fqn,'.')[0..2], '.') AS externalSecondLevelPackageName + // Gathering counts and numbers for every artifact and the external packages it uses + ,count(DISTINCT typePackage.fqn) AS numberOfPackages + ,COLLECT(DISTINCT typePackage.fqn) AS nameOfPackages + ,count(DISTINCT type.fqn) AS numberOfTypes + ,COLLECT(DISTINCT type.fqn )[0..9] AS someTypeNames + ,100.0 / artifactPackages * count(DISTINCT typePackage.fqn) AS packagesCallingExternalRate + ,100.0 / artifactTypes * count(DISTINCT type.fqn) AS typesCallingExternalRate +// Group by second level external package +RETURN externalSecondLevelPackageName + ,count(DISTINCT artifactName) AS numberOfArtifacts + + // Statistics about the packages and their external package usage count + ,sum(numberOfPackages) AS sumNumberOfPackages + ,min(numberOfPackages) AS minNumberOfPackages + ,max(numberOfPackages) AS maxNumberOfPackages + ,percentileCont(numberOfPackages, 0.5) AS medNumberOfPackages + ,avg(numberOfPackages) AS avgNumberOfPackages + ,stDev(numberOfPackages) AS stdNumberOfPackages + + // Statistics about the packages and their external package usage percentage + ,min(packagesCallingExternalRate) AS minNumberOfPackagesPercentage + ,max(packagesCallingExternalRate) AS maxNumberOfPackagesPercentage + ,percentileCont(packagesCallingExternalRate, 0.5) AS medNumberOfPackagesPercentage + ,avg(packagesCallingExternalRate) AS avgNumberOfPackagesPercentage + ,stDev(packagesCallingExternalRate) AS stdNumberOfPackagesPercentage + + // Statistics about the types and their external package usage count + ,sum(numberOfTypes) AS sumNumberOfTypes + ,min(numberOfTypes) AS minNumberOfTypes + ,max(numberOfTypes) AS maxNumberOfTypes + ,percentileCont(numberOfTypes, 0.5) AS medNumberOfTypes + ,avg(numberOfTypes) AS avgNumberOfTypes + ,stDev(numberOfTypes) AS stdNumberOfTypes + + // Statistics 
about the types and their external package usage count percentage + ,min(typesCallingExternalRate) AS minNumberOfTypesPercentage + ,max(typesCallingExternalRate) AS maxNumberOfTypesPercentage + ,percentileCont(typesCallingExternalRate, 0.5) AS medNumberOfTypesPercentage + ,avg(typesCallingExternalRate) AS avgNumberOfTypesPercentage + ,stDev(typesCallingExternalRate) AS stdNumberOfTypesPercentage + + ,collect(DISTINCT artifactName)[0..4] AS someArtifactNames + +// Order the results by number of artifacts that use the external package dependency descending +ORDER BY numberOfArtifacts DESC, externalSecondLevelPackageName ASC \ No newline at end of file diff --git a/jupyter/ExternalDependencies.ipynb b/jupyter/ExternalDependencies.ipynb index 866743dec..564feb7ab 100644 --- a/jupyter/ExternalDependencies.ipynb +++ b/jupyter/ExternalDependencies.ipynb @@ -11,7 +11,7 @@ "\n", "### References\n", "- [jqassistant](https://jqassistant.org)\n", - "- [py2neo](https://py2neo.org/2021.1/)" + "- [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver/current)" ] }, { @@ -139,117 +139,883 @@ "This table shows the external packages that are used by the most different internal types overall.\n", "Additionally, it shows which types of the external package are actually used. External annotations are also listed.\n", "\n", + "Only the top 20 entries are shown. 
The whole table can be found in the following CSV report:\n", + "`External_package_usage_overall`\n", + "\n", + "**Columns:**\n", + "- *externalPackageName* identifies the external package as described above\n", + "- *numberOfExternalCallerPackages* refers to the distinct packages that make use of the external package\n", + "- *numberOfExternalCallerTypes* refers to the distinct types that make use of the external package\n", + "- *numberOfExternalTypeCalls* includes every dependency to the types in the external package\n", + "- *numberOfExternalTypeCallsWeighted* includes every invocation or reference (sum of weights) to the types in the external package\n", + "- *allPackages* contains the total count of all analyzed packages in general\n", + "- *allTypes* contains the total count of all analyzed types in general\n", + "- *externalTypeNames* contains a list of actually utilized types of the external package" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff524ac7", + "metadata": {}, + "outputs": [], + "source": [ + "external_package_usage=query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_package_usage_overall.cypher\")\n", + "\n", + "# Select columns and only show the first 20 entries (head)\n", + "external_package_usage.head(20)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "1143afcb", + "metadata": {}, + "source": [ + "#### Table 1 Chart 1 - Most called external packages in % by types\n", + "\n", + "External packages that are used less than 0.7% are grouped into the name \"others\" to get a cleaner chart\n", + "with the most significant external packages and how often they are called in percent." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44a11aec", + "metadata": {}, + "outputs": [], + "source": [ + "def group_to_others_below_threshold(data_frame : pd.DataFrame, value_column : str, name_column: str, threshold: float) -> pd.DataFrame: \n", + " \"\"\"Adds a new percentage column for the value column and \n", + " groups all values below the given threshold to \"others\" in the name column.\n", + "\n", + " Parameters:\n", + " - data_frame (pd.DataFrame): Input pandas DataFrame\n", + " - value_column (str): Name of the column that contains the numeric value\n", + " - name_column (str): Name of the column that contains the group name that will be replaced by \"others\" for small values\n", + " - threshold (float): Threshold in % that is used to group values below it into the \"others\" group\n", + "\n", + " Returns:\n", + " pd.Series: Summed percentage per name (including \"others\"), sorted descending\n", + "\n", + " \"\"\"\n", + " result_data_frame = data_frame.copy();\n", + "\n", + " percent_column_name = value_column + 'Percent';\n", + "\n", + " # Add column with the name given in \"percent_column_name\" with the percentage of the value column.\n", + " result_data_frame[percent_column_name] = result_data_frame[value_column] / result_data_frame[value_column].sum() * 100.0;\n", + "\n", + " # Change the external package name to \"others\" if it is called less than the specified threshold\n", + " result_data_frame.loc[result_data_frame[percent_column_name] < threshold, name_column] = 'others';\n", + "\n", + " # Group external package name (foremost the new \"others\" entries) and sum their percentage\n", + " result_data_frame = result_data_frame.groupby(name_column)[percent_column_name].sum();\n", + "\n", + " # Sort by values descending\n", + " return result_data_frame.sort_values(ascending=False);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99ef3fad", + "metadata": {}, + "outputs": [], + "source": [ + 
"external_package_by_type_usage_significant = 
group_to_others_below_threshold(\n", + " data_frame=external_package_usage,\n", + " value_column='numberOfExternalCallerTypes',\n", + " name_column='externalPackageName',\n", + " threshold= 0.7\n", + ");" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "688b6d56", + "metadata": {}, + "outputs": [], + "source": [ + "plot.figure();\n", + "\n", + "axis = external_package_by_type_usage_significant.plot(\n", + " kind='pie',\n", + " title='Top external package usage [%] by type',\n", + " legend=True,\n", + " labeldistance=None,\n", + " autopct='%1.2f%%',\n", + " textprops={'fontsize': 5},\n", + " pctdistance=1.2,\n", + " cmap=main_color_map\n", + ")\n", + "axis.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", + "plot.show()" + ] + }, + { + "cell_type": "markdown", + "id": "84c123dc", + "metadata": {}, + "source": [ + "#### Table 1 Chart 2 - Most called external packages in % by packages\n", + "\n", + "External packages that are used less than 0.7% are grouped into the name \"others\" to get a cleaner chart\n", + "with the most significant external packages and how often they are called in percent." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c10499a5", + "metadata": {}, + "outputs": [], + "source": [ + "external_package_by_package_usage_significant = group_to_others_below_threshold(\n", + " data_frame=external_package_usage,\n", + " value_column='numberOfExternalCallerPackages',\n", + " name_column='externalPackageName',\n", + " threshold= 0.7\n", + ");" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c165f403", + "metadata": {}, + "outputs": [], + "source": [ + "plot.figure();\n", + "\n", + "axis = external_package_by_package_usage_significant.plot(\n", + " kind='pie',\n", + " title='Top external package usage [%] by package',\n", + " legend=True,\n", + " labeldistance=None,\n", + " autopct='%1.2f%%',\n", + " textprops={'fontsize': 5},\n", + " pctdistance=1.2,\n", + " cmap=main_color_map\n", + ")\n", + "axis.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", + "plot.show()" + ] + }, + { + "cell_type": "markdown", + "id": "cb92e603", + "metadata": {}, + "source": [ + "### Table 2 - Top 20 most used external packages grouped by their first 2 layers\n", + "\n", + "This table shows external packages grouped by their first 2 layers that are used by the most different internal types overall including external annotations. For example, \"javax.xml.stream\" and \"javax.xml.parsers\" are grouped together to \"javax.xml\".\n", + "\n", + "Additionally, it shows which types of the external packages are actually used.\n", + "\n", + "Only the top 20 entries are shown. 
The whole table can be found in the following CSV report:\n", + "`External_second_level_package_usage_overall`\n", + "\n", + "**Columns:**\n", + "- *externalSecondLevelPackageName* identifies the first 2 levels of the external package as described above\n", + "- *numberOfExternalCallerPackages* refers to the distinct packages that make use of the external package\n", + "- *numberOfExternalCallerTypes* refers to the distinct types that make use of the external package\n", + "- *numberOfExternalTypeCalls* includes every dependency to the types in the external package\n", + "- *numberOfExternalTypeCallsWeighted* includes every invocation or reference (sum of weights) to the types in the external package\n", + "- *allPackages* contains the total count of all analyzed packages in general\n", + "- *allTypes* contains the total count of all analyzed types in general\n", + "- *externalTypeNames* contains a list of actually utilized types of the external package" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4dc5e771", + "metadata": {}, + "outputs": [], + "source": [ + "external_grouped_package_usage=query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_second_level_package_usage_overall.cypher\")\n", + "external_grouped_package_usage.head(20)" + ] + }, + { + "cell_type": "markdown", + "id": "279932a6", + "metadata": {}, + "source": [ + "#### Table 2 Chart 1 - Most called second level external packages in % by type\n", + "\n", + "External package groups that are used less than 0.7% are grouped into the name \"others\" to get a cleaner chart\n", + "with the most significant external packages and how often they are called in percent." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d36043f", + "metadata": {}, + "outputs": [], + "source": [ + "external_grouped_package_by_type_usage_significant = group_to_others_below_threshold(\n", + " data_frame=external_grouped_package_usage,\n", + " value_column='numberOfExternalCallerTypes',\n", + " name_column='externalSecondLevelPackageName',\n", + " threshold= 0.7\n", + ");\n", + "\n", + "plot.figure();\n", + "\n", + "axis = external_grouped_package_by_type_usage_significant.plot(\n", + " kind='pie',\n", + " title='Top external package (grouped by first 2 layers) usage [%] by type',\n", + " legend=True,\n", + " labeldistance=None,\n", + " autopct='%1.2f%%',\n", + " textprops={'fontsize': 5},\n", + " pctdistance=1.2,\n", + " cmap=main_color_map\n", + ")\n", + "axis.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", + "plot.show()" + ] + }, + { + "cell_type": "markdown", + "id": "43c3e1a3", + "metadata": {}, + "source": [ + "#### Table 2 Chart 2 - Most called second level external packages in % by package\n", + "\n", + "External package groups that are used less than 0.7% are grouped into the name \"others\" to get a cleaner chart\n", + "with the most significant external packages and how often they are called in percent." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f05314f", + "metadata": {}, + "outputs": [], + "source": [ + "external_grouped_package_by_package_usage_significant = group_to_others_below_threshold(\n", + " data_frame=external_grouped_package_usage,\n", + " value_column='numberOfExternalCallerPackages',\n", + " name_column='externalSecondLevelPackageName',\n", + " threshold= 0.7\n", + ");\n", + "\n", + "plot.figure();\n", + "axis = external_grouped_package_by_package_usage_significant.plot(\n", + " kind='pie',\n", + " title='Top external package (grouped by first 2 layers) usage [%] by package',\n", + " legend=True,\n", + " labeldistance=None,\n", + " autopct='%1.2f%%',\n", + " textprops={'fontsize': 5},\n", + " pctdistance=1.2,\n", + " cmap=main_color_map\n", + ")\n", + "axis.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", + "plot.show()" + ] + }, + { + "cell_type": "markdown", + "id": "2fb30664", + "metadata": {}, + "source": [ + "### Table 3 - Top 20 most widely spread external packages\n", + "\n", + "The following table shows external packages that are used by many different artifacts with the highest number of artifacts first. External annotations are filtered out to only get those external packages that significantly add to coupling.\n", + "\n", + "Statistics like minimum, maximum, average, median and standard deviation are provided for the number of packages and number of types in every artifact that uses the listed external package. \n", + "\n", + "The intuition behind that is to find external package dependencies that are used in a widely spread manner. This should uncover libraries and frameworks and make it easier to distinguish them from external dependencies that are used for specific tasks. It can also be used to find external dependencies that are used sparsely regarding artifacts but are used in many different packages there. 
This could then be improved by applying a [Hexagonal architecture](https://alistair.cockburn.us/hexagonal-architecture).\n", + "\n", + "Only the top 20 entries are shown. The whole table can be found in the following CSV report:\n", + "`External_package_usage_spread`\n", + "\n", + "**Columns:**\n", + "- *externalPackageName* identifies the external package as defined above. All other columns contain aggregated data for this external package.\n", + "- *numberOfArtifacts* contains the number of artifacts that use the external package\n", + "- *sumNumberOfPackages* contains the sum of all packages that use the external package\n", + "- *min/max/med/avg/stdNumberOfPackages* provide statistics based on the number of packages of each artifact that uses the external package\n", + "- *min/max/med/avg/stdNumberOfPackagesPercentage* provide statistics in percent (%) based on the number of packages of each artifact that uses the external package\n", + "- *min/max/med/avg/stdNumberOfTypes* provide statistics based on the number of types of each artifact that uses the external package\n", + "- *min/max/med/avg/stdNumberOfTypesPercentage* provide statistics in percent (%) based on the number of types of each artifact that uses the external package\n", + "- *someArtifactNames* contain some of the artifacts that contain the external package for reference" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad1a04c4", + "metadata": {}, + "outputs": [], + "source": [ + "# Query the graph database to provide the \n", + "# most widely spread external dependencies for the tables/charts below.\n", + "external_package_usage_spread=query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_package_usage_spread.cypher\")\n", + "external_package_usage_spread.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "6ed201cc", + "metadata": {}, + "source": [ + "### Table 3a - Top 20 most widely spread external packages - number of internal packages\n", +
"\n", + "This table shows the top 20 most widely spread external packages focussing on the spread across the number of internal packages." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c5fa9f9e", + "metadata": {}, + "outputs": [], + "source": [ + "external_package_usage_spread_packages=external_package_usage_spread[['externalPackageName', 'numberOfArtifacts', 'minNumberOfPackages', 'maxNumberOfPackages', 'medNumberOfPackages', 'avgNumberOfPackages', 'stdNumberOfPackages']]\n", + "external_package_usage_spread_packages.head(20)" + ] + }, + { + "cell_type": "markdown", + "id": "27c5b75f", + "metadata": {}, + "source": [ + "### Table 3b - Top 20 most widely spread external packages - percentage of internal packages\n", + "\n", + "This table shows the top 20 most widely spread external packages focussing on the spread across the percentage of internal packages." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c4d4273e", + "metadata": {}, + "outputs": [], + "source": [ + "external_package_usage_spread_packages_percentage=external_package_usage_spread[['externalPackageName', 'numberOfArtifacts', 'minNumberOfPackagesPercentage', 'maxNumberOfPackagesPercentage', 'medNumberOfPackagesPercentage', 'avgNumberOfPackagesPercentage', 'stdNumberOfPackagesPercentage']]\n", + "external_package_usage_spread_packages_percentage.head(20)" + ] + }, + { + "cell_type": "markdown", + "id": "d7638e05", + "metadata": {}, + "source": [ + "### Table 3c - Top 20 most widely spread external packages - number of internal types\n", + "\n", + "This table shows the top 20 most widely spread external packages focussing on the spread across the number of internal types." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f5af910", + "metadata": {}, + "outputs": [], + "source": [ + "external_package_usage_spread_types=external_package_usage_spread[['externalPackageName', 'numberOfArtifacts', 'minNumberOfTypes', 'maxNumberOfTypes', 'medNumberOfTypes', 'avgNumberOfTypes', 'stdNumberOfTypes']]\n", + "external_package_usage_spread_types.head(20)" + ] + }, + { + "cell_type": "markdown", + "id": "07427cd5", + "metadata": {}, + "source": [ + "### Table 3d - Top 20 most widely spread external packages - percentage of internal types\n", + "\n", + "This table shows the top 20 most widely spread external packages focussing on the spread across the percentage of internal types." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50712f90", + "metadata": {}, + "outputs": [], + "source": [ + "external_package_usage_spread_types_percentage=external_package_usage_spread[['externalPackageName', 'numberOfArtifacts', 'minNumberOfTypesPercentage', 'maxNumberOfTypesPercentage', 'medNumberOfTypesPercentage', 'avgNumberOfTypesPercentage', 'stdNumberOfTypesPercentage']]\n", + "external_package_usage_spread_types_percentage.head(20)" + ] + }, + { + "cell_type": "markdown", + "id": "b210eea0", + "metadata": {}, + "source": [ + "#### Table 3 Chart 1 - Most widely spread external packages in % by types\n", + "\n", + "External packages that are used less than 0.5% are grouped into the name \"others\" to get a cleaner chart with the most significant external packages." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f41d04c9", + "metadata": {}, + "outputs": [], + "source": [ + "external_package_type_usage_spread_significant = group_to_others_below_threshold(\n", + " data_frame=external_package_usage_spread,\n", + " value_column='sumNumberOfTypes',\n", + " name_column='externalPackageName',\n", + " threshold= 0.5\n", + ");\n", + "\n", + "plot.figure();\n", + "axis = external_package_type_usage_spread_significant.plot(\n", + " kind='pie',\n", + " title='Top external package usage spread [%] by type',\n", + " legend=True,\n", + " labeldistance=None,\n", + " autopct='%1.2f%%',\n", + " textprops={'fontsize': 5},\n", + " pctdistance=1.2,\n", + " cmap=main_color_map\n", + ")\n", + "axis.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", + "plot.show()" + ] + }, + { + "cell_type": "markdown", + "id": "c48740e3", + "metadata": {}, + "source": [ + "#### Table 3 Chart 2 - Most widely spread external packages in % by packages\n", + "\n", + "External packages that are used less than 0.5% are grouped into the name \"others\" to get a cleaner chart with the most significant external packages." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8bb7b0d9", + "metadata": {}, + "outputs": [], + "source": [ + "external_package_usage_package_spread_significant = group_to_others_below_threshold(\n", + " data_frame=external_package_usage_spread,\n", + " value_column='sumNumberOfPackages',\n", + " name_column='externalPackageName',\n", + " threshold= 0.5\n", + ");\n", + "\n", + "plot.figure();\n", + "axis = external_package_usage_package_spread_significant.plot(\n", + " kind='pie',\n", + " title='Top external package usage spread [%] by package',\n", + " legend=True,\n", + " labeldistance=None,\n", + " autopct='%1.2f%%',\n", + " textprops={'fontsize': 5},\n", + " pctdistance=1.2,\n", + " cmap=main_color_map\n", + ")\n", + "axis.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", + "plot.show()" + ] + }, + { + "cell_type": "markdown", + "id": "2d6762ea", + "metadata": {}, + "source": [ + "### Table 4 - Top 20 most widely spread external packages grouped by their first 2 layers\n", + "\n", + "This table shows external packages grouped by their first 2 layers that are used by many different artifacts with the highest number of artifacts first. External annotations are filtered out to only get those external packages that significantly add to coupling.\n", + "\n", + "Statistics like minimum, maximum, average, median and standard deviation are provided for the number of packages and number of types in every artifact that uses the listed external package. \n", + "\n", + "The intuition behind that is to find external package dependencies that are used in a widely spread manner. This should uncover libraries and frameworks and make it easier to distinguish them from external dependencies that are used for specific tasks. It can also be used to find external dependencies that are used sparsely regarding artifacts but are used in many different packages there. 
This could then be improved by applying a [Hexagonal architecture](https://alistair.cockburn.us/hexagonal-architecture).\n", + "\n", + "Only the top 20 entries are shown. The whole table can be found in the following CSV report:\n", + "`External_package_usage_spread`\n", + "\n", + "**Columns:**\n", + "- *externalPackageName* identifies the external package as defined above. All other columns contain aggregated data for this external package.\n", + "- *numberOfArtifacts* contains the number of artifacts that use the external package\n", + "- *sumNumberOfPackages* contains the sum of all packages that use the external package\n", + "- *min/max/med/avg/stdNumberOfPackages* provide statistics based on the number of packages of each artifact that uses the external package\n", + "- *min/max/med/avg/stdNumberOfPackagesPercentage* provide statistics in percent (%) based on the number of packages of each artifact that uses the external package\n", + "- *min/max/med/avg/stdNumberOfTypes* provide statistics based on the number of types of each artifact that uses the external package\n", + "- *min/max/med/avg/stdNumberOfTypesPercentage* provide statistics in percent (%) based on the number of types of each artifact that uses the external package\n", + "- *someArtifactNames* contain some of the artifacts that contain the external package for reference" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1f2624e", + "metadata": {}, + "outputs": [], + "source": [ + "external_grouped_package_usage_spread=query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_second_level_package_usage_spread.cypher\")\n", + "external_grouped_package_usage_spread.head(20)" + ] + }, + { + "cell_type": "markdown", + "id": "e6f098e6", + "metadata": {}, + "source": [ + "#### Table 4 Chart 1 - Most widely spread second level external packages in % by type\n", + "\n", + "External package groups that are used less than 0.5% are grouped into the name \"others\" to get
a cleaner chart\n", + "with the most significant external packages and how often they are called in percent." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a95e7be", + "metadata": {}, + "outputs": [], + "source": [ + "external_grouped_package_type_usage_spread_significant = group_to_others_below_threshold(\n", + " data_frame=external_grouped_package_usage_spread,\n", + " value_column='sumNumberOfTypes',\n", + " name_column='externalSecondLevelPackageName',\n", + " threshold= 0.5\n", + ");\n", + "\n", + "plot.figure();\n", + "\n", + "axis = external_grouped_package_type_usage_spread_significant.plot(\n", + " kind='pie',\n", + " title='Top external package (grouped by first 2 layers) usage [%] by type',\n", + " legend=True,\n", + " labeldistance=None,\n", + " autopct='%1.2f%%',\n", + " textprops={'fontsize': 5},\n", + " pctdistance=1.2,\n", + " cmap=main_color_map\n", + ")\n", + "axis.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", + "plot.show()" + ] + }, + { + "cell_type": "markdown", + "id": "0b91082e", + "metadata": {}, + "source": [ + "#### Table 4 Chart 2 - Most widely spread second level external packages in % by package\n", + "\n", + "External package groups that are used less than 0.5% are grouped into the name \"others\" to get a cleaner chart\n", + "with the most significant external packages and how often they are called in percent."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d74f8422", + "metadata": {}, + "outputs": [], + "source": [ + "external_grouped_package_package_usage_spread_significant = group_to_others_below_threshold(\n", + " data_frame=external_grouped_package_usage_spread,\n", + " value_column='sumNumberOfPackages',\n", + " name_column='externalSecondLevelPackageName',\n", + " threshold= 0.5\n", + ");\n", + "\n", + "plot.figure();\n", + "\n", + "axis = external_grouped_package_package_usage_spread_significant.plot(\n", + " kind='pie',\n", + " title='Top external package (grouped by first 2 layers) usage [%] by package',\n", + " legend=True,\n", + " labeldistance=None,\n", + " autopct='%1.2f%%',\n", + " textprops={'fontsize': 5},\n", + " pctdistance=1.2,\n", + " cmap=main_color_map\n", + ")\n", + "axis.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", + "plot.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "29ec2211", + "metadata": {}, + "source": [ + "### Table 5 - Top 20 least used external packages overall\n", + "\n", + "This table identifies external packages that aren't used very often. This could help to find libraries that aren't actually needed or maybe easily replaceable. Some of them might be used sparsely on purpose for example as an adapter to an external library that is actually important. Thus, decisions need to be made on a case-by-case basis.\n", + "\n", + "Only the last 20 entries are shown. 
The whole table can be found in the following CSV report:\n", + "`External_package_usage_overall`\n", + "\n", + "**Columns:**\n", + "- *externalPackageName* identifies the external package as described above\n", + "- *numberOfExternalTypeCalls* includes every invocation or reference to the types in the external package" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03641b8b", + "metadata": {}, + "outputs": [], + "source": [ + "# Sort by number of external type calls\n", + "external_package_least_used=external_package_usage.sort_values(by='numberOfExternalTypeCalls', ascending=True)\n", + "\n", + "# Reset index\n", + "external_package_least_used = external_package_least_used.reset_index(drop=True)\n", + "\n", + "# Select columns and only show the first 10 entries (head)\n", + "external_package_least_used[['externalPackageName','numberOfExternalTypeCalls']].head(20)\n" + ] + }, + { + "cell_type": "markdown", + "id": "0bd11586", + "metadata": {}, + "source": [ + "### Table 6 - External usage per artifact sorted by highest external type rate descending\n", + "\n", + "The following table shows the most used external packages separately for each artifact including external annotations. The results are sorted by the artifacts with the highest external type usage rate descending. \n", + "\n", + "The intention of this table is to find artifacts that use a lot of external dependencies in relation to their size and get all the external packages and their usage.\n", + "\n", + "Only the last 40 entries are shown. 
The whole table can be found in the following CSV report:\n", + "`External_package_usage_per_artifact_sorted`\n", + "\n", "**Columns:**\n", + "- *artifactName* is used to group the the external package usage per artifact for a more detailed analysis.\n", "- *externalPackageName* identifies the external package as described above\n", "- *numberOfExternalTypeCaller* refers to the distinct types that make use of the external package\n", "- *numberOfExternalTypeCalls* includes every invocation or reference to the types in the external package\n", - "- *allTypes* represents the total count of all analyzed types in general\n", + "- *numberOfTypesInArtifact* represents the total count of all analyzed types for the artifact\n", + "- *numberOfExternalTypesInArtifact* is the number of all external types that are used by the artifact\n", + "- *numberOfExternalPackagesInArtifact* is the number of all external packages that are used by the artifact\n", + "- *externalTypeRate* is the numberOfExternalTypesInArtifact / numberOfTypesInArtifact * 100\n", "- *externalTypeNames* contains a list of actually utilized types of the external package" ] }, { "cell_type": "code", "execution_count": null, - "id": "ff524ac7", + "id": "f8459ede", "metadata": {}, "outputs": [], "source": [ - "external_package_usage=query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_package_usage_overall.cypher\")\n", + "query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_package_usage_per_artifact_sorted.cypher\").head(40)" + ] + }, + { + "cell_type": "markdown", + "id": "dd5600ed", + "metadata": {}, + "source": [ + "### Table 7 - Artifacts and their external packages\n", "\n", - "# Select columns and only show the first 20 entries (head)\n", - "external_package_usage.head(20)" + "The following table shows the artifacts with the highest external dependency usage broken down by each external package including external annotations. 
The results are sorted by the artifacts with the highest external package usage rate descending. \n", + "\n", + "The intention of this table is to find artifacts that use a lot of external dependencies and show in detail which external packages are used by them and how many internal packages.\n", + "\n", + "Only the last 30 entries are shown. The whole table can be found in the following CSV report:\n", + "`External_package_usage_per_artifact_and_external_package`\n", + "\n", + "**Columns:**\n", + "- *artifactName* is the name of the artifact with external dependencies (first grouping column)\n", + "- *artifactPackages* is the number of packages in the artifact\n", + "- *artifactTypes* is the number of types in the artifact\n", + "- *artifactExternalPackages* is the number of external packages used by the artifact\n", + "- *artifactExternalCallingPackages* is the number of packages that use external packages in the artifact \n", + "- *artifactExternalCallingPackagesRate* is artifactExternalCallingPackages / artifactPackages * 100%\n", + "- *externalPackageName* the name of the external package (second grouping column)\n", + "- *numberOfPackages* is the number of internal packages of the artifact that use the external packages\n", + "- *numberOfTypes* is the number of internal types of the artifact that use the external packages\n", + "- *packagesCallingExternalRate* is numberOfPackages / artifactPackages * 100%\n", + "- *typesCallingExternalRate* is numberOfTypes / artifactTypes * 100%\n", + "- *nameOfPackages* names of the internal packages that use the external package in the artifact\n", + "- *someTypeNames* some (10) names of the internal types that use the external package in the artifact" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39e4deb3", + "metadata": {}, + "outputs": [], + "source": [ + "external_packages_per_artifact = 
query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_package_usage_per_artifact_and_external_package.cypher\")\n", + "external_packages_per_artifact.head(30)" ] }, { - "attachments": {}, "cell_type": "markdown", - "id": "1143afcb", + "id": "7e18a377", "metadata": {}, "source": [ - "### Chart 1 - Most called external packages in %\n", + "### Table 7a - Artifacts and their external packages (first 2 levels)\n", "\n", - "Packages that are used less than 0.7% are grouped into the name \"others\" to get a cleaner chart\n", - "with the most significant external packages and how ofter they are called in percent." + "The following table groups the external packages by their first two levels. For example `javax.xml.namespace` and `javax.xml.stream` will be grouped together to `javax.xml`." ] }, { "cell_type": "code", "execution_count": null, - "id": "99ef3fad", + "id": "6c8c7ed7", + "metadata": {}, + "outputs": [], + "source": [ + "external_second_level_packages_per_artifact = query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_second_level_package_usage_per_artifact_and_external_package.cypher\")\n", + "external_second_level_packages_per_artifact.head(30)" + ] + }, + { + "cell_type": "markdown", + "id": "bb458f01", + "metadata": {}, + "source": [ + "#### Table 7b - Top 15 external dependency using artifacts as columns with their external packages\n", + "\n", + "The following table uses pivot to show the artifacts in columns, the external dependencies in rows and the number of internal packages as values." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd9667a9", + "metadata": {}, + "outputs": [], + "source": [ + "# Calculate the percentage for each value based on the global sum of all values\n", + "def percentage_global(data_frame : pd.DataFrame):\n", + " total = data_frame.sum().sum()\n", + " return data_frame / total * 100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ae9ada4", "metadata": {}, "outputs": [], "source": [ - "external_package_usage_significant = external_package_usage.copy();\n", + "external_packages_per_artifact_pivot = external_packages_per_artifact.copy()\n", "\n", - "# Add column \"percentOfExternalTypeCalls\" with the percentage of the \"numberOfExternalTypeCalls\".\n", - "external_package_usage_significant['percentOfExternalTypeCalls'] = external_package_usage_significant['numberOfExternalTypeCalls'] / external_package_usage_significant['numberOfExternalTypeCalls'].sum() * 100\n", + "# Every row represents the number of external package\n", + "external_packages_per_artifact_pivot=external_packages_per_artifact_pivot.pivot(index='externalPackageName', columns='artifactName', values='numberOfPackages')\n", "\n", - "# Change the external package name to \"others\" if it is called less than 0.7 percent\n", - "external_package_usage_significant.loc[external_package_usage_significant['percentOfExternalTypeCalls'] < 0.7, 'externalPackageName'] = 'others'\n", + "# Sort by column sum and then take only the first 10 columns\n", + "sum_of_external_packages_per_artifact = external_packages_per_artifact_pivot.sum()\n", + "external_packages_per_artifact_pivot = external_packages_per_artifact_pivot[sum_of_external_packages_per_artifact.sort_values(ascending=False).index[:15]]\n", "\n", - "# Group external package name (foremost the new \"others\" entries) and sum their \"percentOfExternalTypeCalls\"\n", - "external_package_usage_significant = 
external_package_usage_significant.groupby('externalPackageName')['percentOfExternalTypeCalls'].sum()\n", + "# Fill missing values with zeroes\n", + "external_packages_per_artifact_pivot.fillna(0, inplace=True)\n", + "\n", + "external_packages_per_artifact_pivot.astype('int')" + ] + }, + { + "cell_type": "markdown", + "id": "9b1fed8e", + "metadata": {}, + "source": [ + "#### Table 7c - Top 15 external dependency using artifacts as columns with their external packages (first 2 levels)\n", "\n", - "# Sort by \"percentOfExternalTypeCalls\" descending\n", - "external_package_usage_significant.sort_values(ascending=False, inplace=True)" + "The following table uses pivot to show the artifacts in columns, the external package name grouped by its first two levels in rows and the number of internal packages as values. For example `javax.xml.namespace` and `javax.xml.stream` will be grouped together to `javax.xml`." ] }, { "cell_type": "code", "execution_count": null, - "id": "688b6d56", + "id": "845099fc", "metadata": {}, "outputs": [], "source": [ - "plot.figure();\n", + "external_second_level_packages_per_artifact_pivot = external_second_level_packages_per_artifact.copy()\n", "\n", - "# Set the name of the index to artifactName\n", - "#external_package_usage_significant=external_package_usage_significant.set_index('externalPackageName')\n", + "# Every row represents the number of external package\n", + "external_second_level_packages_per_artifact_pivot=external_second_level_packages_per_artifact_pivot.pivot(index='externalPackageNameFirst2Levels', columns='artifactName', values='numberOfPackages')\n", "\n", - "axis = external_package_usage_significant.plot(\n", - " kind='pie',\n", - " title='Top External Package Usage [%]',\n", + "# Sort by column sum and then take only the first 10 columns\n", + "sum_of_external_second_level_packages_per_artifact = external_second_level_packages_per_artifact_pivot.sum()\n", + "external_second_level_packages_per_artifact_pivot = 
external_second_level_packages_per_artifact_pivot[sum_of_external_second_level_packages_per_artifact.sort_values(ascending=False).index[:15]]\n", + "\n", + "# Fill missing values with zeroes\n", + "external_second_level_packages_per_artifact_pivot.fillna(0, inplace=True)\n", + "\n", + "external_second_level_packages_per_artifact_pivot.astype('int')" + ] + }, + { + "cell_type": "markdown", + "id": "43ec339b", + "metadata": {}, + "source": [ + "#### Table 7 Chart 1 - Top 15 external dependency using artifacts and their external packages stacked\n", + "\n", + "The following chart shows the top 15 external package using artifacts and breaks down which external packages they use in how many different internal packages with stacked bars. \n", + "\n", + "Note that every external dependency is counted separately so that if one internal package uses two external packages it will be displayed for both and so stacked twice. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d8e8bb0", + "metadata": {}, + "outputs": [], + "source": [ + "plot.figure();\n", + "axes = external_packages_per_artifact_pivot.transpose().plot(\n", + " kind='bar', \n", + " grid=True,\n", + " title='External package usage per artifact', \n", + " xlabel='artifact',\n", + " ylabel='number of packages',\n", + " stacked=True,\n", " legend=True,\n", - " labeldistance=None,\n", - " autopct='%1.2f%%',\n", - " textprops={'fontsize': 6},\n", - " pctdistance=1.2,\n", " cmap=main_color_map\n", - ")\n", - "axis.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", + ").legend(bbox_to_anchor=(1.0, 1.0))\n", "plot.show()" ] }, { - "attachments": {}, "cell_type": "markdown", - "id": "29ec2211", + "id": "2d577c45", "metadata": {}, "source": [ - "### Table 2 - Top 20 least used external packages overall\n", + "#### Table 7 Chart 2 - Top 15 external dependency using artifacts and their external packages (first 2 levels) stacked\n", "\n", - "This table identifies external packages that aren't used
very often. This could help to find libraries that aren't actually needed or maybe easily replaceable. Some of them might be used sparsely on purpose for example as an adapter to an external library that is actually important. Thus, decisions need to be made on a case-by-case basis.\n", + "The following chart shows the top 15 external package using artifacts and breaks down which external packages (first 2 levels) are used in how many different internal packages with stacked bars. \n", "\n", - "**Columns:**\n", - "- *externalPackageName* identifies the external package as described above\n", - "- *numberOfExternalTypeCalls* includes every invocation or reference to the types in the external package" + "Note that every external dependency is counted separately so that if one internal package uses two external packages it will be displayed for both and so stacked twice. " ] }, { "cell_type": "code", "execution_count": null, - "id": "03641b8b", + "id": "17fc1572", "metadata": {}, "outputs": [], "source": [ - "# Sort by number of external type calls\n", - "external_package_least_used=external_package_usage.sort_values(by='numberOfExternalTypeCalls', ascending=True)\n", - "\n", - "# Reset index\n", - "external_package_least_used = external_package_least_used.reset_index(drop=True)\n", - "\n", - "# Select columns and only show the first 10 entries (head)\n", - "external_package_least_used[['externalPackageName','numberOfExternalTypeCalls']].head(20)\n" + "plot.figure();\n", + "axes = external_second_level_packages_per_artifact_pivot.transpose().plot(\n", + " kind='bar', \n", + " grid=True,\n", + " title='External package (first 2 levels) usage per artifact', \n", + " xlabel='artifact',\n", + " ylabel='number of packages',\n", + " stacked=True,\n", + " legend=True,\n", + " cmap=main_color_map\n", + ").legend(bbox_to_anchor=(1.0, 1.0))\n", + "plot.show()" ] }, { @@ -258,17 +1024,26 @@ "id": "33c3bb79", "metadata": {}, "source": [ - "### Table 3 - External usage per
artifact\n", + "### Table 8 - External usage per artifact\n", + "\n", + "The following table shows the most used external packages separately for each artifact including external annotations. The results are grouped per artifact and sorted by the artifacts with the highest external type usage rate descending. Additionally, for each artifact the top 5 used external packages are listed in the top5ExternalPackages column. \n", "\n", - "The following table shows the most used external packages separately for each artifact including external annotations. \n", + "The intention of this table is to find artifacts that use a lot of external dependencies in relation to their size and get an overview per artifact with the top 5 used external packages, the number of external types and packages used etc. .\n", + "\n", + "Only the last 40 entries are shown. The whole table can be found in the following CSV report:\n", + "`External_package_usage_per_artifact_sorted_top`\n", "\n", "**Columns:**\n", "- *artifactName* is used to group the the external package usage per artifact for a more detailed analysis.\n", - "- *externalPackageName* identifies the external package as described above\n", + "- *numberOfTypesInArtifact* represents the total count of all analyzed types for the artifact\n", + "- *numberOfExternalTypesInArtifact* is the number of all external types that are used by the artifact\n", + "- *numberOfExternalPackagesInArtifact* is the number of all external packages that are used by the artifact\n", + "- *externalTypeRate* is the numberOfExternalTypesInArtifact / numberOfTypesInArtifact * 100\n", "- *numberOfExternalTypeCaller* refers to the distinct types that make use of the external package\n", "- *numberOfExternalTypeCalls* includes every invocation or reference to the types in the external package\n", - "- *numberOfTypesInArtifact* represents the total count of all analyzed types for the artifact\n", - "- *externalTypeNames* contains a list of actually utilized types 
of the external package" + "- *numberOfExternalPackages* is the number of distinct external packages used by the artifact\n", + "- *top5ExternalPackages* contains a list of the top 5 most used external packages of the artifact\n", + "- *someExternalTypes* contains a list of lists and is also meant to provide some examples of external types used" ] }, { @@ -278,7 +1053,7 @@ "metadata": {}, "outputs": [], "source": [ - "query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_package_usage_per_artifact.cypher\")" + "query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_package_usage_per_artifact_sorted_top.cypher\").head(40)" ] }, { @@ -287,9 +1062,12 @@ "id": "4fb87c8a", "metadata": {}, "source": [ - "### Table 4 - External usage per artifact and package\n", + "### Table 9 - External usage per artifact and package\n", + "\n", + "This table lists internal packages and the artifacts they belong to that use many different external types of a specific external package without taking external annotations into account. \n", "\n", - "The next table lists internal packages and the artifacts they belong to that use many different external types of a specific external package without taken external annotations into account. Only the first 30 rows are shown.\n", + "Only the first 40 entries are shown.
The whole table can be found in the following CSV report:\n", + "`External_package_usage_per_artifact_and_package`\n", "\n", "**Columns:**\n", "- *artifactName* that contains the type that calls the external package\n", @@ -310,7 +1088,7 @@ "outputs": [], "source": [ "external_package_usage_per_package = query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_package_usage_per_artifact_and_package.cypher\")\n", - "external_package_usage_per_package.head(30)" + "external_package_usage_per_package.head(40)" ] }, { @@ -319,9 +1097,12 @@ "id": "a3161e2b", "metadata": {}, "source": [ - "### Table 5 - Top 20 external package usage per type\n", + "### Table 10 - Top 20 external package usage per type\n", "\n", - "This table lists the internal types that utilize the most different external types and packages. These have the highest probability of change depending on external libraries. A case-by-case approach is also advisable here because there could for example also be code units that encapsulate an external library and have this high count of external dependencies on purpose.\n", + "This table shows internal types that utilize the most different external types and packages. These have the highest probability of change depending on external libraries. A case-by-case approach is also advisable here because there could for example also be code units that encapsulate an external library and have this high count of external dependencies on purpose.\n", + "\n", + "Only the last 20 entries are shown. 
The whole table can be found in the following CSV report:\n", + "`External_package_usage_per_type`\n", "\n", "**Columns:**\n", "- *artifactName* that contains the type that calls the external package\n", @@ -353,13 +1134,15 @@ "id": "5b420f59", "metadata": {}, "source": [ - "### Table 6 - External package usage distribution per type\n", + "### Table 11 - External package usage distribution per type\n", "\n", - "The next table shown here only includes the first 20 rows.\n", - "It shows how many types use one external package, how many use two, etc. .\n", + "This table shows how many types use one external package, how many use two, etc.\n", "This gives an overview of the distribution of external package calls and the overall coupling to external libraries. The higher the count of distinct external packages the lower should be the count of types that use them. Dependencies to external annotations are left out here.\n", "\n", - "Have a look above to find out which types have the highest external package dependency usage.\n", + "More details about which types have the highest external package dependency usage can be found in the tables 4 and 5 above.\n", + "\n", + "Only the first 40 entries are shown. 
The whole table can be found in the following CSV report:\n", + "`External_package_usage_per_artifact_distribution`\n", "\n", "**Columns:**\n", "- *artifactName* that contains the type that calls the external package\n", @@ -376,8 +1159,9 @@ "metadata": {}, "outputs": [], "source": [ - "external_package_usage_per_type_distribution = query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_package_usage_per_type_distribution.cypher\")\n", - "external_package_usage_per_type_distribution[['artifactName', 'artifactTypes', 'numberOfExternalPackages', 'numberOfTypes', 'numberOfTypesPercentage']].head(20)" + "external_package_usage_per_artifact_distribution = query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_package_usage_per_artifact_distribution.cypher\")\n", + "external_package_usage_per_artifact_distribution_truncated=external_package_usage_per_artifact_distribution[['artifactName', 'artifactPackages', 'artifactTypes', 'numberOfExternalPackages', 'numberOfPackages', 'numberOfTypes', 'typesCallingExternalRate', 'packagesCallingExternalRate']].head(40)\n", + "external_package_usage_per_artifact_distribution_truncated" ] }, { @@ -386,9 +1170,14 @@ "id": "39c045f6", "metadata": {}, "source": [ - "### Table 7 - External package usage distribution in percentage\n", + "### Table 12 - External package usage per artifact grouped by number of internal packages\n", "\n", - "The following table uses the same data as Table 6 but has a column per internal artifact and a row for the number of different external packages used. The values are the percentages of types that fulfill both conditions so they belong to artifact and have the exact count of different external packages used. Dependencies to external annotations are left out here." + "The following table shows the external package usage for every artifact grouped by the number of distinct internal dependent packages. 
The intention is to find external package usage spread across multiple internal packages in artifacts. \n", + "\n", + "Artifacts that encapsulate external dependency calls in one internal package overall (or each) are easier to change if those external dependencies change and are most likely applying a [Hexagonal architecture](https://alistair.cockburn.us/hexagonal-architecture). Artifacts that use external dependencies in multiple internal packages need more effort to adapt to changes of those external dependencies. On one hand this could be intended e.g. when using standardized libraries. On the other hand this might indicate higher than necessary coupling.\n", + "\n", + "The whole table can be found in the following CSV report:\n", + "`External_package_usage_per_internal_package_count`" ] }, { @@ -398,76 +1187,242 @@ "metadata": {}, "outputs": [], "source": [ + "#external_package_usage_per_package_distribution = external_package_usage_per_artifact_distribution\n", + "external_package_usage_per_package_distribution = query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_package_usage_per_internal_package_count.cypher\")\n", "\n", - "# Organize artifacts in columns with the number of types as values using pivot\n", + "# Only show external dependencies that are at least used in 2 internal packages. 
\n", + "# Filter out all rows with \"numberOfPackages\" < 2.\n", + "external_package_usage_per_package_distribution=external_package_usage_per_package_distribution.query(\"`numberOfPackages` >= 2\")\n", + "# Organize artifacts in columns with the number of packages that call external packages as values using pivot\n", "# Every row represents the number of external packages\n", - "external_package_usage_per_type_distribution=external_package_usage_per_type_distribution.pivot(index='numberOfExternalPackages', columns='artifactName', values='numberOfTypesPercentage')\n", + "external_package_usage_per_package_distribution=external_package_usage_per_package_distribution.pivot(index='numberOfPackages', columns='artifactName', values='maxPackagesCallingExternalRate')\n", "\n", "# Fill missing values with zero\n", - "external_package_usage_per_type_distribution.fillna(0, inplace=True)\n", + "external_package_usage_per_package_distribution.fillna(0, inplace=True)\n", + "\n", + "external_package_usage_per_package_distribution" + ] + }, + { + "cell_type": "markdown", + "id": "055e5a36", + "metadata": {}, + "source": [ + "### Table 13 - External package usage aggregated\n", + "\n", + "This table lists all artifacts and their external package dependencies usage aggregated over internal packages. \n", "\n", - "# Convert to integer\n", - "# external_package_usage_per_type_distribution=external_package_usage_per_type_distribution.astype(int)\n", + "The intention behind this is to find artifacts that use an external dependency across multiple internal packages. This might be intended for frameworks and standardized libraries and helps to quantify how widely those are used. For some external dependencies it might be beneficial to only access it from one package and provide an abstraction for internal usage following a [Hexagonal architecture](https://alistair.cockburn.us/hexagonal-architecture). 
Thus, this table may also help in finding application for the Hexagonal architecture or similar approaches (Domain Driven Design Anti Corruption Layer). After all it is easier to update or replace such external dependencies when they are used in specific areas and not all over the code.\n", "\n", - "external_package_usage_per_type_distribution.head(10)" + "Only the first 30 entries are shown. The whole table can be found in the following CSV report:\n", + "`External_package_usage_per_artifact_package_aggregated`\n", + "\n", + "**Columns:**\n", + "- *artifactName* that contains the type that calls the external package\n", + "- *artifactPackages* is the total count of packages in the artifact\n", + "- *artifactTypes* is the total count of types in the artifact\n", + "- *numberOfExternalPackages* the number of distinct external packages used\n", + "- *[min,max,med,avg,std]NumberOfPackages* provide statistics based on each external package and its package usage within the artifact\n", + "- *[min,max,med,avg,std]NumberOfPackagesPercentage* provide statistics in % based on each external package and its package usage within the artifact in respect to the overall count of packages in the artifact\n", + "- *[min,max,med,avg,std]NumberOfTypes* provide statistics based on each external package and its type usage within the artifact\n", + "- *[min,max,med,avg,std]NumberOfTypesPercentage* provide statistics in % based on each external package and its type usage within the artifact in respect to the overall count of packages in the artifact\n", + "- *numberOfTypes* in the artifact where the *numberOfExternalPackages* applies\n", + "- *numberOfTypesPercentage* in the artifact where the *numberOfExternalPackages* applies in %" + ] + }, + { + "cell_type": "markdown", + "id": "7850d0a2", + "metadata": {}, + "source": [ + "#### Table 13a - External package usage aggregated - count of internal packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d01860a", + 
"metadata": {}, + "outputs": [], + "source": [ + "external_package_usage_aggregated = query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_package_usage_per_artifact_package_aggregated.cypher\")\n", + "\n", + "external_package_usage_aggregated_packages = external_package_usage_aggregated[['artifactName', 'artifactPackages', 'numberOfExternalPackages', 'minNumberOfPackages', 'medNumberOfPackages', 'avgNumberOfPackages', 'maxNumberOfPackages', 'stdNumberOfPackages']]\n", + "external_package_usage_aggregated_packages.head(30)" + ] + }, + { + "cell_type": "markdown", + "id": "f2dbe7ac", + "metadata": {}, + "source": [ + "#### Table 13b - External package usage aggregated - percentage of internal packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e70afee8", + "metadata": {}, + "outputs": [], + "source": [ + "external_package_usage_aggregated_packages_percentage = external_package_usage_aggregated[['artifactName', 'artifactPackages', 'numberOfExternalPackages', 'minNumberOfPackagesPercentage', 'medNumberOfPackagesPercentage', 'avgNumberOfPackagesPercentage', 'maxNumberOfPackagesPercentage', 'stdNumberOfPackagesPercentage']]\n", + "external_package_usage_aggregated_packages_percentage.head(30)" + ] + }, + { + "cell_type": "markdown", + "id": "8c476382", + "metadata": {}, + "source": [ + "#### Table 13c - External package usage aggregated - count of internal types" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40392b95", + "metadata": {}, + "outputs": [], + "source": [ + "external_package_usage_aggregated_types = external_package_usage_aggregated[['artifactName', 'artifactTypes', 'numberOfExternalPackages', 'minNumberOfTypes', 'medNumberOfTypes', 'avgNumberOfTypes', 'maxNumberOfTypes', 'stdNumberOfTypes']]\n", + "external_package_usage_aggregated_types.head(30)" + ] + }, + { + "cell_type": "markdown", + "id": "0eac5b93", + "metadata": {}, + "source": [ + "#### Table 13d - External package usage 
aggregated - percentage of internal types" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "260cc520", + "metadata": {}, + "outputs": [], + "source": [ + "external_package_usage_aggregated_types_percentage = external_package_usage_aggregated[['artifactName', 'artifactTypes', 'numberOfExternalPackages', 'minNumberOfTypesPercentage', 'medNumberOfTypesPercentage', 'avgNumberOfTypesPercentage', 'maxNumberOfTypesPercentage', 'stdNumberOfTypesPercentage']]\n", + "external_package_usage_aggregated_types_percentage.head(30)" ] }, { - "attachments": {}, "cell_type": "markdown", - "id": "121a215f", + "id": "d1a16e29", + "metadata": {}, + "source": [ + "#### Table 13 Chart 1 - External package usage - max percentage of internal types\n", + "\n", + "This chart shows per artifact the maximum percentage of internal packages (compared to all packages in that artifact) that use one specific external package. \n", + "\n", + "**Example:** One artifact might use 10 external packages where 7 of them are used in one internal package, 2 of them are used in two packages and one external dependency is used in 5 packages. So for this artifact there will be a point at x = 10 (external packages used by the artifact) and 5 (max internal packages). Instead of the count the percentage of internal packages compared to all packages in that artifact is used to get a normalized plot." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad1db8af", "metadata": {}, + "outputs": [], "source": [ - "### Chart 2 - External package usage distribution in percentage\n", + "def annotate_plot(data_frame: pd.DataFrame, index: int):\n", + " \"\"\"\n", + " Annotates the data points identified by the \"index\" in the plot of the \"data_frame\" \n", + " \"\"\"\n", + " x_position = data_frame.numberOfExternalPackages[index].item()\n", + " y_position = data_frame.maxNumberOfPackagesPercentage[index].item()\n", + " artifact_name = data_frame.artifactName[index].item()\n", "\n", - "The next chart shows the number of types per artifact that use the given number of different external packages as listed in Table 7. Dependencies to external annotations are left out here." + " label_box=dict(boxstyle=\"round4,pad=0.5\", fc=\"w\", alpha=0.8)\n", + " plot.annotate(artifact_name\n", + " ,xy=(x_position, y_position)\n", + " ,xycoords='data'\n", + " ,xytext=(-30, -15)\n", + " ,textcoords='offset points'\n", + " ,size=6\n", + " ,bbox=label_box\n", + " ,arrowprops=dict(arrowstyle=\"-|>\", mutation_scale=10, color=\"black\")\n", + " )\n", + "\n", + "def index_of_sorted(data_frame: pd.DataFrame, highest: list[str] = []):\n", + " \"\"\"\n", + " Sorts the \"data_frame\" by columns 'numberOfExternalPackages','maxNumberOfPackagesPercentage','artifactPackages', 'artifactName'\n", + " and returns the index of the first row.\n", + " Columns that are contained in the list of strings parameter \"highest\" will be sorted descending (highest first).\n", + " \"\"\"\n", + " by = ['numberOfExternalPackages','maxNumberOfPackagesPercentage','artifactPackages', 'artifactName']\n", + " ascending = [('numberOfExternalPackages' not in highest), ('maxNumberOfPackagesPercentage' not in highest), False, True]\n", + " return data_frame.sort_values(by=by, ascending=ascending).head(1).index" ] }, { "cell_type": "code", "execution_count": null, - "id": "58c9052a", + "id": "615238d5", 
"metadata": {}, "outputs": [], "source": [ "plot.figure();\n", - "axes = external_package_usage_per_type_distribution.plot(\n", - " kind='bar', \n", - " grid=True,\n", - " title='Relative External Package Usage', \n", + "axes = external_package_usage_aggregated.plot(\n", + " kind='scatter',\n", + " title='External package usage - max internal packages', \n", + " x='numberOfExternalPackages',\n", + " y='maxNumberOfPackagesPercentage',\n", + " s='artifactPackages',\n", + " c='leidenCommunityId',\n", " xlabel='external package count',\n", - " ylabel='percentage of types',\n", + " ylabel='max percentage of internal packages',\n", " cmap=main_color_map,\n", ")\n", + "\n", + "# Annotate the largest artifact with the highest number of external packages and max number of packages in percentage\n", + "annotation_index = index_of_sorted(highest=['numberOfExternalPackages','maxNumberOfPackagesPercentage'], data_frame=external_package_usage_aggregated)\n", + "annotate_plot(external_package_usage_aggregated, annotation_index)\n", + "\n", + "\n", + "# Annotate the largest artifact with the lowest number of external packages and the highest max number of packages in percentage\n", + "annotation_index = index_of_sorted(highest=['maxNumberOfPackagesPercentage'], data_frame=external_package_usage_aggregated)\n", + "annotate_plot(external_package_usage_aggregated, annotation_index)\n", + "\n", + "# Annotate the largest artifact with the lowest number of external packages and max number of packages in percentage\n", + "annotation_index = index_of_sorted(highest=[], data_frame=external_package_usage_aggregated)\n", + "annotate_plot(external_package_usage_aggregated, annotation_index)\n", + "\n", + "\n", "plot.show()" ] }, { - "attachments": {}, "cell_type": "markdown", - "id": "e4780292", + "id": "9b7cff51", "metadata": {}, "source": [ - "### Chart 3 - External package usage distribution in percentage stacked per artifact\n", + "#### Table 13 Chart 2 - External package usage - median 
percentage of internal types\n", "\n", - "The following chart shows a stacked bar for each artifact. Every color represents a different count of different external packages used. The y axis then shows how many percent of types (compared to all types of that artifact) use these external packages. By stacking them above each other it is easier to compare the artifacts and their external package usage. Dependencies to external annotations are left out here." + "This chart shows per artifact the median (50th percentile) of internal packages (compared to all packages in that artifact) that use one specific external package. \n", + "\n", + "**Example:** One artifact might use 9 external packages where 3 of them are used in 1 internal package, 3 of them are used in 2 packages and the last 3 ones are used in 3 packages. So for this artifact there will be a point at x = 9 (external packages used by the artifact) and y = 2 (median internal packages). Instead of the count the percentage of internal packages compared to all packages in that artifact is used to get a normalized plot." 
] }, { "cell_type": "code", "execution_count": null, - "id": "cd612166", + "id": "34d2595f", "metadata": {}, "outputs": [], "source": [ "plot.figure();\n", - "axes = external_package_usage_per_type_distribution.transpose().plot(\n", - " kind='bar', \n", - " grid=True,\n", - " title='Relative External Package Usage', \n", - " xlabel='artifact',\n", - " ylabel='percentage of types',\n", - " stacked=True,\n", + "axes = external_package_usage_aggregated.plot(\n", + " kind='scatter',\n", + " title='External package usage - median internal packages', \n", + " x='numberOfExternalPackages',\n", + " y='medNumberOfPackagesPercentage',\n", + " s='artifactPackages',\n", + " c='leidenCommunityId',\n", + " xlabel='external package count',\n", + " ylabel='median percentage of internal packages',\n", " cmap=main_color_map,\n", ")\n", "plot.show()" @@ -482,7 +1437,7 @@ "## Maven POMs\n", "\n", "\n", - "### Table 8 - Maven POMs and their declared dependencies\n", + "### Table 14 - Maven POMs and their declared dependencies\n", "\n", "If Maven is used as for package and dependency management and a \".pom\" file is included in the artifact, the following table shows the external dependencies that are declared there." ] diff --git a/scripts/reports/ExternalDependenciesCsv.sh b/scripts/reports/ExternalDependenciesCsv.sh index d567c79eb..a31d89eb9 100755 --- a/scripts/reports/ExternalDependenciesCsv.sh +++ b/scripts/reports/ExternalDependenciesCsv.sh @@ -40,8 +40,16 @@ if ! 
execute_cypher_expect_results "${EXTERNAL_DEPENDENCIES_CYPHER_DIR}/List_ext fi execute_cypher "${EXTERNAL_DEPENDENCIES_CYPHER_DIR}/External_package_usage_overall.cypher" > "${FULL_REPORT_DIRECTORY}/External_package_usage_overall.csv" +execute_cypher "${EXTERNAL_DEPENDENCIES_CYPHER_DIR}/External_package_usage_spread.cypher" > "${FULL_REPORT_DIRECTORY}/External_package_usage_spread.csv" +execute_cypher "${EXTERNAL_DEPENDENCIES_CYPHER_DIR}/External_second_level_package_usage_overall.cypher" > "${FULL_REPORT_DIRECTORY}/External_second_level_package_usage_overall.csv" +execute_cypher "${EXTERNAL_DEPENDENCIES_CYPHER_DIR}/External_second_level_package_usage_spread.cypher" > "${FULL_REPORT_DIRECTORY}/External_second_level_package_usage_spread.csv" execute_cypher "${EXTERNAL_DEPENDENCIES_CYPHER_DIR}/External_package_usage_per_type.cypher" > "${FULL_REPORT_DIRECTORY}/External_package_usage_per_type.csv" execute_cypher "${EXTERNAL_DEPENDENCIES_CYPHER_DIR}/External_package_usage_per_artifact.cypher" > "${FULL_REPORT_DIRECTORY}/External_package_usage_per_artifact.csv" -execute_cypher "${EXTERNAL_DEPENDENCIES_CYPHER_DIR}/External_package_usage_per_type_distribution.cypher" > "${FULL_REPORT_DIRECTORY}/External_package_usage_per_type_distribution.csv" +execute_cypher "${EXTERNAL_DEPENDENCIES_CYPHER_DIR}/External_package_usage_per_artifact_sorted_top.cypher" > "${FULL_REPORT_DIRECTORY}/External_package_usage_per_artifact_sorted_top.csv" +execute_cypher "${EXTERNAL_DEPENDENCIES_CYPHER_DIR}/External_package_usage_per_artifact_distribution.cypher" > "${FULL_REPORT_DIRECTORY}/External_package_usage_per_artifact_distribution.csv" +execute_cypher "${EXTERNAL_DEPENDENCIES_CYPHER_DIR}/External_package_usage_per_artifact_package_aggregated.cypher" > "${FULL_REPORT_DIRECTORY}/External_package_usage_per_artifact_package_aggregated.csv" execute_cypher "${EXTERNAL_DEPENDENCIES_CYPHER_DIR}/External_package_usage_per_artifact_and_package.cypher" > 
"${FULL_REPORT_DIRECTORY}/External_package_usage_per_artifact_and_package.csv" +execute_cypher "${EXTERNAL_DEPENDENCIES_CYPHER_DIR}/External_package_usage_per_artifact_and_external_package.cypher" > "${FULL_REPORT_DIRECTORY}/External_package_usage_per_artifact_and_external_package.csv" +execute_cypher "${EXTERNAL_DEPENDENCIES_CYPHER_DIR}/External_second_level_package_usage_per_artifact_and_external_package.cypher" > "${FULL_REPORT_DIRECTORY}/External_second_level_package_usage_per_artifact_and_external_package.csv" + execute_cypher "${EXTERNAL_DEPENDENCIES_CYPHER_DIR}/Maven_POMs_and_their_declared_dependencies.cypher" > "${FULL_REPORT_DIRECTORY}/Maven_POM_dependencies.csv" \ No newline at end of file From 1782e150309f0e2e4804005367169a20c99d1c86 Mon Sep 17 00:00:00 2001 From: JohT Date: Sun, 3 Sep 2023 10:17:54 +0200 Subject: [PATCH 10/15] Optimize internal dependencies report --- ...andidates_for_Interface_Segregation.cypher | 4 +- .../Cyclic_Dependencies.cypher | 42 ++- ...r => Cyclic_Dependencies_Breakdown.cypher} | 26 +- ...pendencies_Breakdown_Backward_Only.cypher} | 32 +- .../Cyclic_Dependencies_as_Nodes.cypher | 7 + ...are_used_by_many_different_packages.cypher | 10 +- cypher/List_all_existing_artifacts.cypher | 8 +- jupyter/InternalDependencies.ipynb | 285 ++++++++++++++++-- scripts/reports/InternalDependenciesCsv.sh | 5 +- 9 files changed, 351 insertions(+), 68 deletions(-) rename cypher/Cyclic_Dependencies/{Cyclic_Dependencies_as_unwinded_List.cypher => Cyclic_Dependencies_Breakdown.cypher} (60%) rename cypher/Cyclic_Dependencies/{Cyclic_Dependencies_as_List.cypher => Cyclic_Dependencies_Breakdown_Backward_Only.cypher} (51%) create mode 100644 cypher/Cyclic_Dependencies/Cyclic_Dependencies_as_Nodes.cypher diff --git a/cypher/Candidates_for_Interface_Segregation.cypher b/cypher/Candidates_for_Interface_Segregation.cypher index 2e9157f07..0fa970345 100644 --- a/cypher/Candidates_for_Interface_Segregation.cypher +++ 
b/cypher/Candidates_for_Interface_Segregation.cypher @@ -2,7 +2,7 @@ MATCH (type:Type)-[:DECLARES]->(method:Method)-[:INVOKES]->(dependentMethod:Method) MATCH (dependentMethod)<-[:DECLARES]-(dependentType:Type) -MATCH (dependentType)-[:IMPLEMENTS*]->(superType:Type)-[:DECLARES]->(inheritedMethod:Method) +MATCH (dependentType)-[:IMPLEMENTS*1..9]->(superType:Type)-[:DECLARES]->(inheritedMethod:Method) WHERE type.fqn <> dependentType.fqn AND dependentMethod.name IS NOT NULL AND inheritedMethod.name IS NOT NULL @@ -15,6 +15,8 @@ WHERE type.fqn <> dependentType.fqn // Count the different signatures without the return type // of all declared methods including the inherited ones ,count(DISTINCT split(method.signature, ' ')[1]) + count(DISTINCT split(inheritedMethod.signature, ' ')[1]) AS declaredMethods +// Filter out types that declare only a few more methods than those that are actually used. +// A good interface segregation candidate declares a lot of methods where only a few of them are used widely. 
WHERE declaredMethods > calledMethods + 2 WITH fullDependentTypeName ,declaredMethods diff --git a/cypher/Cyclic_Dependencies/Cyclic_Dependencies.cypher b/cypher/Cyclic_Dependencies/Cyclic_Dependencies.cypher index da1945e8d..64909dff0 100644 --- a/cypher/Cyclic_Dependencies/Cyclic_Dependencies.cypher +++ b/cypher/Cyclic_Dependencies/Cyclic_Dependencies.cypher @@ -1,7 +1,35 @@ -// Cyclic Dependencies -MATCH (package:Package)-[:CONTAINS]->(type:Type)-[:DEPENDS_ON]->(dependentType:Type)<-[:CONTAINS]-(dependentPackage:Package) -MATCH (dependentPackage)-[:CONTAINS]->(cycleType:Type)-[:DEPENDS_ON]->(cycleDependentType:Type)<-[:CONTAINS]-(package) -WHERE package <> dependentPackage -RETURN package, dependentPackage - ,type, dependentType, cycleType, cycleDependentType - LIMIT 100 \ No newline at end of file +//Cyclic Dependencies as List + +MATCH (package:Package)-[:CONTAINS]->(forwardSource:Type)-[:DEPENDS_ON]->(forwardTarget:Type)<-[:CONTAINS]-(dependentPackage:Package) +MATCH (dependentPackage)-[:CONTAINS]->(backwardSource:Type)-[:DEPENDS_ON]->(backwardTarget:Type)<-[:CONTAINS]-(package) +MATCH (artifact:Artifact)-[:CONTAINS]->(package) +MATCH (dependentArtifact:Artifact)-[:CONTAINS]->(dependentPackage) +WHERE package.fqn <> dependentPackage.fqn + WITH replace(last(split(artifact.fileName, '/')), '.jar', '') AS artifactName + ,package.fqn AS packageName + ,replace(last(split(dependentArtifact .fileName, '/')), '.jar', '') AS dependentArtifactName + ,dependentPackage.fqn AS dependentPackageName + ,collect(DISTINCT forwardSource.name + '->' + forwardTarget.name) AS forwardDependencies + ,collect(DISTINCT backwardSource.name + '->' + backwardTarget.name) AS backwardDependencies + WITH artifactName + ,packageName + ,dependentArtifactName + ,dependentPackageName + ,forwardDependencies + ,backwardDependencies + ,size(forwardDependencies) AS numberOfForwardDependencies + ,size(backwardDependencies) AS numberOfBackwardDependencies + ,size(forwardDependencies) + 
size(backwardDependencies) AS numberOfAllCyclicDependencies +WHERE (size(forwardDependencies) > size(backwardDependencies) + OR (size(forwardDependencies) = size(backwardDependencies) + AND size(packageName) >= size(dependentPackageName))) +RETURN artifactName + ,packageName + ,dependentArtifactName + ,dependentPackageName + ,toFloat(ABS(numberOfForwardDependencies - numberOfBackwardDependencies)) / numberOfAllCyclicDependencies AS forwardToBackwardBalance + ,numberOfForwardDependencies AS numberForward + ,numberOfBackwardDependencies AS numberBackward + ,forwardDependencies[0..9] AS someForwardDependencies + ,backwardDependencies +ORDER BY forwardToBackwardBalance DESC, packageName ASC \ No newline at end of file diff --git a/cypher/Cyclic_Dependencies/Cyclic_Dependencies_as_unwinded_List.cypher b/cypher/Cyclic_Dependencies/Cyclic_Dependencies_Breakdown.cypher similarity index 60% rename from cypher/Cyclic_Dependencies/Cyclic_Dependencies_as_unwinded_List.cypher rename to cypher/Cyclic_Dependencies/Cyclic_Dependencies_Breakdown.cypher index dd52e9f77..845136678 100644 --- a/cypher/Cyclic_Dependencies/Cyclic_Dependencies_as_unwinded_List.cypher +++ b/cypher/Cyclic_Dependencies/Cyclic_Dependencies_Breakdown.cypher @@ -1,14 +1,20 @@ -//Cyclic Dependencies as unwinded List +//Cyclic Dependencies Breakdown MATCH (package:Package)-[:CONTAINS]->(forwardSource:Type)-[:DEPENDS_ON]->(forwardTarget:Type)<-[:CONTAINS]-(dependentPackage:Package) MATCH (dependentPackage)-[:CONTAINS]->(backwardSource:Type)-[:DEPENDS_ON]->(backwardTarget:Type)<-[:CONTAINS]-(package) -WHERE package <> dependentPackage - WITH package - ,dependentPackage +MATCH (artifact:Artifact)-[:CONTAINS]->(package) +MATCH (dependentArtifact:Artifact)-[:CONTAINS]->(dependentPackage) +WHERE package.fqn <> dependentPackage.fqn + WITH replace(last(split(artifact.fileName, '/')), '.jar', '') AS artifactName + ,package.fqn AS packageName + ,replace(last(split(dependentArtifact .fileName, '/')), '.jar', '') AS 
dependentArtifactName + ,dependentPackage.fqn AS dependentPackageName ,collect(DISTINCT forwardSource.name + '->' + forwardTarget.name) AS forwardDependencies ,collect(DISTINCT backwardTarget.name + '<-' + backwardSource.name) AS backwardDependencies - WITH package - ,dependentPackage + WITH artifactName + ,packageName + ,dependentArtifactName + ,dependentPackageName ,forwardDependencies ,backwardDependencies ,size(forwardDependencies) AS numberOfForwardDependencies @@ -16,10 +22,12 @@ WHERE package <> dependentPackage ,size(forwardDependencies) + size(backwardDependencies) AS numberOfAllCyclicDependencies WHERE (size(forwardDependencies) > size(backwardDependencies) OR (size(forwardDependencies) = size(backwardDependencies) - AND size(package.fqn) >= size(dependentPackage.fqn))) + AND size(packageName) >= size(dependentPackageName))) UNWIND (backwardDependencies + forwardDependencies) AS dependency -RETURN package.fqn AS packageName - ,dependentPackage.fqn AS dependentPackageName +RETURN artifactName + ,packageName + ,dependentArtifactName + ,dependentPackageName ,dependency ,toFloat(ABS(numberOfForwardDependencies - numberOfBackwardDependencies)) / numberOfAllCyclicDependencies AS forwardToBackwardBalance ,numberOfForwardDependencies AS numberForward diff --git a/cypher/Cyclic_Dependencies/Cyclic_Dependencies_as_List.cypher b/cypher/Cyclic_Dependencies/Cyclic_Dependencies_Breakdown_Backward_Only.cypher similarity index 51% rename from cypher/Cyclic_Dependencies/Cyclic_Dependencies_as_List.cypher rename to cypher/Cyclic_Dependencies/Cyclic_Dependencies_Breakdown_Backward_Only.cypher index 4e09fa3b7..3b073399f 100644 --- a/cypher/Cyclic_Dependencies/Cyclic_Dependencies_as_List.cypher +++ b/cypher/Cyclic_Dependencies/Cyclic_Dependencies_Breakdown_Backward_Only.cypher @@ -1,14 +1,20 @@ -//Cyclic Dependencies as List +//Cyclic Dependencies Breakdown Backward Only MATCH 
(package:Package)-[:CONTAINS]->(forwardSource:Type)-[:DEPENDS_ON]->(forwardTarget:Type)<-[:CONTAINS]-(dependentPackage:Package) MATCH (dependentPackage)-[:CONTAINS]->(backwardSource:Type)-[:DEPENDS_ON]->(backwardTarget:Type)<-[:CONTAINS]-(package) -WHERE package <> dependentPackage - WITH package - ,dependentPackage +MATCH (artifact:Artifact)-[:CONTAINS]->(package) +MATCH (dependentArtifact:Artifact)-[:CONTAINS]->(dependentPackage) +WHERE package.fqn <> dependentPackage.fqn + WITH replace(last(split(artifact.fileName, '/')), '.jar', '') AS artifactName + ,package.fqn AS packageName + ,replace(last(split(dependentArtifact .fileName, '/')), '.jar', '') AS dependentArtifactName + ,dependentPackage.fqn AS dependentPackageName ,collect(DISTINCT forwardSource.name + '->' + forwardTarget.name) AS forwardDependencies - ,collect(DISTINCT backwardSource.name + '->' + backwardTarget.name) AS backwardDependencies - WITH package - ,dependentPackage + ,collect(DISTINCT backwardTarget.name + '<-' + backwardSource.name) AS backwardDependencies + WITH artifactName + ,packageName + ,dependentArtifactName + ,dependentPackageName ,forwardDependencies ,backwardDependencies ,size(forwardDependencies) AS numberOfForwardDependencies @@ -16,12 +22,14 @@ WHERE package <> dependentPackage ,size(forwardDependencies) + size(backwardDependencies) AS numberOfAllCyclicDependencies WHERE (size(forwardDependencies) > size(backwardDependencies) OR (size(forwardDependencies) = size(backwardDependencies) - AND size(package.fqn) >= size(dependentPackage.fqn))) -RETURN package.fqn AS packageName - ,dependentPackage.fqn AS dependentPackageName + AND size(packageName) >= size(dependentPackageName))) +UNWIND backwardDependencies AS dependency +RETURN artifactName + ,packageName + ,dependentArtifactName + ,dependentPackageName + ,dependency ,toFloat(ABS(numberOfForwardDependencies - numberOfBackwardDependencies)) / numberOfAllCyclicDependencies AS forwardToBackwardBalance ,numberOfForwardDependencies AS 
numberForward ,numberOfBackwardDependencies AS numberBackward - ,forwardDependencies - ,backwardDependencies ORDER BY forwardToBackwardBalance DESC, packageName ASC \ No newline at end of file diff --git a/cypher/Cyclic_Dependencies/Cyclic_Dependencies_as_Nodes.cypher b/cypher/Cyclic_Dependencies/Cyclic_Dependencies_as_Nodes.cypher new file mode 100644 index 000000000..da1945e8d --- /dev/null +++ b/cypher/Cyclic_Dependencies/Cyclic_Dependencies_as_Nodes.cypher @@ -0,0 +1,7 @@ +// Cyclic Dependencies +MATCH (package:Package)-[:CONTAINS]->(type:Type)-[:DEPENDS_ON]->(dependentType:Type)<-[:CONTAINS]-(dependentPackage:Package) +MATCH (dependentPackage)-[:CONTAINS]->(cycleType:Type)-[:DEPENDS_ON]->(cycleDependentType:Type)<-[:CONTAINS]-(package) +WHERE package <> dependentPackage +RETURN package, dependentPackage + ,type, dependentType, cycleType, cycleDependentType + LIMIT 100 \ No newline at end of file diff --git a/cypher/Internal_Dependencies/List_types_that_are_used_by_many_different_packages.cypher b/cypher/Internal_Dependencies/List_types_that_are_used_by_many_different_packages.cypher index e60011325..25c95565f 100644 --- a/cypher/Internal_Dependencies/List_types_that_are_used_by_many_different_packages.cypher +++ b/cypher/Internal_Dependencies/List_types_that_are_used_by_many_different_packages.cypher @@ -3,10 +3,10 @@ MATCH (artifact:Artifact)-[:CONTAINS]->(package:Package)-[:CONTAINS]->(type:Type)-[:DEPENDS_ON]->(dependentType:Type)<-[:CONTAINS]-(dependentPackage:Package)<-[:CONTAINS]-(dependentArtifact:Artifact) WHERE package <> dependentPackage WITH dependentType - ,labels(dependentType) AS dependentTypeLabels - ,COUNT(DISTINCT package) AS numberOfUsingPackages -RETURN dependentType.fqn - ,dependentType.name + ,labels(dependentType) AS dependentTypeLabels + ,COUNT(DISTINCT package.fqn) AS numberOfUsingPackages +RETURN dependentType.fqn AS fullQualifiedDependentTypeName + ,dependentType.name AS dependentTypeName ,dependentTypeLabels ,numberOfUsingPackages - 
ORDER BY numberOfUsingPackages DESC \ No newline at end of file + ORDER BY numberOfUsingPackages DESC, dependentTypeName ASC \ No newline at end of file diff --git a/cypher/List_all_existing_artifacts.cypher b/cypher/List_all_existing_artifacts.cypher index 375fcdd8c..b2ed60035 100644 --- a/cypher/List_all_existing_artifacts.cypher +++ b/cypher/List_all_existing_artifacts.cypher @@ -2,6 +2,8 @@ MATCH (artifact:Artifact:Archive)-[:CONTAINS]->(package:Package)-[:CONTAINS]->(type:Type) WITH last(split(artifact.fileName, '/')) AS artifactName - ,COUNT(DISTINCT package) AS packages - ,COUNT(DISTINCT type) AS types -RETURN artifactName, packages, types \ No newline at end of file + ,artifact.incomingDependencies AS incomingDependencies + ,artifact.outgoingDependencies AS outgoingDependencies + ,COUNT(DISTINCT package.fqn) AS packages + ,COUNT(DISTINCT type.fqn) AS types +RETURN artifactName, packages, types, incomingDependencies, outgoingDependencies \ No newline at end of file diff --git a/jupyter/InternalDependencies.ipynb b/jupyter/InternalDependencies.ipynb index 157079f29..6c851fc13 100644 --- a/jupyter/InternalDependencies.ipynb +++ b/jupyter/InternalDependencies.ipynb @@ -12,7 +12,7 @@ "### References\n", "- [Analyze java package metrics in a graph database](https://joht.github.io/johtizen/data/2023/04/21/java-package-metrics-analysis.html)\n", "- [Calculate metrics](https://101.jqassistant.org/calculate-metrics/index.html)\n", - "- [py2neo](https://py2neo.org/2021.1/)" + "- [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver/current)" ] }, { @@ -62,8 +62,11 @@ "metadata": {}, "outputs": [], "source": [ - "def query_cypher_to_data_frame(filename):\n", - " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename))\n", + "def query_cypher_to_data_frame(filename : str, limit: int = 10_000):\n", + " cypher_query_template = \"{query} LIMIT {row_limit}\"\n", + " cypher_query = get_cypher_query_from_file(filename)\n", + " cypher_query 
= cypher_query_template.format(query = cypher_query, row_limit = limit)\n", + " records, summary, keys = driver.execute_query(cypher_query)\n", " return pd.DataFrame([r.values() for r in records], columns=keys)" ] }, @@ -105,9 +108,10 @@ "source": [ "## Artifacts\n", "\n", - "### Table 1\n", + "List the artifacts this notebook is based on. Different sorting variations help finding artifacts by their features and support larger code bases where the list of all artifacts gets too long.\n", "\n", - "- List all the artifacts this notebook is based on" + "Only the top 30 entries are shown. The whole table can be found in the following CSV report: \n", + "`List_all_existing_artifacts`" ] }, { @@ -117,7 +121,167 @@ "metadata": {}, "outputs": [], "source": [ - "query_cypher_to_data_frame(\"../cypher/List_all_existing_artifacts.cypher\")" + "artifacts = query_cypher_to_data_frame(\"../cypher/List_all_existing_artifacts.cypher\")" + ] + }, + { + "cell_type": "markdown", + "id": "e48ead1b", + "metadata": {}, + "source": [ + "### Table 1a - Top 30 artifacts with the highest package count" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f822ddca", + "metadata": {}, + "outputs": [], + "source": [ + "# Sort by number of packages descending\n", + "artifacts.sort_values(by=['packages','artifactName'], ascending=[False, True]).reset_index(drop=True).head(30)" + ] + }, + { + "cell_type": "markdown", + "id": "9f4661b3", + "metadata": {}, + "source": [ + "### Table 1b - Top 30 artifacts with the highest type count" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61ed6b36", + "metadata": {}, + "outputs": [], + "source": [ + "# Sort by number of types descending\n", + "artifacts.sort_values(by=['types','artifactName'], ascending=[False, True]).reset_index(drop=True).head(30)" + ] + }, + { + "cell_type": "markdown", + "id": "1e508a21", + "metadata": {}, + "source": [ + "### Table 1c - Top 30 artifacts with the highest number of incoming 
dependencies\n", + "\n", + "The following table lists the top 30 artifacts that are used the most by other artifacts (highest count of incoming dependencies, highest in-degree)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "107c0a1a", + "metadata": {}, + "outputs": [], + "source": [ + "# Sort by number of incoming dependencies descending\n", + "artifacts.sort_values(by=['incomingDependencies','artifactName'], ascending=[False, True]).reset_index(drop=True).head(30)" + ] + }, + { + "cell_type": "markdown", + "id": "52593397", + "metadata": {}, + "source": [ + "### Table 1d - Top 30 artifacts with the highest number of outgoing dependencies\n", + "\n", + "The following table lists the top 30 artifacts that are depending on the highest number of other artifacts (highest count of outgoing dependencies, highest out-degree)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eee13bd8", + "metadata": {}, + "outputs": [], + "source": [ + "# Sort by number of outgoing dependencies descending\n", + "artifacts.sort_values(by=['outgoingDependencies','artifactName'], ascending=[False, True]).reset_index(drop=True).head(30)" + ] + }, + { + "cell_type": "markdown", + "id": "8c928800", + "metadata": {}, + "source": [ + "### Table 1e - Top 30 artifacts with the lowest package count" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18d73c15", + "metadata": {}, + "outputs": [], + "source": [ + "# Sort by number of packages ascending\n", + "artifacts.sort_values(by=['packages','artifactName'], ascending=[True, True]).reset_index(drop=True).head(30)" + ] + }, + { + "cell_type": "markdown", + "id": "4bdd5de0", + "metadata": {}, + "source": [ + "### Table 1f - Top 30 artifacts with the lowest type count" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0078e92e", + "metadata": {}, + "outputs": [], + "source": [ + "# Sort by number of types ascending\n", + 
"artifacts.sort_values(by=['types','artifactName'], ascending=[True, True]).reset_index(drop=True).head(30)" + ] + }, + { + "cell_type": "markdown", + "id": "9f628024", + "metadata": {}, + "source": [ + "### Table 1g - Top 30 artifacts with the lowest number of incoming dependencies\n", + "\n", + "The following table lists the top 30 artifacts that are used the least by other artifacts (lowest count of incoming dependencies, lowest in-degree)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6100d597", + "metadata": {}, + "outputs": [], + "source": [ + "# Sort by number of incoming dependencies ascending\n", + "artifacts.sort_values(by=['incomingDependencies','artifactName'], ascending=[True, True]).reset_index(drop=True).head(30)" + ] + }, + { + "cell_type": "markdown", + "id": "ad07b654", + "metadata": {}, + "source": [ + "### Table 1h - Top 30 artifacts with the lowest number of outgoing dependencies\n", + "\n", + "The following table lists the top 30 artifacts that are depending on the lowest number of other artifacts (lowest count of outgoing dependencies, lowest out-degree)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd096a38", + "metadata": {}, + "outputs": [], + "source": [ + "# Sort by number of outgoing dependencies ascending\n", + "artifacts.sort_values(by=['outgoingDependencies','artifactName'], ascending=[True, True]).reset_index(drop=True).head(30)" ] }, { @@ -129,10 +293,31 @@ "## Cyclic Dependencies\n", "\n", "Cyclic dependencies occur when one package uses a class of another package and vice versa. \n", - "These dependencies can lead to a lot of trouble when one of these packages needs to be changed.\n", + "These dependencies can lead to problems when one of these packages needs to be changed." 
+ ] + }, + { + "cell_type": "markdown", + "id": "9112f590", + "metadata": {}, + "source": [ + "### Table 2a - Cyclic Dependencies Overview\n", "\n", - "### Table 2\n", - "- List packages with cyclic dependencies as an overview" + "Show the top 40 cyclic dependencies sorted by the most promising to resolve first. This is done by calculating the number of forward dependencies (first cycle participant to second cycle participant) in relation to backward dependencies (second cycle participant back to first cycle participant). The higher this rate (approaching 1), the easier it should be to resolve the cycle by focussing on the few backward dependencies.\n", + "\n", + "Only the top 40 entries are shown. The whole table can be found in the following CSV report: \n", + "`Cyclic_Dependencies`\n", + "\n", + "**Columns:**\n", + "- *artifactName* identifies the artifact of the first participant of the cycle\n", + "- *packageName* identifies the package of the first participant of the cycle\n", + "- *dependentArtifactName* identifies the artifact of the second participant of the cycle\n", + "- *dependentPackageName* identifies the package of the second participant of the cycle\n", + "- *forwardToBackwardBalance* is between 0 and 1. 
High for many forward and few backward dependencies.\n", + "- *numberForward* contains the number of dependencies from the first participant of the cycle to the second one\n", + "- *numberBackward* contains the number of dependencies from the second participant of the cycle back to the first one\n", + "- *someForwardDependencies* lists some forward dependencies in the text format \"type1 -> type2\"\n", + "- *backwardDependencies* lists the backward dependencies in the format \"type1 <- type2\" that are recommended to get resolved" ] }, { @@ -144,7 +329,8 @@ }, "outputs": [], "source": [ - "query_cypher_to_data_frame(\"../cypher/Cyclic_Dependencies/Cyclic_Dependencies_as_List.cypher\")" + "cyclic_dependencies = query_cypher_to_data_frame(\"../cypher/Cyclic_Dependencies/Cyclic_Dependencies.cypher\")\n", + "cyclic_dependencies.head(40)" ] }, { @@ -153,8 +339,15 @@ "id": "02146c4d", "metadata": {}, "source": [ - "### Table 3\n", - "- List packages with cyclic dependencies with every dependency in a separate row sorted by the easiest and most valuable resolvable dependency first" + "### Table 2b - Cyclic Dependencies Break Down\n", + "\n", + "Lists packages with cyclic dependencies with every dependency in a separate row sorted by the most promising dependency first.\n", + "\n", + "Only the top 40 entries are shown. 
The whole table can be found in the following CSV report: \n", + "`Cyclic_Dependencies_Breakdown`\n", + "\n", + "**Columns in addition to Table 2a:**\n", + "- *dependency* shows the cycle dependency in the text format \"type1 -> type2\" (forward) or \"type2<-type1\" (backward)" ] }, { @@ -164,7 +357,32 @@ "metadata": {}, "outputs": [], "source": [ - "query_cypher_to_data_frame(\"../cypher/Cyclic_Dependencies/Cyclic_Dependencies_as_unwinded_List.cypher\").head(60)" + "cyclic_dependencies_breakdown = query_cypher_to_data_frame(\"../cypher/Cyclic_Dependencies/Cyclic_Dependencies_Breakdown.cypher\",limit=40)\n", + "cyclic_dependencies_breakdown" + ] + }, + { + "cell_type": "markdown", + "id": "52f46a62", + "metadata": {}, + "source": [ + "### Table 2c - Cyclic Dependencies Break Down - Backward Dependencies Only\n", + "\n", + "Lists packages with cyclic dependencies with every dependency in a separate row sorted by the most promising dependency first. This table only contains the backward dependencies from the second participant of the cycle back to the first one that are the most promising to resolve.\n", + "\n", + "Only the top 40 entries are shown. The whole table can be found in the following CSV report: \n", + "`Cyclic_Dependencies_Breakdown_Backward_Only`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61521c88", + "metadata": {}, + "outputs": [], + "source": [ + "cyclic_dependencies_breakdown_backward = query_cypher_to_data_frame(\"../cypher/Cyclic_Dependencies/Cyclic_Dependencies_Breakdown_Backward_Only.cypher\",limit=40)\n", + "cyclic_dependencies_breakdown_backward" ] }, { @@ -191,10 +409,17 @@ "\n", "If just one method of a type is used, especially in many places, then the result of this method can be used to call e.g. 
a method or constuct an object instead of using the whole object and then just calling that single method.\n", "\n", - "If there are a couple of methods that are used for a distinct purpose, those could be factored out into a separate interface. The original type can extended/implement the new interface so that there are no breaking changes. Then all the callers, that use only this group of methods, can be changed to the new interface.\n", + "If there are a couple of methods that are used for a distinct purpose, those could be factored out into a separate interface. The original type can extend/implement the new interface so that there are no breaking changes. Then all the callers, that use only this group of methods, can be changed to the new interface.\n" + ] + }, + { + "cell_type": "markdown", + "id": "a966e4c3", + "metadata": {}, + "source": [ + "### Table 4 - Top 40 most used combinations of methods\n", "\n", - "### Table 4\n", - "- List top 20 most used combinations of methods of larger Types that might benefit from *Interface Segregation*" + "The following table shows the top 40 most used combinations of methods of larger types that might benefit from applying the *Interface Segregation Principle*. The whole table can be found in the CSV report `Candidates_for_Interface_Segregation`." 
] }, { @@ -206,7 +431,8 @@ }, "outputs": [], "source": [ - "query_cypher_to_data_frame(\"../cypher/Candidates_for_Interface_Segregation.cypher\").head(20)" + "interface_segregation_candidates=query_cypher_to_data_frame(\"../cypher/Candidates_for_Interface_Segregation.cypher\", limit=40)\n", + "interface_segregation_candidates" ] }, { @@ -224,10 +450,9 @@ "id": "bb185191", "metadata": {}, "source": [ - "### Types that are used by multiple packages\n", + "### Table 5 - Types that are used by multiple packages\n", "\n", - "#### Table 5\n", - "- List the top 20 packages that are used by the highest count of different packages " + "This table shows the top 40 types that are used by the highest number of different packages. The whole table can be found in the CSV report `List_types_that_are_used_by_many_different_packages`.\n" ] }, { @@ -239,7 +464,8 @@ }, "outputs": [], "source": [ - "query_cypher_to_data_frame(\"../cypher/Internal_Dependencies/List_types_that_are_used_by_many_different_packages.cypher\").head(20)" + "types_used_by_many_packages=query_cypher_to_data_frame(\"../cypher/Internal_Dependencies/List_types_that_are_used_by_many_different_packages.cypher\", limit=40)\n", + "types_used_by_many_packages" ] }, { @@ -248,10 +474,10 @@ "id": "73dd1b13", "metadata": {}, "source": [ - "### Packages that are used by multiple artifacts\n", + "### Table 6 - Packages that are used by multiple artifacts\n", "\n", - "#### Table 6\n", - "- List the top 20 artifacts that only use a few (compared to all existing) packages of another artifact" + "This table shows the top 30 artifacts that only use a few (compared to all existing) packages of another artifact.\n", + "The whole table can be found in the CSV report `ArtifactPackageUsage`." 
] }, { @@ -263,7 +489,8 @@ }, "outputs": [], "source": [ - "query_cypher_to_data_frame(\"../cypher/Internal_Dependencies/How_many_packages_compared_to_all_existing_are_used_by_dependent_artifacts.cypher\").head(20)" + "used_packages_of_dependent_artifact=query_cypher_to_data_frame(\"../cypher/Internal_Dependencies/How_many_packages_compared_to_all_existing_are_used_by_dependent_artifacts.cypher\",limit=30)\n", + "used_packages_of_dependent_artifact" ] }, { @@ -272,10 +499,9 @@ "id": "d37ba7ae", "metadata": {}, "source": [ - "### Packages that are used by multiple artifacts\n", + "### Table 7 - Types that are used by multiple artifacts\n", "\n", - "#### Table 7\n", - "- List the top 20 packages that only use a few (compared to all existing) types of another package " + "This table shows the top 30 types that only use a few (compared to all existing) types of another artifact. The whole table can be found in the CSV report `ClassesPerPackageUsageAcrossArtifacts`." ] }, { @@ -287,7 +513,8 @@ }, "outputs": [], "source": [ - "query_cypher_to_data_frame(\"../cypher/Internal_Dependencies/How_many_classes_compared_to_all_existing_in_the_same_package_are_used_by_dependent_packages_across_different_artifacts.cypher\").head(20)" + "used_types_of_dependent_artifact=query_cypher_to_data_frame(\"../cypher/Internal_Dependencies/How_many_classes_compared_to_all_existing_in_the_same_package_are_used_by_dependent_packages_across_different_artifacts.cypher\", limit=30)\n", + "used_types_of_dependent_artifact" ] } ], diff --git a/scripts/reports/InternalDependenciesCsv.sh b/scripts/reports/InternalDependenciesCsv.sh index b933d6e3a..09031e1c0 100755 --- a/scripts/reports/InternalDependenciesCsv.sh +++ b/scripts/reports/InternalDependenciesCsv.sh @@ -36,8 +36,9 @@ mkdir -p "${FULL_REPORT_DIRECTORY}" CYCLIC_DEPENDENCIES_CYPHER_DIR="${CYPHER_DIR}/Cyclic_Dependencies" INTERNAL_DEPENDENCIES_CYPHER_DIR="${CYPHER_DIR}/Internal_Dependencies" -execute_cypher 
"${CYCLIC_DEPENDENCIES_CYPHER_DIR}/Cyclic_Dependencies_as_List.cypher" > "${FULL_REPORT_DIRECTORY}/CyclicDependencies.csv" -execute_cypher "${CYCLIC_DEPENDENCIES_CYPHER_DIR}/Cyclic_Dependencies_as_unwinded_List.cypher" > "${FULL_REPORT_DIRECTORY}/CyclicDependenciesUnwinded.csv" +execute_cypher "${CYCLIC_DEPENDENCIES_CYPHER_DIR}/Cyclic_Dependencies.cypher" > "${FULL_REPORT_DIRECTORY}/Cyclic_Dependencies.csv" +execute_cypher "${CYCLIC_DEPENDENCIES_CYPHER_DIR}/Cyclic_Dependencies_Breakdown.cypher" > "${FULL_REPORT_DIRECTORY}/Cyclic_Dependencies_Breakdown.csv" +execute_cypher "${CYCLIC_DEPENDENCIES_CYPHER_DIR}/Cyclic_Dependencies_Breakdown_Backward_Only.cypher" > "${FULL_REPORT_DIRECTORY}/Cyclic_Dependencies_Breakdown_Backward_Only.csv" execute_cypher "${CYCLIC_DEPENDENCIES_CYPHER_DIR}/Cyclic_Dependencies_between_Artrifacts_as_unwinded_List.cypher" > "${FULL_REPORT_DIRECTORY}/CyclicArtifactDependenciesUnwinded.csv" execute_cypher "${CYPHER_DIR}/Candidates_for_Interface_Segregation.cypher" > "${FULL_REPORT_DIRECTORY}/InterfaceSegregationCandidates.csv" From 27fe9164d21755e9c0f28d1b67b1169697b5b627 Mon Sep 17 00:00:00 2001 From: JohT Date: Tue, 5 Sep 2023 08:40:21 +0200 Subject: [PATCH 11/15] Optimize object oriented metrics report --- jupyter/ObjectOrientedDesignMetrics.ipynb | 122 +++++++++++++--------- 1 file changed, 72 insertions(+), 50 deletions(-) diff --git a/jupyter/ObjectOrientedDesignMetrics.ipynb b/jupyter/ObjectOrientedDesignMetrics.ipynb index d80402ee0..4569ae90d 100644 --- a/jupyter/ObjectOrientedDesignMetrics.ipynb +++ b/jupyter/ObjectOrientedDesignMetrics.ipynb @@ -15,7 +15,7 @@ "- [jqassistant](https://jqassistant.org)\n", "- [notebook walks through examples for integrating various packages with Neo4j](https://nicolewhite.github.io/neo4j-jupyter/hello-world.html)\n", "- [OO Design Quality Metrics](https://api.semanticscholar.org/CorpusID:18246616)\n", - "- [py2neo](https://py2neo.org/2021.1/)" + "- [Neo4j Python 
Driver](https://neo4j.com/docs/api/python-driver/current)" ] }, { @@ -100,29 +100,6 @@ "" ] }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "91d80bf7", - "metadata": {}, - "source": [ - "## Artifacts\n", - "\n", - "#### Table 1\n", - "\n", - "- List all the artifacts this notebook is based on" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dc682db6", - "metadata": {}, - "outputs": [], - "source": [ - "query_cypher_to_data_frame(\"../cypher/List_all_existing_artifacts.cypher\")" - ] - }, { "attachments": {}, "cell_type": "markdown", @@ -256,7 +233,7 @@ "\n", "#### Table 6\n", "\n", - "- Show the top 20 packages with the highest distance from the \"main sequence\"" + "- Show the top 30 packages with the highest distance from the \"main sequence\"" ] }, { @@ -267,7 +244,7 @@ "outputs": [], "source": [ "instabilityPerAbstractness = query_cypher_to_data_frame(\"../cypher/Metrics/Calculate_distance_between_abstractness_and_instability.cypher\")\n", - "instabilityPerAbstractness.head(20)" + "instabilityPerAbstractness.head(30)" ] }, { @@ -295,14 +272,24 @@ "# Function that returns the number of past (index smaller than given index) rows \n", "# with the same value in columnName1 and columnName2\n", "# If there was a row with the same columnName1 and columnName2 values\n", - "def countPastEntriesWithSameValues(dataFrame, index, columnName1, columnName2):\n", - " columnValue1 = dataFrame[columnName1][index]\n", - " columnValue2 = dataFrame[columnName2][index]\n", - " return len(dataFrame[\n", - " (dataFrame.index.isin(range(0, index + 1))) & \n", - " (dataFrame[columnName1]==columnValue1) & \n", - " (dataFrame[columnName2]==columnValue2)\n", - " ]) - 1" + "# def countPastEntriesWithSameValues(dataFrame, index, columnName1, columnName2):\n", + "# columnValue1 = dataFrame[columnName1][index]\n", + "# columnValue2 = dataFrame[columnName2][index]\n", + "# return len(dataFrame[\n", + "# (dataFrame.index.isin(range(0, index + 1))) & \n", + 
"# (dataFrame[columnName1]==columnValue1) & \n", + "# (dataFrame[columnName2]==columnValue2)\n", + "# ]) - 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36d8cf50", + "metadata": {}, + "outputs": [], + "source": [ + "instabilityPerAbstractness.packageName[0]" ] }, { @@ -312,6 +299,37 @@ "metadata": {}, "outputs": [], "source": [ + "def annotate_plot(data_frame: pd.DataFrame, index: int):\n", + " \"\"\"\n", + " Annotates the data points identified by the \"index\" in the plot of the \"data_frame\" \n", + " \"\"\"\n", + " x_position = data_frame.abstractness[index].item()\n", + " y_position = data_frame.instability[index].item()\n", + " artifact_name = data_frame.artifactName[index].item()\n", + " package_name = data_frame.packageName[index].item()\n", + "\n", + " label_box=dict(boxstyle=\"round4,pad=0.5\", fc=\"w\", alpha=0.8)\n", + " plot.annotate(artifact_name + '\\n' + package_name\n", + " ,xy=(x_position, y_position)\n", + " ,xycoords='data'\n", + " ,xytext=(20, 0)\n", + " ,textcoords='offset points'\n", + " ,size=6\n", + " ,bbox=label_box\n", + " ,arrowprops=dict(arrowstyle=\"-|>\", mutation_scale=10, color=\"black\")\n", + " )\n", + "\n", + "def index_of_sorted(data_frame: pd.DataFrame, highest: list[str] = []):\n", + " \"\"\"\n", + " Sorts the \"data_frame\" by columns 'abstractness','instability','typesInPackage', 'artifactName'\n", + " and returns the index of the first row.\n", + " Columns that are contained in the list of strings parameter \"highest\" will be sorted descending.\n", + " \"\"\"\n", + " by = ['abstractness','instability','typesInPackage', 'artifactName']\n", + " ascending = [('abstractness' not in highest), ('instability' not in highest), False, True]\n", + " return data_frame.sort_values(by=by, ascending=ascending).head(1).index\n", + "\n", + "\n", "# data points scaled by the number of types and colored by the distance to the \"main sequence\"\n", "plot.scatter(\n", " instabilityPerAbstractness.abstractness, # x 
axis shows abstractness\n", @@ -323,22 +341,26 @@ "# green \"main sequence\" line\n", "plot.plot([0,1], [1,0], c='lightgreen', linestyle='dashed') \n", "\n", - "# add the packagenames to the those with the 15 highest distance values\n", - "distanceAnnotationThreshold = instabilityPerAbstractness.distance.nlargest(15).iloc[-1]\n", - "# (variant) highest 15% (quantile) of all distance values\n", - "# distanceAnnotationThreshold = instabilityPerAbstractness.distance.quantile(0.85)\n", - "for i, name in enumerate(instabilityPerAbstractness.packageName):\n", - " if (instabilityPerAbstractness.distance[i] >= distanceAnnotationThreshold):\n", - " x_position = instabilityPerAbstractness.abstractness[i]\n", - " y_position = instabilityPerAbstractness.instability[i]\n", - " # To overcome overlapping text annotations for multiple data points on the same position, \n", - " # entries with same position values in the past indizes are count and used to offset the y-position\n", - " # so that multiple names are written underneath each other.\n", - " alreadyExistingPositions = countPastEntriesWithSameValues(instabilityPerAbstractness, i, 'abstractness', 'instability')\n", - " y_position = y_position - alreadyExistingPositions / len(instabilityPerAbstractness) * 2\n", - " \n", - " plot.annotate(name, (x_position, y_position), size=6)\n", - " \n", + "# Annotate largest package with the highest abstractness and instability\n", + "annotation_index = index_of_sorted(highest=['abstractness','instability'], data_frame=instabilityPerAbstractness)\n", + "annotate_plot(instabilityPerAbstractness, annotation_index)\n", + "\n", + "# Annotate largest package with the lowest abstractness and highest instability\n", + "annotation_index = index_of_sorted(highest=['instability'], data_frame=instabilityPerAbstractness)\n", + "annotate_plot(instabilityPerAbstractness, annotation_index)\n", + "\n", + "# Annotate largest package with the lowest abstractness and lowest instability\n", + 
"annotation_index = index_of_sorted(highest=[], data_frame=instabilityPerAbstractness)\n", + "annotate_plot(instabilityPerAbstractness, annotation_index)\n", + "\n", + "# Annotate largest package with the highest abstractness and lowest instability\n", + "annotation_index = index_of_sorted(highest=['abstractness'], data_frame=instabilityPerAbstractness)\n", + "annotate_plot(instabilityPerAbstractness, annotation_index)\n", + "\n", + "# Annotate largest packages with the highest abstractness and instability near 0.5% \n", + "annotation_index = index_of_sorted(highest=['abstractness', 'instability'], data_frame=instabilityPerAbstractness.query('abstractness <= 0.5 & instability <= 0.5'))\n", + "annotate_plot(instabilityPerAbstractness, annotation_index)\n", + "\n", "plot.title('Abstractness vs. Instability (\"Main Sequence\")')\n", "plot.xlabel('Abstractness')\n", "plot.ylabel('Instability')\n", From 525a53287c8faa096a7ad7ebfb1479da3535586b Mon Sep 17 00:00:00 2001 From: JohT Date: Thu, 7 Sep 2023 08:27:32 +0200 Subject: [PATCH 12/15] Optimize overview reports --- ...ve_lines_of_method_code_per_package.cypher | 12 +- .../Number_of_types_per_artifact.cypher | 14 +- cypher/Overview/Overview_size.cypher | 44 ++ jupyter/Overview.ipynb | 475 ++++++++++++++---- scripts/reports/OverviewCsv.sh | 1 + 5 files changed, 431 insertions(+), 115 deletions(-) create mode 100644 cypher/Overview/Overview_size.cypher diff --git a/cypher/Overview/Effective_lines_of_method_code_per_package.cypher b/cypher/Overview/Effective_lines_of_method_code_per_package.cypher index 8ee6739f9..423e82bd1 100644 --- a/cypher/Overview/Effective_lines_of_method_code_per_package.cypher +++ b/cypher/Overview/Effective_lines_of_method_code_per_package.cypher @@ -8,6 +8,7 @@ ,package.fqn AS fullPackageName ,package.name AS packageName ,sum(method.effectiveLineCount) AS sumEffectiveLinesOfMethodCode + ,sum(method.cyclomaticComplexity) AS sumCyclomaticComplexity ,COUNT(DISTINCT method) AS numberOfMethods 
,reduce( // Get the max effectiveLineCount of all methods in the package with the name and type of the method loc = {max:-1}, // initial object with max lines of code = -1 @@ -16,7 +17,7 @@ THEN {max: m.method.effectiveLineCount, method: m.method, type: m.type} // then update the object ELSE loc // otherwise keep the object as it was END - ) AS methodWithMaxLoc + ) AS methodWithMaxLinesOfCode ,reduce( // Get the max cyclomaticComplexity of all methods in the package with the name and type of the method cmplx = {max:-1}, // initial object with max cyclomatic complexity = -1 m IN collect({method:method, type:type}) | // collect all methods and their types as objects @@ -27,12 +28,13 @@ ) AS methodWithMaxCyclomaticComplexity RETURN artifactName, fullPackageName ,sumEffectiveLinesOfMethodCode AS linesInPackage + ,sumCyclomaticComplexity AS complexityInPackage ,numberOfMethods AS methodCount - ,methodWithMaxLoc.max AS maxLinesMethod - ,methodWithMaxLoc.type.name AS maxLinesMethodType - ,methodWithMaxLoc.method.name AS maxLinesMethodName + ,methodWithMaxLinesOfCode.max AS maxLinesMethod + ,methodWithMaxLinesOfCode.type.name AS maxLinesMethodType + ,methodWithMaxLinesOfCode.method.name AS maxLinesMethodName ,methodWithMaxCyclomaticComplexity.max AS maxComplexity ,methodWithMaxCyclomaticComplexity.type.name AS maxComplexityType ,methodWithMaxCyclomaticComplexity.method.name AS maxComplexityMethod ,packageName -ORDER BY sumEffectiveLinesOfMethodCode DESC, artifactName ASC, fullPackageName \ No newline at end of file +ORDER BY linesInPackage DESC, artifactName ASC, fullPackageName \ No newline at end of file diff --git a/cypher/Overview/Number_of_types_per_artifact.cypher b/cypher/Overview/Number_of_types_per_artifact.cypher index 0a020d9a6..eecbd07a9 100644 --- a/cypher/Overview/Number_of_types_per_artifact.cypher +++ b/cypher/Overview/Number_of_types_per_artifact.cypher @@ -2,11 +2,21 @@ MATCH (artifact:Artifact)-[:CONTAINS]->(type:Type) WITH 
replace(last(split(artifact.fileName, '/')), '.jar', '') AS artifactName + ,count(DISTINCT type.fqn) AS numberOfArtifactTypes + ,collect(DISTINCT type) AS types +UNWIND types AS type + WITH artifactName + ,numberOfArtifactTypes ,type ,labels(type) AS typeLabels UNWIND typeLabels AS typeLabel - WITH artifactName, type, typeLabel + WITH artifactName + ,numberOfArtifactTypes + ,type + ,typeLabel WHERE typeLabel IN ['Class', 'Interface', 'Annotation', 'Enum'] RETURN artifactName + ,numberOfArtifactTypes ,typeLabel AS languageElement - ,count(type) AS numberOfTypes \ No newline at end of file + ,count(type) AS numberOfTypes + ORDER BY numberOfArtifactTypes DESC, artifactName ASC \ No newline at end of file diff --git a/cypher/Overview/Overview_size.cypher b/cypher/Overview/Overview_size.cypher new file mode 100644 index 000000000..22e6e5b71 --- /dev/null +++ b/cypher/Overview/Overview_size.cypher @@ -0,0 +1,44 @@ +// Overview size + + MATCH (n) + WITH COUNT(n) AS nodeCount + MATCH ()-[]->() + WITH nodeCount + ,count(*) AS relationshipCount + MATCH (a:Artifact:Archive) + WITH nodeCount + ,relationshipCount + ,count(DISTINCT a.fileName) AS artifactCount + MATCH (p:Package) + WITH nodeCount + ,relationshipCount + ,artifactCount + ,count(DISTINCT p.fqn) AS packageCount + MATCH (t:Type) + WITH nodeCount + ,relationshipCount + ,artifactCount + ,packageCount + ,count(DISTINCT t.fqn) AS typeCount + MATCH (m:Method) + WITH nodeCount + ,relationshipCount + ,artifactCount + ,packageCount + ,typeCount + ,count(DISTINCT m.signature) AS methodCount + MATCH (member:Member) + WITH nodeCount + ,relationshipCount + ,artifactCount + ,packageCount + ,typeCount + ,methodCount + ,count(DISTINCT member.signature) AS memberCount +RETURN nodeCount + ,relationshipCount + ,artifactCount + ,packageCount + ,typeCount + ,methodCount + ,memberCount \ No newline at end of file diff --git a/jupyter/Overview.ipynb b/jupyter/Overview.ipynb index de0be338d..9b6433b05 100644 --- a/jupyter/Overview.ipynb 
+++ b/jupyter/Overview.ipynb @@ -107,6 +107,33 @@ "main_color_map = 'nipy_spectral'" ] }, + { + "cell_type": "markdown", + "id": "0c68aa20", + "metadata": {}, + "source": [ + "## Overview" + ] + }, + { + "cell_type": "markdown", + "id": "8333d13e", + "metadata": {}, + "source": [ + "### Table 1 - Size" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4ddd4d4", + "metadata": {}, + "outputs": [], + "source": [ + "overview_size = query_cypher_to_data_frame(\"../cypher/Overview/Overview_size.cypher\")\n", + "overview_size" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -122,7 +149,10 @@ "id": "e02f924a", "metadata": {}, "source": [ - "### Table 1 - Types per artifact" + "### Table 2a - Largest 30 types per artifact\n", + "\n", + "This table shows the largest (number of types) artifacts and their kind of types (Class, Interface, Enum, Annotation).\n", + "The whole table can be found in the CSV report `Number_of_types_per_artifact`." ] }, { @@ -133,7 +163,7 @@ "outputs": [], "source": [ "types_per_artifact = query_cypher_to_data_frame(\"../cypher/Overview/Number_of_types_per_artifact.cypher\")\n", - "types_per_artifact" + "types_per_artifact.head(30)" ] }, { @@ -142,7 +172,11 @@ "id": "b44c8a75", "metadata": {}, "source": [ - "### Table 2 - Types per artifact (grouped)" + "### Table 2b - Largest 30 types per artifact grouped\n", + "\n", + "This table shows the largest (number of types) artifacts each in one row, their kind of types in columns and the count of them as values.\n", + "\n", + "The source data for this aggregated table can be found in the CSV report `Number_of_types_per_artifact`." 
] }, { @@ -175,7 +209,15 @@ "types_per_artifact_grouped = types_per_artifact_grouped[column_sum.sort_values(ascending=False).index[:]]\n", "\n", "# Convert to integer\n", - "types_per_artifact_grouped.astype(int)" + "types_per_artifact_grouped.astype(int).head(30)" + ] + }, + { + "cell_type": "markdown", + "id": "cbe350d7", + "metadata": {}, + "source": [ + "### Table 2b Chart 1 - 30 largest artifacts and their types stacked" ] }, { @@ -186,86 +228,137 @@ "outputs": [], "source": [ "plot.figure();\n", - "types_per_artifact_grouped.plot(\n", + "types_per_artifact_grouped.head(30).plot(\n", " kind='bar', \n", - " title='Types per Artifact',\n", + " title='Top 30 types per artifact',\n", " xlabel='Artifact',\n", " ylabel='Types',\n", " stacked=True, \n", - " cmap=main_color_map\n", + " cmap=main_color_map,\n", + " figsize=(8, 5)\n", ")\n", "plot.show()" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "62268321", + "metadata": {}, + "source": [ + "### Table 2c - Largest 30 types per artifact (grouped and normalized in %)" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "ec514eba", + "id": "ed887815", "metadata": {}, "outputs": [], "source": [ - "# (Optional) Plot \"Types per Artifact\" and the normalized variation side by side\n", - "# plot.figure();\n", - "# fig, (axis_left, axis_right) = plot.subplots(nrows=1, ncols=2)\n", - "# types_per_artifact_grouped.plot(\n", - "# ax=axis_left,\n", - "# kind='bar', \n", - "# title='Types per Artifact',\n", - "# xlabel='Artifact',\n", - "# ylabel='Types',\n", - "# stacked=True, \n", - "# cmap=main_color_map\n", - "# )\n", - "# types_per_artifact_grouped_normalized.plot(\n", - "# ax=axis_right,\n", - "# kind='bar', \n", - "# title='Types per Artifact [%]',\n", - "# xlabel='Artifact',\n", - "# ylabel='Types %',\n", - "# stacked=True, \n", - "# cmap=main_color_map\n", - "# )\n", - "# plot.show()" + "# Divide every value by the sum of the row to get horizontal normalized values.\n", + "# This makes 
it easier to compare the \"language element\" usage without taking the size of the artifact into account\n", + "types_per_artifact_grouped_normalized = types_per_artifact_grouped.div(types_per_artifact_grouped.sum(axis=1), axis=0).multiply(100)\n", + "types_per_artifact_grouped_normalized.head(30)" ] }, { - "attachments": {}, "cell_type": "markdown", - "id": "62268321", + "id": "47befd1d", "metadata": {}, "source": [ - "### Table 3 - Types per artifact (grouped and normalized in %)" + "### Table 2c Chart 1 - Top 30 artifacts with the highest relative amount of classes in %" ] }, { "cell_type": "code", "execution_count": null, - "id": "ed887815", + "id": "a0e1b7bc", "metadata": {}, "outputs": [], "source": [ - "# Divide every value by the sum of the row to get horizontal normalized values.\n", - "# This makes it easier to compare the \"language element\" usage without taking the size of the artifact into account\n", - "types_per_artifact_grouped_normalized = types_per_artifact_grouped.div(types_per_artifact_grouped.sum(axis=1), axis=0).multiply(100)\n", - "types_per_artifact_grouped_normalized" + "types_per_artifact_sorted_by_classes=types_per_artifact_grouped_normalized.sort_values(by='Class', ascending=False)\n", + "\n", + "plot.figure();\n", + "types_per_artifact_sorted_by_classes.head(30).plot(kind='bar', stacked=True, cmap=main_color_map, figsize=(8, 5))\n", + "plot.xlabel('Artifact')\n", + "plot.ylabel('Types %')\n", + "plot.title('Class types [%] per artifact')\n", + "plot.show()" + ] + }, + { + "cell_type": "markdown", + "id": "652b8526", + "metadata": {}, + "source": [ + "### Table 2c Chart 2 - Top 30 artifacts with the highest relative amount of interfaces in %" ] }, { "cell_type": "code", "execution_count": null, - "id": "a0e1b7bc", + "id": "ccb4d386", "metadata": {}, "outputs": [], "source": [ - "# Divide every value by the sum of the row to get horizontal normalized values.\n", - "# This makes it easier to compare the \"language element\" usage without 
taking the size of the artifact into account\n", - "types_per_artifact_grouped_normalized = types_per_artifact_grouped.div(types_per_artifact_grouped.sum(axis=1), axis=0).multiply(100)\n", + "types_per_artifact_sorted_by_interfaces=types_per_artifact_grouped_normalized.sort_values(by='Interface', ascending=False)\n", + "\n", + "plot.figure();\n", + "types_per_artifact_sorted_by_interfaces.head(30).plot(kind='bar', stacked=True, cmap=main_color_map, figsize=(8, 5))\n", + "plot.xlabel('Artifact')\n", + "plot.ylabel('Types %')\n", + "plot.title('Interface types [%] per artifact')\n", + "plot.show()" + ] + }, + { + "cell_type": "markdown", + "id": "a6103211", + "metadata": {}, + "source": [ + "### Table 2c Chart 3 - Top 30 artifacts with the highest relative amount of enums in %" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd70980d", + "metadata": {}, + "outputs": [], + "source": [ + "types_per_artifact_sorted_by_enums=types_per_artifact_grouped_normalized.sort_values(by='Enum', ascending=False)\n", "\n", "plot.figure();\n", - "types_per_artifact_grouped_normalized.plot(kind='bar', stacked=True, cmap=main_color_map)\n", + "types_per_artifact_sorted_by_enums.head(30).plot(kind='bar', stacked=True, cmap=main_color_map, figsize=(8, 5))\n", "plot.xlabel('Artifact')\n", "plot.ylabel('Types %')\n", - "plot.title('Types [%] per Artifact')\n", + "plot.title('Enum types [%] per artifact')\n", + "plot.show()" + ] + }, + { + "cell_type": "markdown", + "id": "17cf05c7", + "metadata": {}, + "source": [ + "### Table 2c Chart 4 - Top 30 artifacts with the highest relative amount of annotations in %" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1054634e", + "metadata": {}, + "outputs": [], + "source": [ + "types_per_artifact_sorted_by_annotations=types_per_artifact_grouped_normalized.sort_values(by='Annotation', ascending=False)\n", + "\n", + "plot.figure();\n", + "types_per_artifact_sorted_by_annotations.head(30).plot(kind='bar', 
stacked=True, cmap=main_color_map, figsize=(8, 5))\n", + "plot.xlabel('Artifact')\n", + "plot.ylabel('Types %')\n", + "plot.title('Annotation types [%] per artifact')\n", "plot.show()" ] }, @@ -275,7 +368,9 @@ "id": "85535e4f", "metadata": {}, "source": [ - "### Table 4 - Number of packages per artifact" + "### Table 3 - Top 30 artifacts with the highest package count\n", + "\n", + "The whole table can be found in the CSV report `Number_of_packages_per_artifact`." ] }, { @@ -288,12 +383,60 @@ "packages_per_artifact = query_cypher_to_data_frame(\"../cypher/Overview/Number_of_packages_per_artifact.cypher\")\n", "\n", "# Sort the DataFrame by the sum of values\n", - "types_per_artifact_sorted = packages_per_artifact.sort_values(by='numberOfPackages', ascending=False)\n", + "types_per_artifact_sorted = packages_per_artifact.sort_values(by='numberOfPackages', ascending=False).reset_index(drop=True)\n", "\n", "# Set the name of the index to artifactName\n", - "types_per_artifact_sorted.set_index('artifactName', inplace=True)\n", + "#types_per_artifact_sorted.set_index('artifactName', inplace=True)\n", + "\n", + "types_per_artifact_sorted.head(30)" + ] + }, + { + "cell_type": "markdown", + "id": "18227c68", + "metadata": {}, + "source": [ + "### Table 3 Chart 1 - Number of packages per artifact\n", + "\n", + "The following chart shows artifacts with the largest package count in percentage. Artifacts with less than 0.7% package count are grouped into \"others\" to focus on the most significant artifacts regarding their package count."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "102710a5", + "metadata": {}, + "outputs": [], + "source": [ + "def group_to_others_below_threshold(data_frame : pd.DataFrame, value_column : str, name_column: str, threshold: float) -> pd.DataFrame: \n", + " \"\"\"Adds a new percentage column for the value column and \n", + " groups all values below the given threshold to \"others\" in the name column.\n", + "\n", + " Parameters:\n", + " - data_frame (pd.DataFrame): Input pandas DataFrame\n", + " - value_column (str): Name of the column that contains the numeric value\n", + " - name_column (str): Name of the column that contains the group name that will be replaced by \"others\" for small values\n", + " - threshold (float): Threshold in % that is used to group values below it into the \"others\" group\n", + "\n", + " Returns:\n", + " int:Returning value\n", + "\n", + " \"\"\"\n", + " result_data_frame = data_frame.copy();\n", "\n", - "types_per_artifact_sorted" + " percent_column_name = value_column + 'Percent';\n", + "\n", + " # Add column with the name given in \"percent_column_name\" with the percentage of the value column.\n", + " result_data_frame[percent_column_name] = result_data_frame[value_column] / result_data_frame[value_column].sum() * 100.0;\n", + "\n", + " # Change the external package name to \"others\" if it is called less than the specified threshold\n", + " result_data_frame.loc[result_data_frame[percent_column_name] < threshold, name_column] = 'others';\n", + "\n", + " # Group external package name (foremost the new \"others\" entries) and sum their percentage\n", + " result_data_frame = result_data_frame.groupby(name_column)[percent_column_name].sum();\n", + "\n", + " # Sort by values descending\n", + " return result_data_frame.sort_values(ascending=False);" ] }, { @@ -303,8 +446,26 @@ "metadata": {}, "outputs": [], "source": [ + "types_per_artifact_sorted_significant=group_to_others_below_threshold(\n", + " 
data_frame=types_per_artifact_sorted,\n", + " value_column='numberOfPackages',\n", + " name_column='artifactName',\n", + " threshold= 0.7\n", + ");\n", + "\n", "plot.figure();\n", - "types_per_artifact_sorted.plot(y='numberOfPackages', kind='pie', title='Packages per Artifact', labeldistance=None, cmap=main_color_map)\n", + "types_per_artifact_sorted_significant.plot(\n", + " y='numberOfPackages', \n", + " kind='pie', \n", + " title='Number of packages per artifact', \n", + " legend=True,\n", + " labeldistance=None,\n", + " autopct='%1.2f%%',\n", + " textprops={'fontsize': 5},\n", + " pctdistance=1.2,\n", + " cmap=main_color_map\n", + ")\n", + "plot.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", "plot.show()" ] }, @@ -323,9 +484,13 @@ "id": "5c49be47", "metadata": {}, "source": [ - "### Table 5 - Effective method line count distribution\n", + "### Table 3a - Effective method line count distribution\n", + "\n", + "This table shows the distribution of the effective method line count per artifact.\n", + "For each artifact the number of methods with effective line count = 1,2,3,... is shown to get an overview of how line counts are distributed over methods.\n", + "\n", + "Only the 15 artifacts with the highest method count and their effective method line count distribution (limited by 40)is shown here. The whole table can be found in the CSV report `Effective_Method_Line_Count_Distribution`.\n", "\n", - "The table shown here only includes the first 10 rows which typically represents the most significant entries.\n", "Have a look below to find out which packages and methods have the highest effective lines of code." 
] }, @@ -336,6 +501,8 @@ "metadata": {}, "outputs": [], "source": [ + "effective_method_line_count_distribution_max_artifacts=20\n", + "\n", "effective_method_line_count_distribution=query_cypher_to_data_frame(\"../cypher/Overview/Effective_Method_Line_Count_Distribution.cypher\")\n", "effective_method_line_count_distribution=effective_method_line_count_distribution.pivot(index='effectiveLineCount', columns='artifactName', values='methods')\n", "\n", @@ -345,7 +512,11 @@ "# Convert to integer\n", "effective_method_line_count_distribution=effective_method_line_count_distribution.astype(int)\n", "\n", - "effective_method_line_count_distribution.head(10)" + "# Sort by column sum and then take only the first 10 columns\n", + "effective_method_line_count_sum_per_artifact = effective_method_line_count_distribution.sum()\n", + "effective_method_line_count_distribution = effective_method_line_count_distribution[effective_method_line_count_sum_per_artifact.sort_values(ascending=False).index[:effective_method_line_count_distribution_max_artifacts]]\n", + "\n", + "effective_method_line_count_distribution.head(40)" ] }, { @@ -354,9 +525,9 @@ "id": "69f80f6d", "metadata": {}, "source": [ - "### Table 6 - Effective method line count distribution (normalized)\n", + "### Table 3b - Effective method line count distribution (normalized)\n", "\n", - "The table shown here only includes the first 10 rows which typically represents the most significant entries.\n", + "The table shown here only includes the first 40 rows which typically represents the most significant entries.\n", "Have a look below to find out which packages and methods have the highest effective lines of code." 
] }, @@ -369,7 +540,15 @@ "source": [ "# Divide every value by the sum of all values in the same column to get vertical normalized values.\n", "effective_method_line_count_distribution_normalized = effective_method_line_count_distribution.div(effective_method_line_count_distribution.sum(axis=0), axis=1).multiply(100)\n", - "effective_method_line_count_distribution_normalized.head(10)" + "effective_method_line_count_distribution_normalized.head(40)" + ] + }, + { + "cell_type": "markdown", + "id": "2e75ff54", + "metadata": {}, + "source": [ + "### Table 3b Chart 1 - Effective method line count distribution (normalized)" ] }, { @@ -380,31 +559,113 @@ "outputs": [], "source": [ "plot.figure();\n", - "method_line_count_x_ticks=range(1,11)\n", - "axes = effective_method_line_count_distribution_normalized.plot(\n", - " kind='line', \n", + "method_line_count_x_ticks=range(1,20)\n", + "axes = effective_method_line_count_distribution_normalized.head(20).plot(\n", + " kind='line',\n", " logx=True,\n", " grid=True,\n", - " xlim=[1,20],\n", + " xlim=[2, 20],\n", + " ylim=[0, 20],\n", " xticks=method_line_count_x_ticks,\n", " title='Effective Method Line Count Distribution', \n", " xlabel='effective line count',\n", - " ylabel='number of methods',\n", + " ylabel='percent of methods',\n", " cmap=main_color_map,\n", + " figsize=(10, 6),\n", + " lw=2,\n", ")\n", "axes.set_xticklabels(method_line_count_x_ticks)\n", + "axes.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", "plot.show()" ] }, + { + "cell_type": "markdown", + "id": "ff1748a3", + "metadata": {}, + "source": [ + "### Table 3c - Top 30 packages with highest effective line counts\n", + "\n", + "The following table shows the top 30 packages with the highest effective lines of code. The whole table can be found in the CSV report `Effective_lines_of_method_code_per_package`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac2bd9c2", + "metadata": {}, + "outputs": [], + "source": [ + "# Query artifacts and packages and find the method with the highest effective lines of code and cyclomatic complexity\n", + "# The result of this query will be also used further below. \n", + "method_statistics_per_package = query_cypher_to_data_frame(\"../cypher/Overview/Effective_lines_of_method_code_per_package.cypher\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dafd62c1", + "metadata": {}, + "outputs": [], + "source": [ + "# Select and order the columns for this particular table \n", + "effective_line_count_per_package=method_statistics_per_package[['artifactName', 'fullPackageName', 'linesInPackage', 'methodCount', 'maxLinesMethod','maxLinesMethodName']]\n", + "\n", + "# Print out the top 30 (head) rows\n", + "effective_line_count_per_package.head(30)" + ] + }, + { + "cell_type": "markdown", + "id": "19f5e1a6", + "metadata": {}, + "source": [ + "### Table 3d - Top 30 methods with the highest effective line count\n", + "\n", + "The following table shows the top 30 methods with the highest effective lines of code. The whole table can be found in the CSV report `Effective_lines_of_method_code_per_package`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c196e836", + "metadata": {}, + "outputs": [], + "source": [ + "# Select and order the columns for this particular table \n", + "effective_line_count_method_per_package=method_statistics_per_package[['artifactName', 'fullPackageName', 'maxLinesMethodType', 'maxLinesMethodName', 'maxLinesMethod', 'linesInPackage']]\n", + "\n", + "# Sort by the maximum of effective lines of code per package descending\n", + "effective_line_count_method_per_package=effective_line_count_method_per_package.sort_values(by='maxLinesMethod', ascending=False)\n", + "\n", + "# Reset the index to the new sort order\n", + "effective_line_count_method_per_package=effective_line_count_method_per_package.reset_index()\n", + "\n", + "# Print out the top 30 (head) rows\n", + "effective_line_count_method_per_package.head(30)" + ] + }, + { + "cell_type": "markdown", + "id": "c5554922", + "metadata": {}, + "source": [ + "## Cyclomatic Complexity" + ] + }, { "attachments": {}, "cell_type": "markdown", "id": "8f811e86", "metadata": {}, "source": [ - "### Table 7 - Cyclomatic method complexity distribution\n", + "### Table 4a - Cyclomatic method complexity distribution\n", + "\n", + "This table shows the distribution of the cyclomatic complexity of methods per artifact.\n", + "For each artifact the number of methods with the cyclomatic complexity = 1,2,3,... is shown to get an overview of how cyclomatic complexity is distributed over methods.\n", + "\n", + "Only the 15 artifacts with the highest method count sum and their cyclomatic method complexity distribution (limited by 40) is shown here. The whole table can be found in the CSV report `Cyclomatic_Method_Complexity_Distribution`.\n", "\n", - "The table shown here only includes the first 10 rows which typically represents the most significant entries.\n", "Have a look below to find out which packages and methods have the highest effective lines of code." 
] }, @@ -415,6 +676,8 @@ "metadata": {}, "outputs": [], "source": [ + "cyclomatic_method_complexity_distribution_max_artifacts=15\n", + "\n", "cyclomatic_method_complexity_distribution=query_cypher_to_data_frame(\"../cypher/Overview/Cyclomatic_Method_Complexity_Distribution.cypher\")\n", "cyclomatic_method_complexity_distribution=cyclomatic_method_complexity_distribution.pivot(index='cyclomaticComplexity', columns='artifactName', values='methods')\n", "\n", @@ -424,7 +687,11 @@ "# Convert to integer\n", "cyclomatic_method_complexity_distribution=cyclomatic_method_complexity_distribution.astype(int)\n", "\n", - "cyclomatic_method_complexity_distribution.head(10)" + "# Sort by column sum and then take only the first 10 columns\n", + "effective_method_line_count_sum_per_artifact = effective_method_line_count_distribution.sum()\n", + "cyclomatic_method_complexity_distribution = cyclomatic_method_complexity_distribution[effective_method_line_count_sum_per_artifact.sort_values(ascending=False).index[:cyclomatic_method_complexity_distribution_max_artifacts]]\n", + "\n", + "cyclomatic_method_complexity_distribution.head(40)" ] }, { @@ -433,9 +700,9 @@ "id": "edd94088", "metadata": {}, "source": [ - "### Table 8 - Cyclomatic method complexity distribution (normalized)\n", + "### Table 4b - Cyclomatic method complexity distribution (normalized)\n", "\n", - "The table shown here only includes the first 10 rows which typically represents the most significant entries.\n", + "The table shown here only includes the first 40 rows which typically represents the most significant entries.\n", "Have a look below to find out which packages and methods have the highest effective lines of code." 
] }, @@ -448,7 +715,15 @@ "source": [ "# Divide every value by the sum of all values in the same column to get vertical normalized values.\n", "cyclomatic_method_complexity_distribution_normalized = cyclomatic_method_complexity_distribution.div(cyclomatic_method_complexity_distribution.sum(axis=0), axis=1).multiply(100)\n", - "cyclomatic_method_complexity_distribution_normalized.head(10)" + "cyclomatic_method_complexity_distribution_normalized.head(40)" + ] + }, + { + "cell_type": "markdown", + "id": "24fdb553", + "metadata": {}, + "source": [ + "### Table 4b Chart 1 - Cyclomatic method complexity distribution (normalized)" ] }, { @@ -466,67 +741,46 @@ " logx=True,\n", " logy=True,\n", " grid=True,\n", - " xlim=[1,10],\n", + " xlim=[1,11],\n", " ylim=[1,100],\n", " xticks=method_line_count_x_ticks,\n", " yticks=cyclomatic_complexity_y_ticks,\n", - " title='Cyclomatic Method Complexity Distribution', \n", + " title='Cyclomatic complexity distribution of methods', \n", " xlabel='cyclomatic complexity',\n", - " ylabel='number of methods',\n", + " ylabel='percentage of methods',\n", " cmap=main_color_map,\n", ")\n", "axes.set_xticklabels(method_line_count_x_ticks)\n", "axes.set_yticklabels(cyclomatic_complexity_y_ticks)\n", + "axes.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", "plot.show()" ] }, { - "attachments": {}, "cell_type": "markdown", - "id": "732618fb", + "id": "3fc668c5", "metadata": {}, "source": [ - "### Table 9 - Top 10 packages with highest effective line counts" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "101091d9", - "metadata": {}, - "outputs": [], - "source": [ - "# Query data from graph database\n", - "effective_line_count_per_package = query_cypher_to_data_frame(\"../cypher/Overview/Effective_lines_of_method_code_per_package.cypher\")\n", + "### Table 4c - Top 30 packages with highest cyclomatic complexity\n", "\n", - "# Select columns and top 10 rows (head)\n", - 
"effective_line_count_per_package[['artifactName', 'fullPackageName', 'linesInPackage', 'methodCount', 'maxLinesMethod','maxLinesMethodName']].head(10)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "558d0efe", - "metadata": {}, - "source": [ - "### Table 10 - Top 10 methods with highest effective line counts" + "The following table shows the top 30 packages with the highest cyclomatic complexity. The whole table can be found in the CSV report `Effective_lines_of_method_code_per_package`." ] }, { "cell_type": "code", "execution_count": null, - "id": "b81bc7ca", + "id": "d8739176", "metadata": {}, "outputs": [], "source": [ - "# Sort by maxLinesMethod\n", - "effective_line_count_per_method=effective_line_count_per_package.sort_values(by='maxLinesMethod', ascending=False)\n", + "# Select and order the columns for this particular table \n", + "cyclomatic_complexity_per_package=method_statistics_per_package[['artifactName', 'fullPackageName', 'complexityInPackage', 'methodCount', 'maxComplexity','maxComplexityMethod']]\n", "\n", - "# Reset index\n", - "effective_line_count_per_method = effective_line_count_per_method.reset_index()\n", + "# Sort by the method with the highest cyclomatic complexity per package descending\n", + "cyclomatic_complexity_per_package=cyclomatic_complexity_per_package.sort_values(by='complexityInPackage', ascending=False)\n", "\n", - "# Select columns and top 10 rows (head)\n", - "effective_line_count_per_method[['artifactName', 'fullPackageName', 'maxLinesMethodType', 'maxLinesMethodName', 'maxLinesMethod']].head(10)" + "# Print out the top 30 (head) rows\n", + "cyclomatic_complexity_per_package.head(30)" ] }, { @@ -535,7 +789,9 @@ "id": "4c82a0fd", "metadata": {}, "source": [ - "### Table 11 - Top 10 methods with highest cyclomatic complexity" + "### Table 4d - Top 30 methods with highest cyclomatic complexity\n", + "\n", + "The following table shows the top 30 packages containing the methods with the highest cyclomatic 
complexity. The whole table can be found in the CSV report `Effective_lines_of_method_code_per_package`." ] }, { @@ -545,14 +801,17 @@ "metadata": {}, "outputs": [], "source": [ - "# Sort by maxComplexity\n", - "cyclomatic_complexity_per_method=effective_line_count_per_package.sort_values(by='maxComplexity', ascending=False)\n", + "# Select and order the columns for this particular table \n", + "cyclomatic_complexity_per_method=method_statistics_per_package[['artifactName', 'fullPackageName', 'maxComplexityType', 'maxComplexityMethod', 'maxComplexity']]\n", + "\n", + "# Sort by the method with the highest cyclomatic complexity per package descending\n", + "cyclomatic_complexity_per_method=cyclomatic_complexity_per_method.sort_values(by='maxComplexity', ascending=False)\n", "\n", - "# Reset Index\n", - "cyclomatic_complexity_per_method = cyclomatic_complexity_per_method.reset_index()\n", + "# Reset the index to the new sort order\n", + "cyclomatic_complexity_per_method=cyclomatic_complexity_per_method.reset_index()\n", "\n", - "# Select columns and only the top 10 rows (head)\n", - "cyclomatic_complexity_per_method[['artifactName', 'fullPackageName', 'maxComplexityType', 'maxComplexityMethod', 'maxComplexity']].head(10)" + "# Print out the top 30 (head) rows\n", + "cyclomatic_complexity_per_method.head(30)" ] } ], diff --git a/scripts/reports/OverviewCsv.sh b/scripts/reports/OverviewCsv.sh index a4ef7073d..8913e879b 100755 --- a/scripts/reports/OverviewCsv.sh +++ b/scripts/reports/OverviewCsv.sh @@ -34,6 +34,7 @@ mkdir -p "${FULL_REPORT_DIRECTORY}" # Local Constants OVERVIEW_CYPHER_DIR="${CYPHER_DIR}/Overview" +execute_cypher "${OVERVIEW_CYPHER_DIR}/Overview_size.cypher" > "${FULL_REPORT_DIRECTORY}/Overview_size.csv" execute_cypher "${OVERVIEW_CYPHER_DIR}/Cyclomatic_Method_Complexity_Distribution.cypher" > "${FULL_REPORT_DIRECTORY}/Cyclomatic_Method_Complexity.csv" execute_cypher "${OVERVIEW_CYPHER_DIR}/Effective_lines_of_method_code_per_package.cypher" > 
"${FULL_REPORT_DIRECTORY}/Effective_lines_of_method_code_per_package.csv" execute_cypher "${OVERVIEW_CYPHER_DIR}/Effective_lines_of_method_code_per_type.cypher" > "${FULL_REPORT_DIRECTORY}/Effective_lines_of_method_code_per_type.csv" From 2fed3647bf25d741f9d1d5654dcd30863d2f31af Mon Sep 17 00:00:00 2001 From: JohT Date: Sat, 9 Sep 2023 21:12:49 +0200 Subject: [PATCH 13/15] Split out method metrics into own Jupyter report --- jupyter/MethodMetrics.ipynb | 483 ++++++++++++++++++++++++ jupyter/Overview.ipynb | 347 +---------------- scripts/reports/MethodMetricsJupyter.sh | 33 ++ scripts/reports/OverviewJupyter.sh | 5 +- 4 files changed, 519 insertions(+), 349 deletions(-) create mode 100644 jupyter/MethodMetrics.ipynb create mode 100755 scripts/reports/MethodMetricsJupyter.sh diff --git a/jupyter/MethodMetrics.ipynb b/jupyter/MethodMetrics.ipynb new file mode 100644 index 000000000..4c6190699 --- /dev/null +++ b/jupyter/MethodMetrics.ipynb @@ -0,0 +1,483 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "2f0eabc4", + "metadata": {}, + "source": [ + "# Method Metrics\n", + "
\n", + "\n", + "### References\n", + "- [jqassistant](https://jqassistant.org)\n", + "- [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver/current)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4191f259", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plot\n", + "from neo4j import GraphDatabase" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c5dab37", + "metadata": {}, + "outputs": [], + "source": [ + "# Please set the environment variable \"NEO4J_INITIAL_PASSWORD\" in your shell \n", + "# before starting jupyter notebook to provide the password for the user \"neo4j\". \n", + "# It is not recommended to hardcode the password into jupyter notebook for security reasons.\n", + "\n", + "driver = GraphDatabase.driver(uri=\"bolt://localhost:7687\", auth=(\"neo4j\", os.environ.get(\"NEO4J_INITIAL_PASSWORD\")))\n", + "driver.verify_connectivity()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1db254b", + "metadata": {}, + "outputs": [], + "source": [ + "def get_cypher_query_from_file(filename):\n", + " with open(filename) as file:\n", + " return ' '.join(file.readlines())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59310f6f", + "metadata": {}, + "outputs": [], + "source": [ + "def query_cypher_to_data_frame(filename):\n", + " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename))\n", + " return pd.DataFrame([r.values() for r in records], columns=keys)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da9e8edb", + "metadata": {}, + "outputs": [], + "source": [ + "#The following cell uses the build-in %html \"magic\" to override the CSS style for tables to a much smaller size.\n", + "#This is especially needed for PDF export of tables with multiple columns." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9deaabce", + "metadata": {}, + "outputs": [], + "source": [ + "%%html\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2496caf", + "metadata": {}, + "outputs": [], + "source": [ + "# Main Colormap\n", + "main_color_map = 'nipy_spectral'" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "23fd8dfd", + "metadata": {}, + "source": [ + "## Effective Method Line Count" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "5c49be47", + "metadata": {}, + "source": [ + "### Table 1a - Effective method line count distribution\n", + "\n", + "This table shows the distribution of the effective method line count per artifact.\n", + "For each artifact the number of methods with effective line count = 1,2,3,... is shown to get an overview of how line counts are distributed over methods.\n", + "\n", + "Only the 15 artifacts with the highest method count and their effective method line count distribution (limited by 40)is shown here. The whole table can be found in the CSV report `Effective_Method_Line_Count_Distribution`.\n", + "\n", + "Have a look below to find out which packages and methods have the highest effective lines of code." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6dfbcb7", + "metadata": {}, + "outputs": [], + "source": [ + "effective_method_line_count_distribution_max_artifacts=20\n", + "\n", + "effective_method_line_count_distribution=query_cypher_to_data_frame(\"../cypher/Overview/Effective_Method_Line_Count_Distribution.cypher\")\n", + "effective_method_line_count_distribution=effective_method_line_count_distribution.pivot(index='effectiveLineCount', columns='artifactName', values='methods')\n", + "\n", + "# Fill missing values with zero\n", + "effective_method_line_count_distribution.fillna(0, inplace=True)\n", + "\n", + "# Convert to integer\n", + "effective_method_line_count_distribution=effective_method_line_count_distribution.astype(int)\n", + "\n", + "# Sort by column sum and then take only the first 10 columns\n", + "effective_method_line_count_sum_per_artifact = effective_method_line_count_distribution.sum()\n", + "effective_method_line_count_distribution = effective_method_line_count_distribution[effective_method_line_count_sum_per_artifact.sort_values(ascending=False).index[:effective_method_line_count_distribution_max_artifacts]]\n", + "\n", + "effective_method_line_count_distribution.head(40)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "69f80f6d", + "metadata": {}, + "source": [ + "### Table 1b - Effective method line count distribution (normalized)\n", + "\n", + "The table shown here only includes the first 40 rows which typically represents the most significant entries.\n", + "Have a look below to find out which packages and methods have the highest effective lines of code." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b389bbc", + "metadata": {}, + "outputs": [], + "source": [ + "# Divide every value by the sum of all values in the same column to get vertical normalized values.\n", + "effective_method_line_count_distribution_normalized = effective_method_line_count_distribution.div(effective_method_line_count_distribution.sum(axis=0), axis=1).multiply(100)\n", + "effective_method_line_count_distribution_normalized.head(40)" + ] + }, + { + "cell_type": "markdown", + "id": "2e75ff54", + "metadata": {}, + "source": [ + "### Table 1b Chart 1 - Effective method line count distribution (normalized)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a1ae2c3", + "metadata": {}, + "outputs": [], + "source": [ + "plot.figure();\n", + "method_line_count_x_ticks=range(1,20)\n", + "axes = effective_method_line_count_distribution_normalized.head(20).plot(\n", + " kind='line',\n", + " logx=True,\n", + " grid=True,\n", + " xlim=[2, 20],\n", + " ylim=[0, 20],\n", + " xticks=method_line_count_x_ticks,\n", + " title='Effective Method Line Count Distribution', \n", + " xlabel='effective line count',\n", + " ylabel='percent of methods',\n", + " cmap=main_color_map,\n", + " figsize=(10, 6),\n", + " lw=2,\n", + ")\n", + "axes.set_xticklabels(method_line_count_x_ticks)\n", + "axes.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", + "plot.show()" + ] + }, + { + "cell_type": "markdown", + "id": "ff1748a3", + "metadata": {}, + "source": [ + "### Table 1c - Top 30 packages with highest effective line counts\n", + "\n", + "The following table shows the top 30 packages with the highest effective lines of code. The whole table can be found in the CSV report `Effective_lines_of_method_code_per_package`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac2bd9c2", + "metadata": {}, + "outputs": [], + "source": [ + "# Query artifacts and packages and find the method with the highest effective lines of code and cyclomatic complexity\n", + "# The result of this query will be also used further below. \n", + "method_statistics_per_package = query_cypher_to_data_frame(\"../cypher/Overview/Effective_lines_of_method_code_per_package.cypher\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dafd62c1", + "metadata": {}, + "outputs": [], + "source": [ + "# Select and order the columns for this particular table \n", + "effective_line_count_per_package=method_statistics_per_package[['artifactName', 'fullPackageName', 'linesInPackage', 'methodCount', 'maxLinesMethod','maxLinesMethodName']]\n", + "\n", + "# Print out the top 30 (head) rows\n", + "effective_line_count_per_package.head(30)" + ] + }, + { + "cell_type": "markdown", + "id": "19f5e1a6", + "metadata": {}, + "source": [ + "### Table 1d - Top 30 methods with the highest effective line count\n", + "\n", + "The following table shows the top 30 methods with the highest effective lines of code. The whole table can be found in the CSV report `Effective_lines_of_method_code_per_package`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c196e836", + "metadata": {}, + "outputs": [], + "source": [ + "# Select and order the columns for this particular table \n", + "effective_line_count_method_per_package=method_statistics_per_package[['artifactName', 'fullPackageName', 'maxLinesMethodType', 'maxLinesMethodName', 'maxLinesMethod', 'linesInPackage']]\n", + "\n", + "# Sort by the maximum of effective lines of code per package descending\n", + "effective_line_count_method_per_package=effective_line_count_method_per_package.sort_values(by='maxLinesMethod', ascending=False)\n", + "\n", + "# Reset the index to the new sort order\n", + "effective_line_count_method_per_package=effective_line_count_method_per_package.reset_index()\n", + "\n", + "# Print out the top 30 (head) rows\n", + "effective_line_count_method_per_package.head(30)" + ] + }, + { + "cell_type": "markdown", + "id": "c5554922", + "metadata": {}, + "source": [ + "## Cyclomatic Complexity" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "8f811e86", + "metadata": {}, + "source": [ + "### Table 2a - Cyclomatic method complexity distribution\n", + "\n", + "This table shows the distribution of the cyclomatic complexity of methods per artifact.\n", + "For each artifact the number of methods with the cyclomatic complexity = 1,2,3,... is shown to get an overview of how cyclomatic complexity is distributed over methods.\n", + "\n", + "Only the 15 artifacts with the highest method count sum and their cyclomatic method complexity distribution (limited by 40) are shown here. The whole table can be found in the CSV report `Cyclomatic_Method_Complexity_Distribution`.\n", + "\n", + "Have a look below to find out which packages and methods have the highest cyclomatic complexity."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf989acc", + "metadata": {}, + "outputs": [], + "source": [ + "cyclomatic_method_complexity_distribution_max_artifacts=15\n", + "\n", + "cyclomatic_method_complexity_distribution=query_cypher_to_data_frame(\"../cypher/Overview/Cyclomatic_Method_Complexity_Distribution.cypher\")\n", + "cyclomatic_method_complexity_distribution=cyclomatic_method_complexity_distribution.pivot(index='cyclomaticComplexity', columns='artifactName', values='methods')\n", + "\n", + "# Fill missing values with zero\n", + "cyclomatic_method_complexity_distribution.fillna(0, inplace=True)\n", + "\n", + "# Convert to integer\n", + "cyclomatic_method_complexity_distribution=cyclomatic_method_complexity_distribution.astype(int)\n", + "\n", + "# Sort by column sum descending and then keep only the first columns (limited by cyclomatic_method_complexity_distribution_max_artifacts)\n", + "cyclomatic_method_complexity_sum_per_artifact = cyclomatic_method_complexity_distribution.sum()\n", + "cyclomatic_method_complexity_distribution = cyclomatic_method_complexity_distribution[cyclomatic_method_complexity_sum_per_artifact.sort_values(ascending=False).index[:cyclomatic_method_complexity_distribution_max_artifacts]]\n", + "\n", + "cyclomatic_method_complexity_distribution.head(40)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "edd94088", + "metadata": {}, + "source": [ + "### Table 2b - Cyclomatic method complexity distribution (normalized)\n", + "\n", + "The table shown here only includes the first 40 rows which typically represent the most significant entries.\n", + "Have a look below to find out which packages and methods have the highest cyclomatic complexity."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7aff1d2f", + "metadata": {}, + "outputs": [], + "source": [ + "# Divide every value by the sum of all values in the same column to get vertical normalized values.\n", + "cyclomatic_method_complexity_distribution_normalized = cyclomatic_method_complexity_distribution.div(cyclomatic_method_complexity_distribution.sum(axis=0), axis=1).multiply(100)\n", + "cyclomatic_method_complexity_distribution_normalized.head(40)" + ] + }, + { + "cell_type": "markdown", + "id": "24fdb553", + "metadata": {}, + "source": [ + "### Table 2b Chart 1 - Cyclomatic method complexity distribution (normalized)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4e40ede", + "metadata": {}, + "outputs": [], + "source": [ + "plot.figure();\n", + "method_line_count_x_ticks=range(1,11)\n", + "cyclomatic_complexity_y_ticks=[1, 2, 3, 4, 5, 7, 10, 20, 30, 40, 50, 100]\n", + "axes = cyclomatic_method_complexity_distribution_normalized.plot(\n", + " kind='line', \n", + " logx=True,\n", + " logy=True,\n", + " grid=True,\n", + " xlim=[1,11],\n", + " ylim=[1,100],\n", + " xticks=method_line_count_x_ticks,\n", + " yticks=cyclomatic_complexity_y_ticks,\n", + " title='Cyclomatic complexity distribution of methods', \n", + " xlabel='cyclomatic complexity',\n", + " ylabel='percentage of methods',\n", + " cmap=main_color_map,\n", + ")\n", + "axes.set_xticklabels(method_line_count_x_ticks)\n", + "axes.set_yticklabels(cyclomatic_complexity_y_ticks)\n", + "axes.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", + "plot.show()" + ] + }, + { + "cell_type": "markdown", + "id": "3fc668c5", + "metadata": {}, + "source": [ + "### Table 2c - Top 30 packages with highest cyclomatic complexity\n", + "\n", + "The following table shows the top 30 packages with the highest cyclomatic complexity. The whole table can be found in the CSV report `Effective_lines_of_method_code_per_package`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8739176", + "metadata": {}, + "outputs": [], + "source": [ + "# Select and order the columns for this particular table \n", + "cyclomatic_complexity_per_package=method_statistics_per_package[['artifactName', 'fullPackageName', 'complexityInPackage', 'methodCount', 'maxComplexity','maxComplexityMethod']]\n", + "\n", + "# Sort by the package's cyclomatic complexity (column 'complexityInPackage') descending\n", + "cyclomatic_complexity_per_package=cyclomatic_complexity_per_package.sort_values(by='complexityInPackage', ascending=False)\n", + "\n", + "# Print out the top 30 (head) rows\n", + "cyclomatic_complexity_per_package.head(30)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "4c82a0fd", + "metadata": {}, + "source": [ + "### Table 2d - Top 30 methods with highest cyclomatic complexity\n", + "\n", + "The following table shows the top 30 packages containing the methods with the highest cyclomatic complexity. The whole table can be found in the CSV report `Effective_lines_of_method_code_per_package`."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e000945", + "metadata": {}, + "outputs": [], + "source": [ + "# Select and order the columns for this particular table \n", + "cyclomatic_complexity_per_method=method_statistics_per_package[['artifactName', 'fullPackageName', 'maxComplexityType', 'maxComplexityMethod', 'maxComplexity']]\n", + "\n", + "# Sort by the method with the highest cyclomatic complexity per package descending\n", + "cyclomatic_complexity_per_method=cyclomatic_complexity_per_method.sort_values(by='maxComplexity', ascending=False)\n", + "\n", + "# Reset the index to the new sort order\n", + "cyclomatic_complexity_per_method=cyclomatic_complexity_per_method.reset_index()\n", + "\n", + "# Print out the top 30 (head) rows\n", + "cyclomatic_complexity_per_method.head(30)" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "JohT" + } + ], + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + }, + "title": "Object Oriented Design Quality Metrics for Java with Neo4j" + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/jupyter/Overview.ipynb b/jupyter/Overview.ipynb index 9b6433b05..2283684bd 100644 --- a/jupyter/Overview.ipynb +++ b/jupyter/Overview.ipynb @@ -11,7 +11,7 @@ "\n", "### References\n", "- [jqassistant](https://jqassistant.org)\n", - "- [py2neo](https://py2neo.org/2021.1/)" + "- [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver/current)" ] }, { @@ -468,351 +468,6 @@ "plot.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", "plot.show()" ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "23fd8dfd", - "metadata": {}, - "source": [ - "## Effective Method Line Count" - ] 
- }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "5c49be47", - "metadata": {}, - "source": [ - "### Table 3a - Effective method line count distribution\n", - "\n", - "This table shows the distribution of the effective method line count per artifact.\n", - "For each artifact the number of methods with effective line count = 1,2,3,... is shown to get an overview of how line counts are distributed over methods.\n", - "\n", - "Only the 15 artifacts with the highest method count and their effective method line count distribution (limited by 40)is shown here. The whole table can be found in the CSV report `Effective_Method_Line_Count_Distribution`.\n", - "\n", - "Have a look below to find out which packages and methods have the highest effective lines of code." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f6dfbcb7", - "metadata": {}, - "outputs": [], - "source": [ - "effective_method_line_count_distribution_max_artifacts=20\n", - "\n", - "effective_method_line_count_distribution=query_cypher_to_data_frame(\"../cypher/Overview/Effective_Method_Line_Count_Distribution.cypher\")\n", - "effective_method_line_count_distribution=effective_method_line_count_distribution.pivot(index='effectiveLineCount', columns='artifactName', values='methods')\n", - "\n", - "# Fill missing values with zero\n", - "effective_method_line_count_distribution.fillna(0, inplace=True)\n", - "\n", - "# Convert to integer\n", - "effective_method_line_count_distribution=effective_method_line_count_distribution.astype(int)\n", - "\n", - "# Sort by column sum and then take only the first 10 columns\n", - "effective_method_line_count_sum_per_artifact = effective_method_line_count_distribution.sum()\n", - "effective_method_line_count_distribution = effective_method_line_count_distribution[effective_method_line_count_sum_per_artifact.sort_values(ascending=False).index[:effective_method_line_count_distribution_max_artifacts]]\n", - "\n", - 
"effective_method_line_count_distribution.head(40)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "69f80f6d", - "metadata": {}, - "source": [ - "### Table 3b - Effective method line count distribution (normalized)\n", - "\n", - "The table shown here only includes the first 40 rows which typically represents the most significant entries.\n", - "Have a look below to find out which packages and methods have the highest effective lines of code." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3b389bbc", - "metadata": {}, - "outputs": [], - "source": [ - "# Divide every value by the sum of all values in the same column to get vertical normalized values.\n", - "effective_method_line_count_distribution_normalized = effective_method_line_count_distribution.div(effective_method_line_count_distribution.sum(axis=0), axis=1).multiply(100)\n", - "effective_method_line_count_distribution_normalized.head(40)" - ] - }, - { - "cell_type": "markdown", - "id": "2e75ff54", - "metadata": {}, - "source": [ - "### Table 3b Chart 1 - Effective method line count distribution (normalized)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6a1ae2c3", - "metadata": {}, - "outputs": [], - "source": [ - "plot.figure();\n", - "method_line_count_x_ticks=range(1,20)\n", - "axes = effective_method_line_count_distribution_normalized.head(20).plot(\n", - " kind='line',\n", - " logx=True,\n", - " grid=True,\n", - " xlim=[2, 20],\n", - " ylim=[0, 20],\n", - " xticks=method_line_count_x_ticks,\n", - " title='Effective Method Line Count Distribution', \n", - " xlabel='effective line count',\n", - " ylabel='percent of methods',\n", - " cmap=main_color_map,\n", - " figsize=(10, 6),\n", - " lw=2,\n", - ")\n", - "axes.set_xticklabels(method_line_count_x_ticks)\n", - "axes.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", - "plot.show()" - ] - }, - { - "cell_type": "markdown", - "id": "ff1748a3", - "metadata": {}, - "source": [ - "### 
Table 3c - Top 30 packages with highest effective line counts\n", - "\n", - "The following table shows the top 30 packages with the highest effective lines of code. The whole table can be found in the CSV report `Effective_lines_of_method_code_per_package`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ac2bd9c2", - "metadata": {}, - "outputs": [], - "source": [ - "# Query artifacts and packages and find the method with the highest effective lines of code and cyclomatic complexity\n", - "# The result of this query will be also used further below. \n", - "method_statistics_per_package = query_cypher_to_data_frame(\"../cypher/Overview/Effective_lines_of_method_code_per_package.cypher\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dafd62c1", - "metadata": {}, - "outputs": [], - "source": [ - "# Select and order the columns for this particular table \n", - "effective_line_count_per_package=method_statistics_per_package[['artifactName', 'fullPackageName', 'linesInPackage', 'methodCount', 'maxLinesMethod','maxLinesMethodName']]\n", - "\n", - "# Print out the top 30 (head) rows\n", - "effective_line_count_per_package.head(30)" - ] - }, - { - "cell_type": "markdown", - "id": "19f5e1a6", - "metadata": {}, - "source": [ - "### Table 3d - Top 30 methods with the highest effective line count\n", - "\n", - "The following table shows the top 30 methods with the highest effective lines of code. The whole table can be found in the CSV report `Effective_lines_of_method_code_per_package`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c196e836", - "metadata": {}, - "outputs": [], - "source": [ - "# Select and order the columns for this particular table \n", - "effective_line_count_method_per_package=method_statistics_per_package[['artifactName', 'fullPackageName', 'maxLinesMethodType', 'maxLinesMethodName', 'maxLinesMethod', 'linesInPackage']]\n", - "\n", - "# Sort by the maximum of effective lines of code per package descending\n", - "effective_line_count_method_per_package=effective_line_count_method_per_package.sort_values(by='maxLinesMethod', ascending=False)\n", - "\n", - "# Reset the index to the new sort order\n", - "effective_line_count_method_per_package=effective_line_count_method_per_package.reset_index()\n", - "\n", - "# Print out the top 30 (head) rows\n", - "effective_line_count_method_per_package.head(30)" - ] - }, - { - "cell_type": "markdown", - "id": "c5554922", - "metadata": {}, - "source": [ - "## Cyclomatic Complexity" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "8f811e86", - "metadata": {}, - "source": [ - "### Table 4a - Cyclomatic method complexity distribution\n", - "\n", - "This table shows the distribution of the cyclomatic complexity of methods per artifact.\n", - "For each artifact the number of methods with the cyclomatic complexity = 1,2,3,... is shown to get an overview of how cyclomatic complexity is distributed over methods.\n", - "\n", - "Only the 15 artifacts with the highest method count sum and their cyclomatic method complexity distribution (limited by 40) is shown here. The whole table can be found in the CSV report `Cyclomatic_Method_Complexity_Distribution`.\n", - "\n", - "Have a look below to find out which packages and methods have the highest effective lines of code." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cf989acc", - "metadata": {}, - "outputs": [], - "source": [ - "cyclomatic_method_complexity_distribution_max_artifacts=15\n", - "\n", - "cyclomatic_method_complexity_distribution=query_cypher_to_data_frame(\"../cypher/Overview/Cyclomatic_Method_Complexity_Distribution.cypher\")\n", - "cyclomatic_method_complexity_distribution=cyclomatic_method_complexity_distribution.pivot(index='cyclomaticComplexity', columns='artifactName', values='methods')\n", - "\n", - "# Fill missing values with zero\n", - "cyclomatic_method_complexity_distribution.fillna(0, inplace=True)\n", - "\n", - "# Convert to integer\n", - "cyclomatic_method_complexity_distribution=cyclomatic_method_complexity_distribution.astype(int)\n", - "\n", - "# Sort by column sum and then take only the first 10 columns\n", - "effective_method_line_count_sum_per_artifact = effective_method_line_count_distribution.sum()\n", - "cyclomatic_method_complexity_distribution = cyclomatic_method_complexity_distribution[effective_method_line_count_sum_per_artifact.sort_values(ascending=False).index[:cyclomatic_method_complexity_distribution_max_artifacts]]\n", - "\n", - "cyclomatic_method_complexity_distribution.head(40)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "edd94088", - "metadata": {}, - "source": [ - "### Table 4b - Cyclomatic method complexity distribution (normalized)\n", - "\n", - "The table shown here only includes the first 40 rows which typically represents the most significant entries.\n", - "Have a look below to find out which packages and methods have the highest effective lines of code." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7aff1d2f", - "metadata": {}, - "outputs": [], - "source": [ - "# Divide every value by the sum of all values in the same column to get vertical normalized values.\n", - "cyclomatic_method_complexity_distribution_normalized = cyclomatic_method_complexity_distribution.div(cyclomatic_method_complexity_distribution.sum(axis=0), axis=1).multiply(100)\n", - "cyclomatic_method_complexity_distribution_normalized.head(40)" - ] - }, - { - "cell_type": "markdown", - "id": "24fdb553", - "metadata": {}, - "source": [ - "### Table 4b Chart 1 - Cyclomatic method complexity distribution (normalized)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f4e40ede", - "metadata": {}, - "outputs": [], - "source": [ - "plot.figure();\n", - "method_line_count_x_ticks=range(1,11)\n", - "cyclomatic_complexity_y_ticks=[1, 2, 3, 4, 5, 7, 10, 20, 30, 40, 50, 100]\n", - "axes = cyclomatic_method_complexity_distribution_normalized.plot(\n", - " kind='line', \n", - " logx=True,\n", - " logy=True,\n", - " grid=True,\n", - " xlim=[1,11],\n", - " ylim=[1,100],\n", - " xticks=method_line_count_x_ticks,\n", - " yticks=cyclomatic_complexity_y_ticks,\n", - " title='Cyclomatic complexity distribution of methods', \n", - " xlabel='cyclomatic complexity',\n", - " ylabel='percentage of methods',\n", - " cmap=main_color_map,\n", - ")\n", - "axes.set_xticklabels(method_line_count_x_ticks)\n", - "axes.set_yticklabels(cyclomatic_complexity_y_ticks)\n", - "axes.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", - "plot.show()" - ] - }, - { - "cell_type": "markdown", - "id": "3fc668c5", - "metadata": {}, - "source": [ - "### Table 4c - Top 30 packages with highest cyclomatic complexity\n", - "\n", - "The following table shows the top 30 packages with the highest cyclomatic complexity. The whole table can be found in the CSV report `Effective_lines_of_method_code_per_package`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d8739176", - "metadata": {}, - "outputs": [], - "source": [ - "# Select and order the columns for this particular table \n", - "cyclomatic_complexity_per_package=method_statistics_per_package[['artifactName', 'fullPackageName', 'complexityInPackage', 'methodCount', 'maxComplexity','maxComplexityMethod']]\n", - "\n", - "# Sort by the method with the highest cyclomatic complexity per package descending\n", - "cyclomatic_complexity_per_package=cyclomatic_complexity_per_package.sort_values(by='complexityInPackage', ascending=False)\n", - "\n", - "# Print out the top 30 (head) rows\n", - "cyclomatic_complexity_per_package.head(30)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "4c82a0fd", - "metadata": {}, - "source": [ - "### Table 4d - Top 30 methods with highest cyclomatic complexity\n", - "\n", - "The following table shows the top 30 packages containing the methods with the highest cyclomatic complexity. The whole table can be found in the CSV report `Effective_lines_of_method_code_per_package`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5e000945", - "metadata": {}, - "outputs": [], - "source": [ - "# Select and order the columns for this particular table \n", - "cyclomatic_complexity_per_method=method_statistics_per_package[['artifactName', 'fullPackageName', 'maxComplexityType', 'maxComplexityMethod', 'maxComplexity']]\n", - "\n", - "# Sort by the method with the highest cyclomatic complexity per package descending\n", - "cyclomatic_complexity_per_method=cyclomatic_complexity_per_method.sort_values(by='maxComplexity', ascending=False)\n", - "\n", - "# Reset the index to the new sort order\n", - "cyclomatic_complexity_per_method=cyclomatic_complexity_per_method.reset_index()\n", - "\n", - "# Print out the top 30 (head) rows\n", - "cyclomatic_complexity_per_method.head(30)" - ] } ], "metadata": { diff --git a/scripts/reports/MethodMetricsJupyter.sh b/scripts/reports/MethodMetricsJupyter.sh new file mode 100755 index 000000000..1297315b1 --- /dev/null +++ b/scripts/reports/MethodMetricsJupyter.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash + +# Creates the method metrics report (ipynb, md, pdf) based on the Jupyter Notebook "MethodMetrics.ipynb". +# It contains effective line counts and cyclomatic complexity of methods per artifact and package +# and their distribution. + +# Requires executeJupyterNotebook.sh + +# Overrideable Constants (defaults also defined in sub scripts) +REPORTS_DIRECTORY=${REPORTS_DIRECTORY:-"reports"} + +## Get this "scripts/reports" directory if not already set +# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution. +# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes. +# This way non-standard tools like readlink aren't needed. +REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR:-$( CDPATH=.
cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} +echo "MethodMetricsJupyter: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}" + +# Get the "scripts" directory by taking the path of this script and going one directory up. +SCRIPTS_DIR=${SCRIPTS_DIR:-"${REPORTS_SCRIPT_DIR}/.."} # Repository directory containing the shell scripts +echo "MethodMetricsJupyter: SCRIPTS_DIR=${SCRIPTS_DIR}" + +# Get the "jupyter" directory by taking the path of this script and going two directories up and then to "jupyter". +JUPYTER_NOTEBOOK_DIRECTORY=${JUPYTER_NOTEBOOK_DIRECTORY:-"${SCRIPTS_DIR}/../jupyter"} # Repository directory containing the Jupyter Notebooks +echo "MethodMetricsJupyter: JUPYTER_NOTEBOOK_DIRECTORY=$JUPYTER_NOTEBOOK_DIRECTORY" + +# Create report directory +REPORT_NAME="method-metrics" +FULL_REPORT_DIRECTORY="${REPORTS_DIRECTORY}/${REPORT_NAME}" +mkdir -p "${FULL_REPORT_DIRECTORY}" + +# Execute and convert the following Jupyter Notebook within the given reports directory (paths are quoted so that directories containing spaces work) +(cd "${FULL_REPORT_DIRECTORY}" && exec "${SCRIPTS_DIR}/executeJupyterNotebook.sh" "${JUPYTER_NOTEBOOK_DIRECTORY}/MethodMetrics.ipynb") || exit 1 \ No newline at end of file diff --git a/scripts/reports/OverviewJupyter.sh b/scripts/reports/OverviewJupyter.sh index 90ef26b71..151085a79 100755 --- a/scripts/reports/OverviewJupyter.sh +++ b/scripts/reports/OverviewJupyter.sh @@ -1,9 +1,8 @@ #!/usr/bin/env bash # Creates the "overview" report (ipynb, md, pdf) based on the Jupyter Notebook "Overview.ipynb". -# It contains a basic overview on how many Classes, Interfaces, Enums and Annotations earch artifact contains, -# how they relate to each other, distribution of Methods and their effective lines of code -# and how the cyclomatic complexity is distributed across all Methods per artifact. +# It contains a basic overview on how many Classes, Interfaces, Enums and Annotations each artifact contains and +# how they relate to each other.
# Requires executeJupyterNotebook.sh From c7176e88ba398868aa13c438669bb2405806a9b5 Mon Sep 17 00:00:00 2001 From: JohT Date: Sun, 10 Sep 2023 10:12:33 +0200 Subject: [PATCH 14/15] Optimize visibility metrics report --- jupyter/VisibilityMetrics.ipynb | 290 ++++++++++++++++++++++++++++---- 1 file changed, 259 insertions(+), 31 deletions(-) diff --git a/jupyter/VisibilityMetrics.ipynb b/jupyter/VisibilityMetrics.ipynb index 9e8aa59c4..d9a1e717d 100644 --- a/jupyter/VisibilityMetrics.ipynb +++ b/jupyter/VisibilityMetrics.ipynb @@ -13,7 +13,7 @@ "- [Visibility Metrics and the Importance of Hiding Things](https://dzone.com/articles/visibility-metrics-and-the-importance-of-hiding-th)\n", "- [Calculate metrics](https://101.jqassistant.org/calculate-metrics/index.html)\n", "- [Controlling Access to Members of a Class](https://docs.oracle.com/javase/tutorial/java/javaOO/accesscontrol.html)\n", - "- [py2neo](https://py2neo.org/2021.1/)" + "- [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver/current)" ] }, { @@ -29,6 +29,47 @@ "from neo4j import GraphDatabase" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "acf605be", + "metadata": {}, + "outputs": [], + "source": [ + "#The following cell uses the built-in %html \"magic\" to override the CSS style for tables to a much smaller size.\n", + "#This is especially needed for PDF export of tables with multiple columns."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3cc19954", + "metadata": {}, + "outputs": [], + "source": [ + "%%html\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33c356d7", + "metadata": {}, + "outputs": [], + "source": [ + "# Main Colormap\n", + "main_color_map = 'nipy_spectral'" + ] + }, { "cell_type": "code", "execution_count": null, @@ -98,29 +139,6 @@ "" ] }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "91d80bf7", - "metadata": {}, - "source": [ - "## Artifacts\n", - "\n", - "### Table 1\n", - "\n", - "- List all the artifacts this notebook is based on" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dc682db6", - "metadata": {}, - "outputs": [], - "source": [ - "query_cypher_to_data_frame(\"../cypher/List_all_existing_artifacts.cypher\")" - ] - }, { "attachments": {}, "cell_type": "markdown", @@ -141,11 +159,32 @@ "\n", "The relative visibility is between zero (all types are package protected) and one (all types are public). A value lower than one means that there are types that are declared package protected. The lower the value is, the better implementation details are hidden. \n", "\n", - "Non public classes can't be accessed from another package so they can be changed without affecting code in other packages. They clearly indicate functionality that only belongs to one package. This also motivates to use more classes and to split up code into smaller pieces with a single responsibility and reason to change.\n", + "Non public classes can't be accessed from another package so they can be changed without affecting code in other packages. They clearly indicate functionality that only belongs to one package. This also motivates to use more classes and to split up code into smaller pieces with a single responsibility and reason to change." 
+ ] + }, + { + "cell_type": "markdown", + "id": "c9536fd9", + "metadata": {}, + "source": [ + "### Table 1a - Top 40 artifacts with lowest median of package protection encapsulation\n", "\n", - "### Table 2\n", + "This table shows the relative visibility statistics aggregated for all packages per artifact and focusses on artifacts with many packages and hardly any package protected types (lowest median, high visibility). Package protected types would help to improve encapsulation.\n", "\n", - "- Show relative visibility statistics aggregated for all packages per artifact " + "Only the top 40 entries are shown. The whole table can be found in the following CSV report: \n", + "`Global_relative_visibility_statistics_for_types`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68ed42d0", + "metadata": {}, + "outputs": [], + "source": [ + "# Query the visibility statistics per artifact (all packages aggregated)\n", + "# The results will be used in multiple tables below.\n", + "relative_visibility_per_artifact_aggregated=query_cypher_to_data_frame(\"../cypher/Visibility/Global_relative_visibility_statistics_for_types.cypher\")" ] }, { @@ -157,7 +196,110 @@ }, "outputs": [], "source": [ - "query_cypher_to_data_frame(\"../cypher/Visibility/Global_relative_visibility_statistics_for_types.cypher\")" + "# Sort by the \"percentile50\" (median) and \"all\" (number of packages in the artifact) descending\n", + "relative_visibility_statistics_highest_median=relative_visibility_per_artifact_aggregated.sort_values(by=['percentile50', 'all'], ascending=[False, False])\n", + "\n", + "# Reset the index (row numbering starting at 0 and increasing by 1)\n", + "relative_visibility_statistics_highest_median=relative_visibility_statistics_highest_median.reset_index(drop=True)\n", + "\n", + "relative_visibility_statistics_highest_median.head(40)" + ] + }, + { + "cell_type": "markdown", + "id": "1b84fd51", + "metadata": {}, + "source": [ + "### Table 1b - Top 40
artifacts with highest median of package protection encapsulation\n", + "\n", + "This table shows the relative visibility statistics aggregated for all packages per artifact and focusses on artifacts with many packages and the highest median of package protected types (low visibility). Package protected types help to improve encapsulation.\n", + "\n", + "Only the top 40 entries are shown. The whole table can be found in the following CSV report: \n", + "`Global_relative_visibility_statistics_for_types`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc59a07d", + "metadata": {}, + "outputs": [], + "source": [ + "# Sort by the \"percentile50\" (median) ascending and \"all\" (number of packages in the artifact) descending\n", + "relative_visibility_statistics_lowest_median=relative_visibility_per_artifact_aggregated.sort_values(by=['percentile50', 'all'], ascending=[True, False])\n", + "\n", + "# Reset the index (row numbering starting at 0 and increasing by 1)\n", + "relative_visibility_statistics_lowest_median=relative_visibility_statistics_lowest_median.reset_index(drop=True)\n", + "\n", + "relative_visibility_statistics_lowest_median.head(40)" + ] + }, + { + "cell_type": "markdown", + "id": "5196ecc2", + "metadata": {}, + "source": [ + "### Table 1 Chart 1 - Relative visibility in artifacts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f467a8dd", + "metadata": {}, + "outputs": [], + "source": [ + "plot.figure();\n", + "fig, axes = plot.subplots(nrows=3, ncols=1, sharex=True)\n", + "\n", + "number_of_packages_grid_ticks=[1, 2, 5, 10, 20, 50, 100, 200, 500, 1_000, 2_000, 5_000, 10_000]\n", + "\n", + "relative_visibility_per_artifact_aggregated.plot(\n", + " ax=axes[0],\n", + " kind='scatter',\n", + " title='Relative visibility in artifacts (75% percentile)', \n", + " x='percentile75',\n", + " y='all',\n", + " grid=True,\n", + " logy=True,\n", + " yticks=number_of_packages_grid_ticks,\n", + " xlabel='relative 
visibility',\n", + " ylabel='number of packages',\n", + " cmap=main_color_map,\n", + " figsize=(10,4),\n", + ")\n", + "relative_visibility_per_artifact_aggregated.plot(\n", + " ax=axes[1],\n", + " kind='scatter',\n", + " title='Relative visibility in artifacts (50% percentile)', \n", + " x='percentile50',\n", + " y='all',\n", + " grid=True,\n", + " logy=True,\n", + " yticks=number_of_packages_grid_ticks,\n", + " xlabel='relative visibility',\n", + " ylabel='number of packages',\n", + " cmap=main_color_map,\n", + " figsize=(10,4),\n", + ")\n", + "relative_visibility_per_artifact_aggregated.plot(\n", + " ax=axes[2],\n", + " kind='scatter',\n", + " title='Relative visibility in artifacts (25% percentile)', \n", + " x='percentile25',\n", + " y='all',\n", + " grid=True,\n", + " logy=True,\n", + " yticks=number_of_packages_grid_ticks,\n", + " xlabel='relative visibility',\n", + " ylabel='number of packages',\n", + " cmap=main_color_map,\n", + " figsize=(10,10),\n", + ")\n", + "axes[0].grid(color = 'grey', linestyle = '-', linewidth = 0.2)\n", + "axes[1].grid(color = 'grey', linestyle = '-', linewidth = 0.2)\n", + "axes[2].grid(color = 'grey', linestyle = '-', linewidth = 0.2)\n", + "plot.show()" ] }, { @@ -166,9 +308,12 @@ "id": "3f59da8d", "metadata": {}, "source": [ - "### Table 3\n", + "### Table 2a - Top 40 packages with the highest visibility and lowest encapsulation\n", "\n", - "- List the top 40 packages and their artifact with the highest relative visibility" + "This table shows the relative visibility statistics per packages and artifact and focusses on packages with many types, hardly any package protected ones and therefore the highest relative visibility (lowest encapsulation). Package protected types would help to improve encapsulation.\n", + "\n", + "Only the top 40 entries are shown. 
The whole table can be found in the following CSV report: \n", + "`Relative_visibility_public_types_to_all_types_per_package`" ] }, { @@ -180,7 +325,90 @@ }, "outputs": [], "source": [ - "query_cypher_to_data_frame(\"../cypher/Visibility/Relative_visibility_public_types_to_all_types_per_package.cypher\").head(50)" + "# Query the visibility statistics per package and artifact (all types aggregated)\n", + "# The results will be used in multiple tables below.\n", + "relative_visibility_per_package=query_cypher_to_data_frame(\"../cypher/Visibility/Relative_visibility_public_types_to_all_types_per_package.cypher\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "48f7f2d2", + "metadata": {}, + "outputs": [], + "source": [ + "# Sort by the \"relativeVisibility\" and \"allTypes\" (number of types in the package) descending\n", + "highest_relative_visibility_packages=relative_visibility_per_package.sort_values(by=['relativeVisibility', 'allTypes'], ascending=[False, False])\n", + "\n", + "# Reset the index (row numbering starting at 0 and increasing by 1)\n", + "highest_relative_visibility_packages=highest_relative_visibility_packages.reset_index(drop=True)\n", + "\n", + "highest_relative_visibility_packages.head(40)" + ] + }, + { + "cell_type": "markdown", + "id": "c6786ef1", + "metadata": {}, + "source": [ + "### Table 2b - Top 40 packages with the lowest visibility and highest encapsulation\n", + "\n", + "This table shows the relative visibility statistics per package and artifact and focusses on packages with many types, many package protected ones and therefore the lowest relative visibility (highest encapsulation). Package protected types help to improve encapsulation. Packages with zero percent visibility, and therefore no publicly visible type, are suspected to be dead code.\n", + "\n", + "Only the top 40 entries are shown.
The whole table can be found in the following CSV report: \n", + "`Relative_visibility_public_types_to_all_types_per_package`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "48c20ca4", + "metadata": {}, + "outputs": [], + "source": [ + "# Sort by the \"relativeVisibility\" ascending and \"allTypes\" (number of types in the package) descending\n", + "lowest_relative_visibility_packages=relative_visibility_per_package.sort_values(by=['relativeVisibility', 'allTypes'], ascending=[True, False])\n", + "\n", + "# Reset the index (row numbering starting at 0 and increasing by 1)\n", + "lowest_relative_visibility_packages=lowest_relative_visibility_packages.reset_index(drop=True)\n", + "\n", + "lowest_relative_visibility_packages.head(40)" + ] + }, + { + "cell_type": "markdown", + "id": "8ff237fd", + "metadata": {}, + "source": [ + "### Table 2 Chart 1 - Relative visibility of packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98b12846", + "metadata": {}, + "outputs": [], + "source": [ + "plot.figure();\n", + "\n", + "number_of_types_grid_ticks=[1, 2, 5, 10, 20, 50, 100, 200, 500, 1_000, 2_000, 5_000, 10_000]\n", + "\n", + "relative_visibility_per_package.plot(\n", + " kind='scatter',\n", + " title='Relative visibility of packages', \n", + " x='relativeVisibility',\n", + " y='allTypes',\n", + " grid=True,\n", + " logy=True,\n", + " yticks=number_of_types_grid_ticks,\n", + " xlabel='relative visibility',\n", + " ylabel='number of types',\n", + " cmap=main_color_map,\n", + " figsize=(10,4),\n", + ")\n", + "\n", + "plot.show()" ] } ], From 4702dbbb27a692ea54ff7d1b956c03ccb62bb650 Mon Sep 17 00:00:00 2001 From: JohT Date: Mon, 11 Sep 2023 20:48:10 +0200 Subject: [PATCH 15/15] Optimize Jupyter artifact dependency visualization --- jupyter/ArtifactDependencies.ipynb | 81 +++++++++++++++++++----------- 1 file changed, 53 insertions(+), 28 deletions(-) diff --git a/jupyter/ArtifactDependencies.ipynb 
b/jupyter/ArtifactDependencies.ipynb index 4cd1011b4..2938a5edf 100644 --- a/jupyter/ArtifactDependencies.ipynb +++ b/jupyter/ArtifactDependencies.ipynb @@ -7,12 +7,15 @@ "source": [ "## Artifact Dependencies\n", "\n", + "This report includes graph visualization(s) using JavaScript and might not be exportable to some document formats.\n", + "\n", "### References\n", "\n", "- [neovis.js (GitHub)](https://github.com/neo4j-contrib/neovis.js)\n", "- [vis-network (GitHub)](https://github.com/visjs/vis-network)\n", "- [vis network documentation](https://visjs.github.io/vis-network/docs/network)\n", - "- [Neo4j Graph Algorithms Jupyter Notebooks (GitHub)](https://github.com/neo4j-graph-analytics/graph-algorithms-notebooks)\n" + "- [Neo4j Graph Algorithms Jupyter Notebooks (GitHub)](https://github.com/neo4j-graph-analytics/graph-algorithms-notebooks)\n", + "- [Neo4j Graph Data Science Topological Sort](https://neo4j.com/docs/graph-data-science/current/algorithms/alpha/topological-sort)\n" ] }, { @@ -72,34 +75,41 @@ "metadata": {}, "outputs": [], "source": [ - "def visualization_configuration():\n", + "def visualization_configuration(node_distance: int = 150):\n", " return {\n", " \"visConfig\": {\n", " \"nodes\": {\n", " \"shape\": \"hexagon\",\n", " \"font\": {\n", - " \"strokeWidth\": 30,\n", - " \"strokeColor\": \"#F0F0FF\"\n", + " \"strokeWidth\": 4,\n", + " \"strokeColor\": \"#D0D0FF\",\n", + " \"size\": 32\n", " },\n", - " \"size\": 50,\n", - " \"borderWidth\": 2\n", + " \"size\": 60,\n", + " \"borderWidth\": 2,\n", + " \"widthConstraint\": {\n", + " \"maximum\": 120\n", + " }\n", " },\n", " \"edges\": {\n", " \"arrows\": {\n", - " \"to\": { \"enabled\": True }\n", + " \"to\": { \n", + " \"enabled\": True,\n", + " \"scaleFactor\": 0.5\n", + " }\n", " },\n", " \"scaling\": {\n", - " \"max\": 15\n", + " \"max\": 8\n", " }\n", " },\n", " \"physics\": {\n", " \"hierarchicalRepulsion\": {\n", - " \"nodeDistance\": 300, # 100\n", - " \"centralGravity\": 0.5, # 0.2\n", - "
\"springLength\": 180, # 200\n", - " \"springConstant\": 0.06, # 0.05\n", + " \"nodeDistance\": node_distance, # 120\n", + " \"centralGravity\": 0.2, # 0.0\n", + " \"springLength\": 100, # 100\n", + " \"springConstant\": 0.02, # 0.01\n", " \"damping\": 0.09, # 0.09\n", - " \"avoidOverlap\": 0.1 # 0\n", + " \"avoidOverlap\": 0.9 # 0\n", " },\n", " \"solver\": \"hierarchicalRepulsion\" # barnesHut\n", " },\n", @@ -120,9 +130,9 @@ "metadata": {}, "outputs": [], "source": [ - "def graph_query_configuration():\n", + "def graph_query_configuration(query: str):\n", " return {\n", - " \"initialCypher\": \"MATCH (s:Artifact)-[r:DEPENDS_ON]->(d:Artifact) RETURN s,r,d\",\n", + " \"initialCypher\": query,\n", " \"labels\": {\n", " \"Artifact\": {\n", " \"label\": \"fileName\"\n", @@ -137,6 +147,18 @@ " }" ] }, + { + "cell_type": "markdown", + "id": "3328314d", + "metadata": {}, + "source": [ + "## Hierarchical Artifact Dependencies\n", + "\n", + "The following hierarchical graph shows artifact dependencies with the most used basis/shared artifact at the bottom and the artifact that builds upon the other dependencies on top. The visualization is limited to the first 60 nodes and their direct dependencies, ordered by the dependency layer (\"maxDistanceFromSource\") descending. \n", + "\n", + "For the whole list of topologically sorted artifacts including the hierarchical layer go to the report `TopologicalSortedArtifacts.csv`. This is also known as the \"build order\"."
+ ] + }, { "cell_type": "code", "execution_count": null, @@ -144,26 +166,29 @@ "metadata": {}, "outputs": [], "source": [ + "query = \"\"\"\n", + " MATCH (artifact:Artifact:Archive)-[dependency:DEPENDS_ON]->(dependent:Artifact:Archive)\n", + " WHERE artifact.maxDistanceFromSource IS NOT NULL\n", + " AND dependent.maxDistanceFromSource > artifact.maxDistanceFromSource\n", + "RETURN artifact, dependency, dependent\n", + " ORDER BY artifact.maxDistanceFromSource DESC\n", + " ,artifact.maxDistanceFromSource ASC\n", + " ,artifact.topologicalSortIndex ASC\n", + " ,dependent.topologicalSortIndex ASC\n", + "LIMIT 60 \n", + "\"\"\"\n", + "\n", "htmlElement = {\"containerId\": \"graph-visualization\"}\n", "serverConfiguration = neo4j_server_configuration(uri=neo4jUri, user=neo4jUser,password=neo4jPassword)\n", "\n", "# Assemble the neovis.js configuration by joining the different parts of it\n", - "graphVisualizationConfiguration = {**htmlElement, **visualization_configuration(), **serverConfiguration, **graph_query_configuration()}\n", + "graphVisualizationConfiguration = {**htmlElement, **visualization_configuration(), **serverConfiguration, **graph_query_configuration(query)}\n", + "#graphVisualizationConfiguration = {**htmlElement, **visualization_configuration(node_distance=220), **serverConfiguration, **graph_query_configuration(query)}\n", "\n", "# Create a javascript variable containing the whole configuration in JSON format\n", "Javascript(\"\"\"window.graphVisualizationConfiguration={};\"\"\".format(json.dumps(graphVisualizationConfiguration)))" ] }, - { - "cell_type": "markdown", - "id": "3328314d", - "metadata": {}, - "source": [ - "## Hierarchical Artifact Dependencies\n", - "\n", - "The following hierarchical graph shows artifact dependencies with the most used basis/shared artifact at the bottom and the artifact the builds upon all other dependencies on top. " - ] - }, { "cell_type": "code", "execution_count": null, @@ -176,8 +201,8 @@ "%%html\n", "\n",