JohT · JohT · Sep 24, 2024 · Sep 20, 2024 · Sep 21, 2024 · Sep 21, 2024
diff --git a/README.md b/README.md
@@ -219,6 +219,13 @@ The [Code Structure Analysis Pipeline](./.github/workflows/java-code-analysis.ym
   👉 The custom Jupyter Notebook metadata property `code_graph_analysis_pipeline_data_validation` can be set to choose a query from [cypher/Validation](./cypher/Validation) that will be executed preliminary to the notebook. If the query leads to at least one result, the validation succeeds and the notebook will be run. If the query leads to no result, the notebook will be skipped.
   For more details see [Data Availability Validation](./COMMANDS.md#data-availability-validation).
 
+- How can I increase the heap memory when scanning large Typescript projects?  
+  👉 Use the environment variable `TYPESCRIPT_SCAN_HEAP_MEMORY` to set the heap size in megabytes (default = 4096):
+
+  ```shell
+  TYPESCRIPT_SCAN_HEAP_MEMORY=16384 ./../../scripts/analysis/analyze.sh
+  ```
+
 ## 🕸 Web References
 
 - [Graph Data Science 101: Understanding Graphs and Graph Data Science](https://techfirst.medium.com/graph-data-science-101-understanding-graphs-and-graph-data-science-c25055a9db01)

diff --git a/cypher/External_Dependencies/Package_json_dependencies_by_package.cypher b/cypher/External_Dependencies/Package_json_dependencies_by_package.cypher
@@ -0,0 +1,10 @@
+// List package.json dependencies by package
+
+ MATCH (npmPackage:NPM:Package)-[:DECLARES_DEPENDENCY]->(npmDependency:NPM:Dependency)
+ OPTIONAL MATCH (npmPackage)-[:CONTAINS]->(:Json:Object)-[:HAS_KEY]->(:Json:Key{name:'name'})-[:HAS_VALUE]->(nameValue:Json:Scalar:Value)
+// The package directory is the file name with '/npm-package-json/' and '/package.json' removed
+RETURN replace(replace(npmPackage.fileName, '/npm-package-json/', ''), '/package.json', '') AS packageDirectory
+      ,nameValue.value          AS packageName
+      ,npmDependency.name       AS dependencyName
+      ,npmDependency.dependency AS dependencyVersion
+ORDER BY packageName, dependencyName
diff --git a/cypher/External_Dependencies/Package_json_dependencies_combinations.cypher b/cypher/External_Dependencies/Package_json_dependencies_combinations.cypher
@@ -0,0 +1,18 @@
+// List most used combination of 2 and 3 dependencies
+
+ MATCH (package:NPM:Package)-[:DECLARES_DEPENDENCY]->(dependency:NPM:Dependency)
+// Sort before collecting, presumably so that equal combinations are built in the same element order and group together
+  WITH package.fileName      AS packageFileName
+      ,dependency.name       AS dependencyName
+ ORDER BY packageFileName, dependencyName
+  WITH packageFileName
+      ,apoc.coll.combinations(collect(dependencyName), 2, 3) AS dependencyCombinations
+UNWIND dependencyCombinations   AS dependencyCombination
+  WITH dependencyCombination
+      ,count(*) AS occurrences
+      ,collect(packageFileName) AS packages
+ WHERE occurrences > 1
+RETURN dependencyCombination
+      ,occurrences
+      ,packages[0..10] AS firstTenPackages // The upper bound of a list slice is exclusive
+ORDER BY occurrences DESC
diff --git a/cypher/External_Dependencies/Package_json_dependencies_combinations_with_versions.cypher b/cypher/External_Dependencies/Package_json_dependencies_combinations_with_versions.cypher
@@ -0,0 +1,19 @@
+// List most used combination of 2 and 3 dependencies including version specifier
+
+ MATCH (package:NPM:Package)-[:DECLARES_DEPENDENCY]->(dependency:NPM:Dependency)
+// Sort before collecting, presumably so that equal combinations are built in the same element order and group together
+  WITH package.fileName      AS packageFileName
+      ,dependency.name       AS dependencyName
+      ,dependency.dependency AS dependencyVersion
+ ORDER BY packageFileName, dependencyName
+  WITH packageFileName
+      ,apoc.coll.combinations(collect(dependencyName + ' ' + dependencyVersion), 2, 3) AS dependencyCombinations
+UNWIND dependencyCombinations   AS dependencyCombination
+  WITH dependencyCombination
+      ,count(*) AS occurrences
+      ,collect(packageFileName) AS packages
+ WHERE occurrences > 1
+RETURN dependencyCombination
+      ,occurrences
+      ,packages[0..10] AS firstTenPackages // The upper bound of a list slice is exclusive
+ORDER BY occurrences DESC
diff --git a/cypher/External_Dependencies/Package_json_dependencies_occurrence.cypher b/cypher/External_Dependencies/Package_json_dependencies_occurrence.cypher
@@ -0,0 +1,16 @@
+// List package.json dependencies by the number they are used by all packages
+
+ MATCH (npmPackage:NPM:Package)-[:DECLARES_DEPENDENCY]->(npmDependency:NPM:Dependency)
+ OPTIONAL MATCH (npmPackage)-[:CONTAINS]->(:Json:Object)-[:HAS_KEY]->(:Json:Key{name:'name'})-[:HAS_VALUE]->(nameValue:Json:Scalar:Value)
+// Project plain values first; the directory is the file name with '/npm-package-json/' and '/package.json' removed
+  WITH replace(replace(npmPackage.fileName, '/npm-package-json/', ''), '/package.json', '') AS directory
+      ,nameValue.value          AS declaringPackage
+      ,npmDependency.name       AS dependencyName
+      ,npmDependency.dependency AS versionSpecifier
+RETURN dependencyName
+      ,count(*)                          AS usingPackageCount
+      ,count(DISTINCT versionSpecifier)  AS dependencyVersionCount
+      ,collect(declaringPackage)[0..9]   AS packageNameExamples
+      ,collect(versionSpecifier)[0..4]   AS dependencyVersionExamples
+      ,collect(directory)[0..4]          AS packageDirectory
+ORDER BY usingPackageCount DESC
diff --git a/cypher/Overview/Words_for_git_author_Wordcloud_with_frequency.cypher b/cypher/Overview/Words_for_git_author_Wordcloud_with_frequency.cypher
@@ -0,0 +1,6 @@
+// Wordcloud of git authors and their commit count
+
+ MATCH (gitAuthor:Git:Author)-[:COMMITTED]-(gitCommit:Git:Commit)
+ WHERE size(gitAuthor.name) > 1 // Skip names with at most one character
+   AND NOT gitAuthor.name CONTAINS '[bot]' // Skip names containing '[bot]'
+RETURN gitAuthor.name AS word, count(gitCommit) AS frequency
diff --git a/cypher/Overview/Words_for_universal_Wordcloud.cypher b/cypher/Overview/Words_for_universal_Wordcloud.cypher
@@ -1,6 +1,6 @@
 // Words for universal Wordcloud
 
-MATCH (named:!Key&!Primitive&!PrimitiveType&!Void&!JavaType&!ResolvedDuplicateType&!ExternalType)
+MATCH (named:!Key&!Primitive&!PrimitiveType&!Void&!JavaType&!ResolvedDuplicateType&!ExternalType&!Git)
 WHERE named.name > ''
   AND named.name <> 'package-info'
   AND named.name <> '<init>'

diff --git a/cypher/Path_Finding/Path_Finding_5_All_pairs_shortest_path_distribution_per_project.cypher b/cypher/Path_Finding/Path_Finding_5_All_pairs_shortest_path_distribution_per_project.cypher
@@ -26,9 +26,14 @@ UNWIND sourcesAndTargets AS sourceAndTarget
 // Optionally get the project (e.g. Java Artifact, Typescript Project) the source and target belong to
 OPTIONAL MATCH (sourceProject:Artifact|Project)-[:CONTAINS]->(source)
 OPTIONAL MATCH (targetProject:Artifact|Project)-[:CONTAINS]->(target)
+// Optionally get the name of the scan that contained that project
+OPTIONAL MATCH (sourceScan:TS:Scan)-[:CONTAINS_PROJECT]->(sourceProject)
+OPTIONAL MATCH (targetScan:TS:Scan)-[:CONTAINS_PROJECT]->(targetProject)
 // Group by project name, if the target project is the same and the distance. Return those as result.
 RETURN sourceProject.name               AS sourceProject
+      ,sourceScan.name                  AS sourceScan
       ,(targetProject <> sourceProject) AS isDifferentTargetProject
+      ,(targetScan <> sourceScan)       AS isDifferentTargetScan
       ,distance
       ,distanceTotalPairCount
       ,distanceTotalSourceCount

diff --git a/cypher/Path_Finding/Path_Finding_5_All_pairs_shortest_path_examples.cypher b/cypher/Path_Finding/Path_Finding_5_All_pairs_shortest_path_examples.cypher
@@ -0,0 +1,23 @@
+// Path Finding - All pairs shortest path algorithm - Stream - Longest paths as examples
+
+  CALL gds.allShortestPaths.stream($dependencies_projection + '-cleaned')
+ YIELD sourceNodeId, targetNodeId, distance
+// Keep only connected pairs (finite distance) whose source and target are different nodes
+ WHERE gds.util.isFinite(distance)
+   AND sourceNodeId <> targetNodeId
+  WITH gds.util.asNode(sourceNodeId) AS source
+      ,gds.util.asNode(targetNodeId) AS target
+      ,toInteger(distance)           AS distance
+// Optionally get the project (e.g. Java Artifact, Typescript Project) the source belongs to
+OPTIONAL MATCH (sourceProject:Artifact|Project)-[:CONTAINS]->(source)
+// Optionally get the scan that contained that project
+OPTIONAL MATCH (sourceScan:TS:Scan)-[:CONTAINS_PROJECT]->(sourceProject)
+// Use the name of the scan if there is one, otherwise the name of the project
+  WITH source, target, distance, sourceProject, sourceScan
+      ,coalesce(sourceScan, sourceProject).name AS sourceContainerName
+ ORDER BY distance DESC, sourceContainerName ASC
+// Only output the top 10 entries
+ LIMIT 10
+// Get one shortest path between the source and the target node
+ MATCH path = SHORTEST 1 (source)-[:DEPENDS_ON]->+(target)
+RETURN distance, sourceContainerName, sourceProject, sourceScan, path
diff --git a/cypher/Path_Finding/Path_Finding_6_Longest_paths_distribution_per_project.cypher b/cypher/Path_Finding/Path_Finding_6_Longest_paths_distribution_per_project.cypher
@@ -24,9 +24,14 @@ UNWIND sourcesAndTargets AS sourceAndTarget
 // Optionally get the project (e.g. Java Artifact, Typescript Project) the source and target belong to
 OPTIONAL MATCH (sourceProject:Artifact|Project)-[:CONTAINS]->(source)
 OPTIONAL MATCH (targetProject:Artifact|Project)-[:CONTAINS]->(target)
+// Optionally get the name of the scan that contained that project
+OPTIONAL MATCH (sourceScan:TS:Scan)-[:CONTAINS_PROJECT]->(sourceProject)
+OPTIONAL MATCH (targetScan:TS:Scan)-[:CONTAINS_PROJECT]->(targetProject)
 // Group by project name, if the target project is the same and the distance. Return those as result.
 RETURN sourceProject.name               AS sourceProject
+      ,sourceScan.name                  AS sourceScan
       ,(targetProject <> sourceProject) AS isDifferentTargetProject
+      ,(targetScan <> sourceScan)       AS isDifferentTargetScan
       ,distance
       ,distanceTotalPairCount
       ,distanceTotalSourceCount

diff --git a/cypher/Path_Finding/Path_Finding_6_Longest_paths_examples.cypher b/cypher/Path_Finding/Path_Finding_6_Longest_paths_examples.cypher
@@ -0,0 +1,25 @@
+// Path Finding - Longest path - Stream - Max. paths as examples
+
+  CALL gds.dag.longestPath.stream($dependencies_projection + '-cleaned')
+ YIELD index, sourceNode, targetNode, totalCost, path
+  WITH index
+      ,path
+      ,toInteger(totalCost)          AS distance
+      ,sourceNode                    AS sourceNodeId
+      ,targetNode                    AS targetNodeId
+ WHERE sourceNodeId  <> targetNodeId // Skip entries where source and target are the same node
+  WITH *
+      ,gds.util.asNode(sourceNodeId) AS source
+// Optionally get the project (e.g. Java Artifact, Typescript Project) the source belongs to
+OPTIONAL MATCH (sourceProject:Artifact|Project)-[:CONTAINS]->(source)
+// Optionally get the scan that contained that project
+OPTIONAL MATCH (sourceScan:TS:Scan)-[:CONTAINS_PROJECT]->(sourceProject)
+// The project and scan of the target are not matched here: they are not part of the result
+// and additional optional matches could only multiply otherwise identical rows before the limit.
+// Use the name of the scan if there is one, otherwise the name of the project
+   WITH *, coalesce(sourceScan, sourceProject).name AS sourceContainerName
+ ORDER BY distance DESC, sourceContainerName ASC
+// Only output the top 10 entries
+ LIMIT 10
+// Return the paths with the highest distance together with the container of their source node
+RETURN distance, index, sourceContainerName, sourceProject, sourceScan, path
diff --git a/cypher/Typescript_Enrichment/Add_name_to_property_on_scan_nodes.cypher b/cypher/Typescript_Enrichment/Add_name_to_property_on_scan_nodes.cypher
@@ -2,6 +2,8 @@
 
  MATCH (typescriptScan:TS:Scan)
   WITH  typescriptScan
-       ,replace(reverse(split(reverse(typescriptScan.fileName), '/')[0]), '.json', '') AS scanName
+       ,reverse(split(reverse(split(typescriptScan.fileName, '/.reports/')[0]), '/')[0]) AS scanName // Last path segment of the part in front of '/.reports/'
    SET  typescriptScan.name = scanName
-RETURN count(*) AS numberOfNamesScans
+RETURN count(*) AS numberOfNamesScans
+// Debugging
+//RETURN scanName, typescriptScan.fileName