Add anomaly detection Markdown summary report

JohT · JohT · commit 1733c483fbd0 · 2025-09-27T12:43:04.000+02:00
diff --git a/domains/anomaly-detection/anomalyDetectionCsv.sh b/domains/anomaly-detection/anomalyDetectionCsv.sh
@@ -70,6 +70,8 @@ anomaly_detection_features() {
 # Required Parameters:
 # - projection_node_label=...
 #   Label of the nodes that will be used for the projection. Example: "Package"
+# - projection_language=...
+#   Name of the associated programming language. Default: "Java". Example: "Typescript"
 anomaly_detection_queries() {
     local nodeLabel
     nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )
@@ -95,6 +97,8 @@ anomaly_detection_queries() {
 # Required Parameters:
 # - projection_node_label=...
 #   Label of the nodes that will be used for the projection. Example: "Package"
+# - projection_language=...
+#   Name of the associated programming language. Examples: "Java", "Typescript"
 anomaly_detection_labels() {
     local nodeLabel
     nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )
@@ -121,6 +125,8 @@ anomaly_detection_labels() {
 #   Label of the nodes that will be used for the projection. Example: "Package"
 # - projection_weight_property=...
 #   Name of the node property that contains the dependency weight. Example: "weight"
+# - projection_language=...
+#   Name of the associated programming language. Examples: "Java", "Typescript"
 anomaly_detection_csv_reports() {
     time anomaly_detection_features "${@}"
     time anomaly_detection_queries "${@}"
diff --git a/domains/anomaly-detection/summary/AnomaliesDeepDiveOverview.cypher b/domains/anomaly-detection/summary/AnomaliesDeepDiveOverview.cypher
@@ -0,0 +1,20 @@
+// Anomaly Detection DeepDive: Overview of the number of anomalies detected for code units with the given node label that have incoming or outgoing dependencies: sum of anomalyLabel plus, per archetype (Authority, Bottleneck, Bridge, Hub, Outlier), the sum of the sign of its rank property. Requires all other labels/*.cypher queries to run first. Variables: projection_node_label (projection_language is not referenced by this query)
+
+MATCH (codeUnit)
+WHERE $projection_node_label IN labels(codeUnit)
+  AND (codeUnit.incomingDependencies IS NOT NULL 
+   OR  codeUnit.outgoingDependencies IS NOT NULL)
+  WITH sum(codeUnit.anomalyLabel)                AS anomalyCount
+      ,sum(sign(codeUnit.anomalyAuthorityRank))  AS authorityCount
+      ,sum(sign(codeUnit.anomalyBottleneckRank)) AS bottleNeckCount
+      ,sum(sign(codeUnit.anomalyBridgeRank))     AS bridgeCount
+      ,sum(sign(codeUnit.anomalyHubRank))        AS hubCount
+      ,sum(sign(codeUnit.anomalyOutlierRank))    AS outlierCount
+      //,collect(codeUnit.name)[0..4]              AS exampleNames
+ RETURN anomalyCount     AS `Anomalies`
+       ,authorityCount   AS `Authorities`
+       ,bottleNeckCount  AS `Bottlenecks`
+       ,bridgeCount      AS `Bridges`
+       ,hubCount         AS `Hubs`
+       ,outlierCount     AS `Outliers`
+      //,exampleNames
diff --git a/domains/anomaly-detection/summary/AnomaliesInTotal.cypher b/domains/anomaly-detection/summary/AnomaliesInTotal.cypher
@@ -0,0 +1,22 @@
+// Anomaly Detection Summary: Overview of all analyzed code units in total (one single row across all node labels) for nodes that have incoming or outgoing dependencies. Requires all other labels/*.cypher queries to run first. Variables: none (projection_language and projection_node_label are not referenced by this query)
+
+MATCH (codeUnit)
+WHERE (codeUnit.incomingDependencies IS NOT NULL 
+   OR  codeUnit.outgoingDependencies IS NOT NULL)
+  WITH count(DISTINCT codeUnit)                  AS codeUnitCount
+      ,sum(codeUnit.anomalyLabel)                AS anomalyCount
+      ,sum(sign(codeUnit.anomalyAuthorityRank))  AS authorityCount
+      ,sum(sign(codeUnit.anomalyBottleneckRank)) AS bottleNeckCount
+      ,sum(sign(codeUnit.anomalyBridgeRank))     AS bridgeCount
+      ,sum(sign(codeUnit.anomalyHubRank))        AS hubCount
+      ,sum(sign(codeUnit.anomalyOutlierRank))    AS outlierCount
+      //,collect(codeUnit.name)[0..4]  AS exampleNames
+ RETURN codeUnitCount    AS `Analyzed Units`
+       ,anomalyCount     AS `Anomalies`
+       ,authorityCount   AS `Authorities`
+       ,bottleNeckCount  AS `Bottlenecks`
+       ,bridgeCount      AS `Bridges`
+       ,hubCount         AS `Hubs`
+       ,outlierCount     AS `Outliers`
+       //,exampleNames
+ ORDER BY anomalyCount DESC, codeUnitCount DESC
diff --git a/domains/anomaly-detection/summary/AnomaliesPerAbstractionLayer.cypher b/domains/anomaly-detection/summary/AnomaliesPerAbstractionLayer.cypher
@@ -0,0 +1,30 @@
+// Anomaly Detection Summary: Overview of analyzed code units and the number of anomalies detected, grouped by the comma-separated node labels of the code units (labels starting with 'Mark4' as well as 'File', 'Directory', 'ByteCode' and 'GenericDeclaration' are left out). Requires all other labels/*.cypher queries to run first. Variables: none (projection_language and projection_node_label are not referenced by this query)
+
+MATCH (codeUnit)
+WHERE (codeUnit.incomingDependencies IS NOT NULL 
+   OR  codeUnit.outgoingDependencies IS NOT NULL)
+UNWIND labels(codeUnit) AS codeUnitLabel
+  WITH *
+ WHERE NOT codeUnitLabel STARTS WITH 'Mark4'
+   AND NOT codeUnitLabel IN ['File', 'Directory', 'ByteCode', 'GenericDeclaration']
+  WITH collect(codeUnitLabel) AS codeUnitLabels
+      ,codeUnit
+  WITH apoc.text.join(codeUnitLabels, ',')       AS codeUnitLabels
+      ,count(DISTINCT codeUnit)                  AS codeUnitCount
+      ,sum(codeUnit.anomalyLabel)                AS anomalyCount
+      ,sum(sign(codeUnit.anomalyAuthorityRank))  AS authorityCount
+      ,sum(sign(codeUnit.anomalyBottleneckRank)) AS bottleNeckCount
+      ,sum(sign(codeUnit.anomalyBridgeRank))     AS bridgeCount
+      ,sum(sign(codeUnit.anomalyHubRank))        AS hubCount
+      ,sum(sign(codeUnit.anomalyOutlierRank))    AS outlierCount
+      //,collect(codeUnit.name)[0..4]  AS exampleNames
+ RETURN codeUnitLabels   AS `Abstraction Level`
+       ,codeUnitCount    AS `Units`
+       ,anomalyCount     AS `Anomalies`
+       ,authorityCount   AS `Authorities`
+       ,bottleNeckCount  AS `Bottlenecks`
+       ,bridgeCount      AS `Bridges`
+       ,hubCount         AS `Hubs`
+       ,outlierCount     AS `Outliers`
+       //,exampleNames
+ ORDER BY anomalyCount DESC, codeUnitCount DESC
diff --git a/domains/anomaly-detection/summary/AnomalyDetectionReportTopArchetypes.cypher b/domains/anomaly-detection/summary/AnomalyDetectionReportTopArchetypes.cypher
@@ -0,0 +1,38 @@
+// Anomaly Detection Summary: Lists all code units with the given node label that have at least one archetype rank property (anomaly...Rank), including their anomaly score, archetype, archetype rank and top features. For code units with more than one archetype, the one with the lowest rank number is shown. Rows are sorted directly before the aggregation and again in RETURN, since row order is not guaranteed to survive intermediate clauses. Requires all other labels/*.cypher queries to run first. Variables: projection_node_label (projection_language is only used in a commented-out column)
+
+ MATCH (codeUnit)
+ WHERE $projection_node_label IN labels(codeUnit)
+UNWIND keys(codeUnit) AS codeUnitProperty
+  WITH *
+ WHERE codeUnitProperty STARTS WITH 'anomaly'
+   AND codeUnitProperty ENDS   WITH 'Rank'
+  WITH *
+      ,coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName
+      ,split(split(codeUnitProperty, 'anomaly')[1], 'Rank')[0] AS archetype
+      ,codeUnit[codeUnitProperty]                              AS archetypeRank
+      ,codeUnit.anomalyScore                                   AS anomalyScore
+OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit)
+    WITH *, artifact.name                                             AS artifactName
+OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit)
+    WITH *, last(split(projectRoot.absoluteFileName, '/'))            AS typescriptProjectName
+OPTIONAL MATCH (codeDirectory:File:Directory)-[:CONTAINS]->(codeUnit)
+    WITH *, split(replace(codeDirectory.fileName, './', ''), '/')[-2] AS directoryName
+    WITH *, coalesce(artifactName, typescriptProjectName, directoryName, "") AS projectName
+ ORDER BY codeUnit.anomalyScore DESC, archetypeRank ASC, codeUnitName ASC, archetype ASC // determines which archetype collect(...)[0] picks
+RETURN projectName                                                            AS `Contained in`
+      //$projection_language + ' ' +  $projection_node_label                  AS `Code Unit`
+      ,codeUnitName                                                           AS `Name`
+      ,round(anomalyScore, 4, 'HALF_UP')                                      AS `Score`
+      ,collect(archetype)[0]                                                  AS `Archetype`
+      ,collect(archetypeRank)[0]                                              AS `Archetype Rank`
+      ,nullif(codeUnit.anomalyTopFeature1, "")                                AS `Top Feature 1`
+      ,nullif(round(codeUnit.anomalyTopFeatureSHAPValue1, 4, 'HALF_UP'), 0.0) AS `Top Feature 1 SHAP`
+      ,nullif(codeUnit.anomalyTopFeature2, "")                                AS `Top Feature 2`
+      ,nullif(round(codeUnit.anomalyTopFeatureSHAPValue2, 4, 'HALF_UP'), 0.0) AS `Top Feature 2 SHAP`
+      ,nullif(codeUnit.anomalyTopFeature3, "")                                AS `Top Feature 3`
+      ,nullif(round(codeUnit.anomalyTopFeatureSHAPValue3, 4, 'HALF_UP'), 0.0) AS `Top Feature 3 SHAP`
+      ,CASE WHEN codeUnit.anomalyScore <= 0          THEN 'Typical'
+            WHEN codeUnit.anomalyTopFeature1 IS NULL THEN 'Undetermined'
+                                                     ELSE 'Anomalous'     END AS `Model Status`
+      //,collect(archetype)[1] AS secondaryArchetype, collect(archetypeRank)[1] AS secondaryArchetypeRank
+ ORDER BY `Score` DESC, `Archetype Rank` ASC, `Name` ASC
diff --git a/domains/anomaly-detection/summary/anomalyDetectionSummary.sh b/domains/anomaly-detection/summary/anomalyDetectionSummary.sh
@@ -0,0 +1,141 @@
+#!/usr/bin/env bash
+
+# Creates a Markdown report that contains all results of all the anomaly detection methods.
+# It requires an already running Neo4j graph database with already scanned and analyzed artifacts.
+# The results will be written into the sub directory reports/anomaly-detection.
+
+# Note that "scripts/prepareAnalysis.sh" is required to run prior to this script.
+# Note that either "anomalyDetectionCsv.sh" or "anomalyDetectionPython.sh" is required to run prior to this script.
+
+# Requires executeQueryFunctions.sh, projectionFunctions.sh, cleanupAfterReportGeneration.sh
+
+# Fail on any error ("-e" = exit on first error, "-o pipefail" exist on errors within piped commands)
+set -o errexit -o pipefail
+
+# Overrideable Constants (defaults also defined in sub scripts)
+REPORTS_DIRECTORY=${REPORTS_DIRECTORY:-"reports"}
+MARKDOWN_INCLUDES_DIRECTORY=${MARKDOWN_INCLUDES_DIRECTORY:-"includes"}
+
+## Get this "domains/anomaly-detection/summary" directory if not already set
+# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution.
+# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
+# This way non-standard tools like readlink aren't needed.
+ANOMALY_DETECTION_SUMMARY_DIR=${ANOMALY_DETECTION_SUMMARY_DIR:-$(CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)}
+#echo "anomalyDetectionSummary: ANOMALY_DETECTION_SUMMARY_DIR=${ANOMALY_DETECTION_SUMMARY_DIR}"
+# Get the "scripts" directory by taking the path of this script and going one directory up.
+SCRIPTS_DIR=${SCRIPTS_DIR:-"${ANOMALY_DETECTION_SUMMARY_DIR}/../../../scripts"} # Repository directory containing the shell scripts
+
+MARKDOWN_INCLUDES_DIRECTORY="includes"
+MARKDOWN_SCRIPTS_DIR=${MARKDOWN_SCRIPTS_DIR:-"${SCRIPTS_DIR}/markdown"}
+#echo "anomalyDetectionSummary: MARKDOWN_SCRIPTS_DIR=${MARKDOWN_SCRIPTS_DIR}" >&2
+
+# Define functions to execute a cypher query from within a given file (first and only argument) like "execute_cypher" and "execute_cypher_summarized"
+source "${SCRIPTS_DIR}/executeQueryFunctions.sh"
+
+# Appends a Markdown table to an existing file and
+# removes redundant header + separator rows.
+#
+# Usage:
+#   cat newTable.md | append_table myMarkdownFile.md
+#
+#   append_table myMarkdownFile.md <<'EOF'
+#   | Name | Score | Archetype |
+#   | ---  | ---   | ---       |
+#   | Bar  | 0.9   | Something |
+#   EOF
+#
+# Behavior:
+#   - Keeps the first header row and its following separator row.
+#   - Removes all subsequent duplicate header + separator pairs.
+#   - Leaves all data rows untouched.
+append_to_markdown_table() {
+  local file="$1"
+
+  # Append stdin to the target file
+  cat >> "${file}"
+  
+  # Clean up duplicate headers (header row + --- row)
+  awk '!seen[$0]++ || NR <= 2' "${file}" > "${file}.tmp" && mv "${file}.tmp" "${file}"
+}
+
+# Run the anomaly detection main report generation.
+anomaly_detection_report_first_section() {
+    local report_markdown_includes_directory="${FULL_REPORT_DIRECTORY}/${MARKDOWN_INCLUDES_DIRECTORY}"
+    mkdir -p "${report_markdown_includes_directory}"
+    
+    execute_cypher "${ANOMALY_DETECTION_SUMMARY_DIR}/AnomaliesPerAbstractionLayer.cypher" --output-markdown-table > "${report_markdown_includes_directory}/AnomaliesPerAbstractionLayer.md"
+    execute_cypher "${ANOMALY_DETECTION_SUMMARY_DIR}/AnomaliesInTotal.cypher" --output-markdown-table > "${report_markdown_includes_directory}/AnomaliesInTotal.md"
+}
+
+# Aggregates all results in a Markdown report.
+#
+# Required Parameters:
+# - projection_node_label=...
+#   Label of the nodes that will be used for the projection. Example: "Package"
+# - projection_language=...
+#   Name of the associated programming language. Examples: "Java", "Typescript"
+anomaly_detection_deep_dive_report() {
+    local nodeLabel
+    nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )
+    
+    local language
+    language=$( extractQueryParameter "projection_language" "${@}" )
+    
+    local report_number
+    report_number=$( extractQueryParameter "report_number" "${@}" )
+    
+    echo "anomalyDetectionSummary: $(date +'%Y-%m-%dT%H:%M:%S%z') Creating ${language} ${nodeLabel} anomaly summary Markdown report..."
+    
+    anomaly_summary_directory="${FULL_REPORT_DIRECTORY}/anomaly_summary_${language}_${nodeLabel}"
+    mkdir -p "${anomaly_summary_directory}/${MARKDOWN_INCLUDES_DIRECTORY}"
+    
+    # TODO 2.{number of report}
+    echo "### 2.${report_number} ${language} ${nodeLabel}" > "${anomaly_summary_directory}/${MARKDOWN_INCLUDES_DIRECTORY}/DeepDiveSectionTitle.md"
+
+    execute_cypher "${ANOMALY_DETECTION_SUMMARY_DIR}/AnomaliesDeepDiveOverview.cypher" "${@}" --output-markdown-table > "${anomaly_summary_directory}/${MARKDOWN_INCLUDES_DIRECTORY}/DeepDiveOverview.md"
+    execute_cypher "${ANOMALY_DETECTION_SUMMARY_DIR}/AnomalyDetectionReportTopArchetypes.cypher" "${@}" --output-markdown-table > "${anomaly_summary_directory}/${MARKDOWN_INCLUDES_DIRECTORY}/TopAnomaliesByArchetype.md"
+    
+    # Use Markdown template to assemble the final deep dive section of the Markdown report
+    cp -f "${ANOMALY_DETECTION_SUMMARY_DIR}/report-deep-dive-template.md" "${anomaly_summary_directory}"
+    cat "${anomaly_summary_directory}/report-deep-dive-template.md" | "${MARKDOWN_SCRIPTS_DIR}/embedMarkdownIncludes.sh" "${anomaly_summary_directory}/${MARKDOWN_INCLUDES_DIRECTORY}" > "${anomaly_summary_directory}/report-deep-dive.md"
+    rm -rf "${anomaly_summary_directory}/report-deep-dive-template.md"
+
+    # Clean-up after report generation. Empty reports will be deleted.
+    source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${anomaly_summary_directory}"
+}
+
+# Runs the anomaly detection deep dive report generation and reports its duration (shell keyword "time").
+# Required Parameters (all arguments are passed on unchanged to anomaly_detection_deep_dive_report):
+# - projection_node_label=...
+#   Label of the nodes that will be used for the projection. Example: "Package"
+# - projection_language=...
+#   Name of the associated programming language. Examples: "Java", "Typescript"
+# - report_number=... (read by anomaly_detection_deep_dive_report for the section heading). Example: "1"
+anomaly_detection_report() {
+    time anomaly_detection_deep_dive_report "${@}"
+}
+
+# Create the report directory "<REPORTS_DIRECTORY>/anomaly-detection"
+REPORT_NAME="anomaly-detection"
+FULL_REPORT_DIRECTORY="${REPORTS_DIRECTORY}/${REPORT_NAME}"
+mkdir -p "${FULL_REPORT_DIRECTORY}"
+
+# Query parameter keys. They are combined with their values to "key=value" arguments in the calls below.
+ALGORITHM_NODE="projection_node_label"
+ALGORITHM_LANGUAGE="projection_language"
+REPORT_NUMBER="report_number"
+
+# -- Overview Report for all code types ------------------------------
+
+anomaly_detection_report_first_section
+
+# -- Detail Reports for each code type -------------------------------
+
+anomaly_detection_report "${REPORT_NUMBER}=1" "${ALGORITHM_NODE}=Artifact" "${ALGORITHM_LANGUAGE}=Java"
+anomaly_detection_report "${REPORT_NUMBER}=2" "${ALGORITHM_NODE}=Package" "${ALGORITHM_LANGUAGE}=Java"
+anomaly_detection_report "${REPORT_NUMBER}=3" "${ALGORITHM_NODE}=Type" "${ALGORITHM_LANGUAGE}=Java"
+anomaly_detection_report "${REPORT_NUMBER}=4" "${ALGORITHM_NODE}=Module" "${ALGORITHM_LANGUAGE}=Typescript"
+
+# ---------------------------------------------------------------
+
+echo "anomalyDetectionSummary: $(date +'%Y-%m-%dT%H:%M:%S%z') Successfully finished."
diff --git a/domains/anomaly-detection/summary/report-deep-dive-template.md b/domains/anomaly-detection/summary/report-deep-dive-template.md
@@ -0,0 +1,40 @@
+<!-- include:DeepDiveSectionTitle.md -->
+
+#### Metrics and Features
+
+* Degree (in/out)
+* PageRank, ArticleRank, PageRank–ArticleRank difference
+* Betweenness centrality
+* Local clustering coefficient
+* Cluster metrics (distance-to-medoid, average/max radius, outlier score)
+* Node embeddings (PCA-reduced)
+
+#### Anomaly Results
+
+##### Total anomalies
+
+<!-- include:DeepDiveOverview.md -->
+
+* **Top contributing features (via SHAP):**
+
+  * {{FEATURE\_1}}
+  * {{FEATURE\_2}}
+  * {{FEATURE\_3}}
+
+#### Archetype Distribution
+
+| Archetype          | Count | Example Node(s)   |
+| ------------------ | ----- | ----------------- |
+| Hub (Consumer)     | {{X}} | {{NODE\_EXAMPLE}} |
+| Hub (Provider)     | {{X}} | {{NODE\_EXAMPLE}} |
+| Bottleneck (Embed) | {{X}} | {{NODE\_EXAMPLE}} |
+| Outlier (Misfit)   | {{X}} | {{NODE\_EXAMPLE}} |
+
+#### Plots
+
+* SHAP summary plots
+* Betweenness distribution histogram
+* Scatter: PageRank vs Clustering Coefficient
+* Cluster outlier visualization
+
+---
diff --git a/domains/anomaly-detection/summary/report-template.md b/domains/anomaly-detection/summary/report-template.md
@@ -0,0 +1,60 @@
+# 📊 Anomaly Detection Report
+
+## 1. Executive Overview
+
+This report analyzes structural and dependency anomalies across multiple abstraction levels of the codebase.
+The goal is to detect potential **software quality, design, and architecture issues** using graph-based features, anomaly detection (Isolation Forest), and SHAP explainability.
+
+### 1.1 Overview of Analyzed Structures
+
+<!-- include:AnomaliesPerAbstractionLayer.md -->
+
+### 1.2 Anomalies in total
+
+<!-- include:AnomaliesInTotal.md -->
+
+## 2. Deep Dives by Abstraction Level
+
+<!-- include:AnomalyDetectionDeepDives.md -->
+
+## 3. Taxonomy of Anomaly Archetypes
+
+| Archetype | Feature Profile | Risk for Architecture |
+|-----------|----------------|------------------------|
+| **Hub** | High degree, low clustering coefficient | Central dependency, fragile hotspot |
+| **Bottleneck** | High betweenness, low redundancy | Single point of failure, slows evolution |
+| **Outlier** | High cluster distance, small cluster size | Misfit component, unusual dependency pattern |
+| **Authority** | High PageRank but low ArticleRank | Over-relied utility with few reverse connections |
+| **Bridge** | Embedding-driven anomaly, cross-cluster | Connects unrelated domains, risky coupling |
+
+---
+
+## 4. Recommendations
+
+* **Refactor hubs:** Break down god classes/utilities into smaller abstractions.
+* **Mitigate bottlenecks:** Add redundancy or alternative paths.
+* **Investigate outliers:** Validate if they are justified exceptions or design flaws.
+* **Enforce cohesion:** Raise clustering coefficient via better modular boundaries.
+* **Stabilize authorities:** Encapsulate widely used but locally weak components, reduce over-generalization, and ensure stable APIs.  
+* **Clarify bridges:** Validate whether cross-cluster connectors are intentional (adapters/facades) or accidental; refactor or relocate responsibilities to preserve modularity.
+
+---
+
+## 5. Appendix
+
+* **Methodology:** Isolation Forest, Random Forest proxy, SHAP explanations.
+* **Embedding generation:** Fast Random Projection, PCA (20–35 dims, \~0.9 target variance).
+* **Clustering:** HDBSCAN tuned against Leiden communities (golden reference, AMI optimization).
+* **Optimization:** Hyperparameter optimization for both Isolation Forest and Random Forest proxy based on their F1 score.
+* **Feature set:**
+  * Degree
+  * PageRank
+  * ArticleRank
+  * Page-to-Article Rank Difference
+  * Betweenness
+  * Local Clustering Coefficient
+  * Cluster Approximate Outlier Score ( = 1.0 - Cluster Probability)
+  * Cluster Radius Average
+  * Cluster Distance to Medoid
+  * Cluster Size
+  * Node Embedding
diff --git a/scripts/cleanupAfterReportGeneration.sh b/scripts/cleanupAfterReportGeneration.sh