Skip to content

Commit 1733c48

Browse files
committed
Add anomaly detection Markdown summary report
1 parent fe9f6a1 commit 1733c48

9 files changed

+367
-0
lines changed

domains/anomaly-detection/anomalyDetectionCsv.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@ anomaly_detection_features() {
7070
# Required Parameters:
7171
# - projection_node_label=...
7272
# Label of the nodes that will be used for the projection. Example: "Package"
73+
# - projection_language=...
74+
# Name of the associated programming language. Default: "Java". Example: "Typescript"
7375
anomaly_detection_queries() {
7476
local nodeLabel
7577
nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )
@@ -95,6 +97,8 @@ anomaly_detection_queries() {
9597
# Required Parameters:
9698
# - projection_node_label=...
9799
# Label of the nodes that will be used for the projection. Example: "Package"
100+
# - projection_language=...
101+
# Name of the associated programming language. Examples: "Java", "Typescript"
98102
anomaly_detection_labels() {
99103
local nodeLabel
100104
nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )
@@ -121,6 +125,8 @@ anomaly_detection_labels() {
121125
# Label of the nodes that will be used for the projection. Example: "Package"
122126
# - projection_weight_property=...
123127
# Name of the node property that contains the dependency weight. Example: "weight"
128+
# - projection_language=...
129+
# Name of the associated programming language. Examples: "Java", "Typescript"
124130
anomaly_detection_csv_reports() {
125131
time anomaly_detection_features "${@}"
126132
time anomaly_detection_queries "${@}"
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
// Anomaly Detection DeepDive: Overview of analyzed code units and the number of anomalies detected. Requires all other labels/*.cypher queries to run first. Variables: projection_language, projection_node_label

// Select all nodes that carry the requested label and have at least one of the two dependency properties set.
 MATCH (codeUnit)
 WHERE $projection_node_label IN labels(codeUnit)
   AND NOT (codeUnit.incomingDependencies IS NULL
        AND codeUnit.outgoingDependencies IS NULL)
// Aggregate everything into a single result row.
// sign(rank) contributes 1 for every positive rank, 0 for a rank of zero and is skipped for missing ranks.
RETURN sum(codeUnit.anomalyLabel)                 AS `Anomalies`
      ,sum(sign(codeUnit.anomalyAuthorityRank))   AS `Authorities`
      ,sum(sign(codeUnit.anomalyBottleneckRank))  AS `Bottlenecks`
      ,sum(sign(codeUnit.anomalyBridgeRank))      AS `Bridges`
      ,sum(sign(codeUnit.anomalyHubRank))         AS `Hubs`
      ,sum(sign(codeUnit.anomalyOutlierRank))     AS `Outliers`
    //,collect(codeUnit.name)[0..4]               AS exampleNames
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
// Anomaly Detection Summary: Overview of all analyzed code units in total. Requires all other labels/*.cypher queries to run first. Variables: projection_language, projection_node_label

// Consider every node (regardless of its label) that has at least one of the two dependency properties set.
 MATCH (codeUnit)
 WHERE NOT (codeUnit.incomingDependencies IS NULL
        AND codeUnit.outgoingDependencies IS NULL)
// Aggregate all matched nodes into one single row of totals.
// sign(rank) contributes 1 for every positive rank, so the sums below count the nodes per archetype.
  WITH count(DISTINCT codeUnit)                   AS analyzedUnits
      ,sum(codeUnit.anomalyLabel)                 AS anomalies
      ,sum(sign(codeUnit.anomalyAuthorityRank))   AS authorities
      ,sum(sign(codeUnit.anomalyBottleneckRank))  AS bottlenecks
      ,sum(sign(codeUnit.anomalyBridgeRank))      AS bridges
      ,sum(sign(codeUnit.anomalyHubRank))         AS hubs
      ,sum(sign(codeUnit.anomalyOutlierRank))     AS outliers
    //,collect(codeUnit.name)[0..4]               AS exampleNames
RETURN analyzedUnits AS `Analyzed Units`
      ,anomalies     AS `Anomalies`
      ,authorities   AS `Authorities`
      ,bottlenecks   AS `Bottlenecks`
      ,bridges       AS `Bridges`
      ,hubs          AS `Hubs`
      ,outliers      AS `Outliers`
    //,exampleNames
 ORDER BY anomalies DESC, analyzedUnits DESC
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
// Anomaly Detection Summary: Overview of analyzed code units and the number of anomalies detected. Requires all other labels/*.cypher queries to run first. Variables: projection_language, projection_node_label

// Consider every node that has at least one of the two dependency properties set.
 MATCH (codeUnit)
 WHERE NOT (codeUnit.incomingDependencies IS NULL
        AND codeUnit.outgoingDependencies IS NULL)
// Keep only the labels that are neither prefixed with 'Mark4' nor part of the excluded list.
  WITH codeUnit
      ,[label IN labels(codeUnit)
         WHERE NOT (label STARTS WITH 'Mark4'
                 OR label IN ['File', 'Directory', 'ByteCode', 'GenericDeclaration'])
       ] AS relevantLabels
// Nodes without any remaining label are left out of the overview.
 WHERE size(relevantLabels) > 0
// Group by the comma separated label combination and count the nodes per archetype.
// sign(rank) contributes 1 for every positive rank.
  WITH apoc.text.join(relevantLabels, ',')        AS abstractionLevel
      ,count(DISTINCT codeUnit)                   AS units
      ,sum(codeUnit.anomalyLabel)                 AS anomalies
      ,sum(sign(codeUnit.anomalyAuthorityRank))   AS authorities
      ,sum(sign(codeUnit.anomalyBottleneckRank))  AS bottlenecks
      ,sum(sign(codeUnit.anomalyBridgeRank))      AS bridges
      ,sum(sign(codeUnit.anomalyHubRank))         AS hubs
      ,sum(sign(codeUnit.anomalyOutlierRank))     AS outliers
    //,collect(codeUnit.name)[0..4]               AS exampleNames
RETURN abstractionLevel AS `Abstraction Level`
      ,units            AS `Units`
      ,anomalies        AS `Anomalies`
      ,authorities      AS `Authorities`
      ,bottlenecks      AS `Bottlenecks`
      ,bridges          AS `Bridges`
      ,hubs             AS `Hubs`
      ,outliers         AS `Outliers`
    //,exampleNames
 ORDER BY anomalies DESC, units DESC
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
// Anomaly Detection Labels: Summarizes all labelled archetypes by their anomaly score including their archetype rank. For code units with more than one archetype, the one with the higher rank is shown. Requires all other labels/*.cypher queries to run first. Variables: projection_language, projection_node_label

 MATCH (codeUnit)
 WHERE $projection_node_label IN labels(codeUnit)
// Expand the property keys so that every "anomaly...Rank" property gets its own row.
UNWIND keys(codeUnit) AS codeUnitProperty
  WITH *
 WHERE codeUnitProperty STARTS WITH 'anomaly'
   AND codeUnitProperty ENDS WITH 'Rank'
// The archetype name is the property name without the "anomaly" prefix and the "Rank" suffix.
  WITH *
      ,coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName
      ,split(split(codeUnitProperty, 'anomaly')[1], 'Rank')[0] AS archetype
      ,codeUnit[codeUnitProperty]                              AS archetypeRank
      ,codeUnit.anomalyScore                                   AS anomalyScore
 ORDER BY codeUnit.anomalyScore DESC, archetypeRank ASC, codeUnitName ASC, archetype ASC
// Resolve the name of the container of the code unit. Three alternatives are tried:
// a Java artifact, a project (TS:Project) root directory and a plain directory.
OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit)
  WITH *, artifact.name AS artifactName
OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit)
// Uses a dedicated variable name here. The final "projectName" is declared below
// and must not collide with a variable that is already carried along by "WITH *".
  WITH *, last(split(projectRoot.absoluteFileName, '/')) AS typescriptProjectName
OPTIONAL MATCH (codeDirectory:File:Directory)-[:CONTAINS]->(codeUnit)
  WITH *, split(replace(codeDirectory.fileName, './', ''), '/')[-2] AS directoryName
// The first non-null container name is taken. Falls back to an empty string.
  WITH *, coalesce(artifactName, typescriptProjectName, directoryName, "") AS projectName
RETURN projectName AS `Contained in`
     //$projection_language + ' ' + $projection_node_label AS `Code Unit`
      ,codeUnitName                       AS `Name`
      ,round(anomalyScore, 4, 'HALF_UP')  AS `Score`
      // Only the first collected archetype and its rank are reported per code unit.
      ,collect(archetype)[0]              AS `Archetype`
      ,collect(archetypeRank)[0]          AS `Archetype Rank`
      // Empty feature names and SHAP values of 0.0 are reported as null.
      ,nullif(codeUnit.anomalyTopFeature1, "") AS `Top Feature 1`
      ,nullif(round(codeUnit.anomalyTopFeatureSHAPValue1, 4, 'HALF_UP'), 0.0) AS `Top Feature 1 SHAP`
      ,nullif(codeUnit.anomalyTopFeature2, "") AS `Top Feature 2`
      ,nullif(round(codeUnit.anomalyTopFeatureSHAPValue2, 4, 'HALF_UP'), 0.0) AS `Top Feature 2 SHAP`
      ,nullif(codeUnit.anomalyTopFeature3, "") AS `Top Feature 3`
      ,nullif(round(codeUnit.anomalyTopFeatureSHAPValue3, 4, 'HALF_UP'), 0.0) AS `Top Feature 3 SHAP`
      ,CASE WHEN codeUnit.anomalyScore <= 0             THEN 'Typical'
            WHEN codeUnit.anomalyTopFeature1 IS NULL    THEN 'Undetermined'
            ELSE 'Anomalous' END          AS `Model Status`
    //,collect(archetype)[1]     AS secondaryArchetype
    //,collect(archetypeRank)[1] AS secondaryArchetypeRank
Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
#!/usr/bin/env bash
2+
3+
# Creates a Markdown report that contains all results of all the anomaly detection methods.
4+
# It requires an already running Neo4j graph database with already scanned and analyzed artifacts.
5+
# The results will be written into the sub directory reports/anomaly-detection.
6+
7+
# Note that "scripts/prepareAnalysis.sh" is required to run prior to this script.
8+
# Note that either "anomalyDetectionCsv.sh" or "anomalyDetectionPython.sh" is required to run prior to this script.
9+
10+
# Requires executeQueryFunctions.sh, projectionFunctions.sh, cleanupAfterReportGeneration.sh
11+
12+
# Fail on any error ("-e" = exit on first error, "-o pipefail" exits on errors within piped commands)
13+
set -o errexit -o pipefail
14+
15+
# Overridable constants (defaults also defined in sub scripts)
REPORTS_DIRECTORY=${REPORTS_DIRECTORY:-"reports"}
# Name of the sub directory that contains the generated Markdown include files.
# It is only set here and not reassigned later so that a value given by the environment is respected.
MARKDOWN_INCLUDES_DIRECTORY=${MARKDOWN_INCLUDES_DIRECTORY:-"includes"}

## Get this "domains/anomaly-detection/summary" directory if not already set
# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution.
# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
# This way non-standard tools like readlink aren't needed.
ANOMALY_DETECTION_SUMMARY_DIR=${ANOMALY_DETECTION_SUMMARY_DIR:-$(CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)}
#echo "anomalyDetectionSummary: ANOMALY_DETECTION_SUMMARY_DIR=${ANOMALY_DETECTION_SUMMARY_DIR}"
# Get the "scripts" directory by taking the path of this script and going three directories up.
SCRIPTS_DIR=${SCRIPTS_DIR:-"${ANOMALY_DETECTION_SUMMARY_DIR}/../../../scripts"} # Repository directory containing the shell scripts

# Directory containing the Markdown helper scripts like "embedMarkdownIncludes.sh"
MARKDOWN_SCRIPTS_DIR=${MARKDOWN_SCRIPTS_DIR:-"${SCRIPTS_DIR}/markdown"}
#echo "anomalyDetectionSummary: MARKDOWN_SCRIPTS_DIR=${MARKDOWN_SCRIPTS_DIR}" >&2

# Define functions to execute a cypher query from within a given file (first and only argument) like "execute_cypher" and "execute_cypher_summarized"
source "${SCRIPTS_DIR}/executeQueryFunctions.sh"
34+
35+
# Appends a Markdown table to an existing file and
# removes redundant header + separator rows.
#
# Usage:
#   cat newTable.md | append_to_markdown_table myMarkdownFile.md
#
#   append_to_markdown_table myMarkdownFile.md <<'EOF'
#   | Name | Score | Archetype |
#   | --- | --- | --- |
#   | Bar | 0.9 | Something |
#   EOF
#
# Parameters:
#   $1    - Path of the Markdown file the table rows are appended to
#   stdin - Markdown table (header row, separator row, data rows) to append
#
# Behavior:
#   - Keeps the first header row and its following separator row.
#     The first two lines of the resulting file are taken as header and separator.
#   - Removes all subsequent duplicate header + separator pairs.
#   - Leaves all data rows untouched, even when the same data row occurs more than once.
append_to_markdown_table() {
    local file="$1"

    # Append stdin to the target file
    cat >> "${file}"

    # Clean up duplicate headers (header row + --- row).
    # Only lines that are identical to the first (header) or second (separator) line are dropped.
    # Repeated data rows are kept as they are.
    awk '
        NR == 1 { header = $0; print; next }
        NR == 2 { separator = $0; print; next }
        $0 == header || $0 == separator { next }
        { print }
    ' "${file}" > "${file}.tmp" && mv "${file}.tmp" "${file}"
}
60+
61+
# Creates the Markdown include files for the first (overview) section of the report.
# Every overview Cypher query is executed once and its result is written as Markdown table
# into the directory "${FULL_REPORT_DIRECTORY}/${MARKDOWN_INCLUDES_DIRECTORY}".
anomaly_detection_report_first_section() {
    local includes_directory="${FULL_REPORT_DIRECTORY}/${MARKDOWN_INCLUDES_DIRECTORY}"
    mkdir -p "${includes_directory}"

    # The name of the query file (without extension) is also the name of the resulting Markdown file.
    local overview_query
    for overview_query in "AnomaliesPerAbstractionLayer" "AnomaliesInTotal"; do
        execute_cypher "${ANOMALY_DETECTION_SUMMARY_DIR}/${overview_query}.cypher" --output-markdown-table > "${includes_directory}/${overview_query}.md"
    done
}
69+
70+
# Aggregates all results in a Markdown report.
# Creates the deep dive section "report-deep-dive.md" for one language and node label
# inside the directory "${FULL_REPORT_DIRECTORY}/anomaly_summary_<language>_<node label>".
#
# Required Parameters:
# - projection_node_label=...
#   Label of the nodes that will be used for the projection. Example: "Package"
# - projection_language=...
#   Name of the associated programming language. Examples: "Java", "Typescript"
# - report_number=...
#   Number of the deep dive section that is used for its title "2.<report_number>". Example: "1"
anomaly_detection_deep_dive_report() {
    local nodeLabel
    nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )

    local language
    language=$( extractQueryParameter "projection_language" "${@}" )

    local report_number
    report_number=$( extractQueryParameter "report_number" "${@}" )

    echo "anomalyDetectionSummary: $(date +'%Y-%m-%dT%H:%M:%S%z') Creating ${language} ${nodeLabel} anomaly summary Markdown report..."

    # Declared local so that the directories of one report don't leak into the global scope.
    local anomaly_summary_directory="${FULL_REPORT_DIRECTORY}/anomaly_summary_${language}_${nodeLabel}"
    local anomaly_summary_includes_directory="${anomaly_summary_directory}/${MARKDOWN_INCLUDES_DIRECTORY}"
    mkdir -p "${anomaly_summary_includes_directory}"

    # Title of the deep dive section including its number within chapter 2 of the report
    echo "### 2.${report_number} ${language} ${nodeLabel}" > "${anomaly_summary_includes_directory}/DeepDiveSectionTitle.md"

    execute_cypher "${ANOMALY_DETECTION_SUMMARY_DIR}/AnomaliesDeepDiveOverview.cypher" "${@}" --output-markdown-table > "${anomaly_summary_includes_directory}/DeepDiveOverview.md"
    execute_cypher "${ANOMALY_DETECTION_SUMMARY_DIR}/AnomalyDetectionReportTopArchetypes.cypher" "${@}" --output-markdown-table > "${anomaly_summary_includes_directory}/TopAnomaliesByArchetype.md"

    # Use Markdown template to assemble the final deep dive section of the Markdown report.
    # The template is copied temporarily into the report directory and removed again afterwards.
    cp -f "${ANOMALY_DETECTION_SUMMARY_DIR}/report-deep-dive-template.md" "${anomaly_summary_directory}"
    "${MARKDOWN_SCRIPTS_DIR}/embedMarkdownIncludes.sh" "${anomaly_summary_includes_directory}" < "${anomaly_summary_directory}/report-deep-dive-template.md" > "${anomaly_summary_directory}/report-deep-dive.md"
    rm -f "${anomaly_summary_directory}/report-deep-dive-template.md"

    # Clean-up after report generation. Empty reports will be deleted.
    source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${anomaly_summary_directory}"
}
106+
107+
# Entry point for the generation of one detail report.
# Forwards all arguments unchanged to "anomaly_detection_deep_dive_report" and measures its run time.
#
# Required Parameters:
# - projection_node_label=...
#   Label of the nodes that will be used for the projection. Example: "Package"
# - projection_language=...
#   Name of the associated programming language. Examples: "Java", "Typescript"
anomaly_detection_report() {
    time anomaly_detection_deep_dive_report "$@"
}
117+
118+
# Create report directory
REPORT_NAME="anomaly-detection"
FULL_REPORT_DIRECTORY="${REPORTS_DIRECTORY}/${REPORT_NAME}"
mkdir -p "${FULL_REPORT_DIRECTORY}"

# Keys of the query parameters (key=value pairs) that are passed to the report functions
ALGORITHM_NODE="projection_node_label"
ALGORITHM_LANGUAGE="projection_language"
REPORT_NUMBER="report_number"

# -- Overview report for all code types ------------------------------

anomaly_detection_report_first_section

# -- Detail reports for each code type -------------------------------

# Every entry consists of: <report number> <node label> <language>
for detail_report in "1 Artifact Java" "2 Package Java" "3 Type Java" "4 Module Typescript"; do
    read -r detail_number detail_node detail_language <<< "${detail_report}"
    anomaly_detection_report "${REPORT_NUMBER}=${detail_number}" "${ALGORITHM_NODE}=${detail_node}" "${ALGORITHM_LANGUAGE}=${detail_language}"
done

# ---------------------------------------------------------------

echo "anomalyDetectionSummary: $(date +'%Y-%m-%dT%H:%M:%S%z') Successfully finished."
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
<!-- include:DeepDiveSectionTitle.md -->
2+
3+
#### Metrics and Features
4+
5+
* Degree (in/out)
6+
* PageRank, ArticleRank, PageRank–ArticleRank difference
7+
* Betweenness centrality
8+
* Local clustering coefficient
9+
* Cluster metrics (distance-to-medoid, average/max radius, outlier score)
10+
* Node embeddings (PCA-reduced)
11+
12+
#### Anomaly Results
13+
14+
##### Total anomalies
15+
16+
<!-- include:DeepDiveOverview.md -->
17+
18+
* **Top contributing features (via SHAP):**
19+
20+
* {{FEATURE\_1}}
21+
* {{FEATURE\_2}}
22+
* {{FEATURE\_3}}
23+
24+
#### Archetype Distribution
25+
26+
| Archetype | Count | Example Node(s) |
27+
| ------------------ | ----- | ----------------- |
28+
| Hub (Consumer) | {{X}} | {{NODE\_EXAMPLE}} |
29+
| Hub (Provider) | {{X}} | {{NODE\_EXAMPLE}} |
30+
| Bottleneck (Embed) | {{X}} | {{NODE\_EXAMPLE}} |
31+
| Outlier (Misfit) | {{X}} | {{NODE\_EXAMPLE}} |
32+
33+
#### Plots
34+
35+
* SHAP summary plots
36+
* Betweenness distribution histogram
37+
* Scatter: PageRank vs Clustering Coefficient
38+
* Cluster outlier visualization
39+
40+
---
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# 📊 Anomaly Detection Report
2+
3+
## 1. Executive Overview
4+
5+
This report analyzes structural and dependency anomalies across multiple abstraction levels of the codebase.
6+
The goal is to detect potential **software quality, design, and architecture issues** using graph-based features, anomaly detection (Isolation Forest), and SHAP explainability.
7+
8+
### 1.1 Overview of Analyzed Structures
9+
10+
<!-- include:AnomaliesPerAbstractionLayer.md -->
11+
12+
### 1.2 Anomalies in total
13+
14+
<!-- include:AnomaliesInTotal.md -->
15+
16+
## 2. Deep Dives by Abstraction Level
17+
18+
<!-- include:AnomalyDetectionDeepDives.md -->
19+
20+
## 3. Taxonomy of Anomaly Archetypes
21+
22+
| Archetype | Feature Profile | Risk for Architecture |
23+
|-----------|----------------|------------------------|
24+
| **Hub** | High degree, low clustering coefficient | Central dependency, fragile hotspot |
25+
| **Bottleneck** | High betweenness, low redundancy | Single point of failure, slows evolution |
26+
| **Outlier** | High cluster distance, small cluster size | Misfit component, unusual dependency pattern |
27+
| **Authority** | High PageRank but low ArticleRank | Over-relied utility with few reverse connections |
28+
| **Bridge** | Embedding-driven anomaly, cross-cluster | Connects unrelated domains, risky coupling |
29+
30+
---
31+
32+
## 4. Recommendations
33+
34+
* **Refactor hubs:** Break down god classes/utilities into smaller abstractions.
35+
* **Mitigate bottlenecks:** Add redundancy or alternative paths.
36+
* **Investigate outliers:** Validate if they are justified exceptions or design flaws.
37+
* **Enforce cohesion:** Raise clustering coefficient via better modular boundaries.
38+
* **Stabilize authorities:** Encapsulate widely used but locally weak components, reduce over-generalization, and ensure stable APIs.
39+
* **Clarify bridges:** Validate whether cross-cluster connectors are intentional (adapters/facades) or accidental; refactor or relocate responsibilities to preserve modularity.
40+
41+
---
42+
43+
## 5. Appendix
44+
45+
* **Methodology:** Isolation Forest, Random Forest proxy, SHAP explanations.
46+
* **Embedding generation:** Fast Random Projection, PCA (20–35 dims, \~0.9 target variance).
47+
* **Clustering:** HDBSCAN tuned against Leiden communities (golden reference, AMI optimization).
48+
* **Optimization:** Hyperparameter optimization for both the Isolation Forest and the Random Forest proxy based on their F1 scores.
49+
* **Feature set:**
50+
* Degree
51+
* PageRank
52+
* ArticleRank
53+
* Page-to-Article Rank Difference
54+
* Betweenness
55+
* Local Clustering Coefficient
56+
* Cluster Approximate Outlier Score ( = 1.0 - Cluster Probability)
57+
* Cluster Radius Average
58+
* Cluster Distance to Medoid
59+
* Cluster Size
60+
* Node Embedding

0 commit comments

Comments
 (0)