Skip to content

Commit 107b946

Browse files
authored
Merge pull request #451 from JohT/feature/anomaly-detection-graph-visualization
Add graph visualizations to anomaly detection
2 parents 52f2654 + 8bdb374 commit 107b946

21 files changed

+897
-52
lines changed

.github/workflows/public-analyze-code-graph.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ jobs:
100100
repository: JohT/code-graph-analysis-pipeline
101101
ref: ${{ inputs.ref }}
102102
persist-credentials: false
103+
fetch-tags: true
103104

104105
- name: (Java Setup) Java Development Kit (JDK) ${{ matrix.java }}
105106
uses: actions/setup-java@dded0888837ed1f317902acf8a20df0ad188d165 # v5

domains/anomaly-detection/anomalyDetectionCsv.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ anomaly_detection_features() {
6161
# Determine the article rank if not already done
6262
execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-ArticleRank-Exists.cypher" \
6363
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-ArticleRank-Write.cypher" "${@}"
64+
# Determine the normalized difference between Page Rank and Article Rank if not already done
65+
execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageToArticleRank-Exists.cypher" \
66+
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageToArticleRank-Write.cypher" "${@}"
6467
}
6568

6669
# Run queries to find anomalies in the graph.

domains/anomaly-detection/anomalyDetectionPython.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,9 @@ anomaly_detection_features() {
106106
# Determine the article rank if not already done
107107
execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-ArticleRank-Exists.cypher" \
108108
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-ArticleRank-Write.cypher" "${@}"
109+
# Determine the normalized difference between Page Rank and Article Rank if not already done
110+
execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageToArticleRank-Exists.cypher" \
111+
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageToArticleRank-Write.cypher" "${@}"
109112
}
110113

111114
# Execute the Python scripts for anomaly detection.
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#!/usr/bin/env bash
2+
3+
# This script is dynamically triggered by "VisualizationReports.sh" when report "All" or "Visualization" is enabled.
4+
# It is designed as an entry point and delegates the execution to the dedicated "anomalyDetectionGraphVisualization.sh" script that does the "heavy lifting".
5+
6+
# Note that "scripts/prepareAnalysis.sh" is required to run prior to this script.
7+
8+
# Requires anomalyDetectionGraphVisualization.sh
9+
10+
# Fail on any error ("-e" = exit on first error, "-o pipefail" exits on errors within piped commands)
11+
set -o errexit -o pipefail
12+
13+
# Overrideable Constants (defaults also defined in sub scripts)
14+
REPORTS_DIRECTORY=${REPORTS_DIRECTORY:-"reports"}
15+
16+
## Get the directory of this script if not already set
17+
# Even though $BASH_SOURCE was made for Bourne-like shells, it is also supported by others and is therefore the preferred solution here.
18+
# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
19+
# This way non-standard tools like readlink aren't needed.
20+
ANOMALY_DETECTION_SCRIPT_DIR=${ANOMALY_DETECTION_SCRIPT_DIR:-$(CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)}
21+
# echo "anomalyDetectionCsv: ANOMALY_DETECTION_SCRIPT_DIR=${ANOMALY_DETECTION_SCRIPT_DIR}"
22+
23+
# Get the "graphs" directory by taking the path of this script and selecting "graphs".
24+
ANOMALY_DETECTION_GRAPHS_DIR=${ANOMALY_DETECTION_GRAPHS_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/graphs"} # Directory of the graph visualization script (anomalyDetectionGraphVisualization.sh) that is sourced below
25+
26+
# Delegate the execution to the responsible script.
27+
source "${ANOMALY_DETECTION_GRAPHS_DIR}/anomalyDetectionGraphVisualization.sh"
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
// Return the first node with (amongst others) a "centralityPageRankToArticleRankDifference" property if it exists
2+
3+
MATCH (codeUnit)
4+
WHERE $projection_node_label IN labels(codeUnit)
5+
AND codeUnit.centralityPageRankToArticleRankDifference IS NOT NULL
6+
AND codeUnit.centralityPageRankNormalized IS NOT NULL
7+
AND codeUnit.centralityArticleRankNormalized IS NOT NULL
8+
RETURN codeUnit.name AS shortCodeUnitName
9+
,elementId(codeUnit) AS nodeElementId
10+
,codeUnit.centralityPageRankToArticleRankDifference AS pageToArticleRankDifference
11+
LIMIT 1
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
// Calculates and writes (amongst others) the "centralityPageRankToArticleRankDifference" property.
2+
3+
MATCH (codeUnit)
4+
WHERE $projection_node_label IN labels(codeUnit)
5+
AND codeUnit.centralityPageRank IS NOT NULL
6+
AND codeUnit.centralityArticleRank IS NOT NULL
7+
WITH collect(codeUnit) AS codeUnits
8+
,min(codeUnit.centralityPageRank) AS minPageRank
9+
,max(codeUnit.centralityPageRank) AS maxPageRank
10+
,min(codeUnit.centralityArticleRank) AS minArticleRank
11+
,max(codeUnit.centralityArticleRank) AS maxArticleRank
12+
UNWIND codeUnits AS codeUnit
13+
WITH *
14+
,(codeUnit.centralityPageRank - minPageRank) / (maxPageRank - minPageRank) AS normalizedPageRank
15+
,(codeUnit.centralityArticleRank - minArticleRank) / (maxArticleRank - minArticleRank) AS normalizedArticleRank
16+
WITH *
17+
,normalizedPageRank - normalizedArticleRank AS normalizedPageRankToArticleRankDifference
18+
SET codeUnit.centralityPageRankToArticleRankDifference = normalizedPageRankToArticleRankDifference
19+
,codeUnit.centralityPageRankNormalized = normalizedPageRank
20+
,codeUnit.centralityArticleRankNormalized = normalizedArticleRank
21+
RETURN count(*) AS nodePropertiesWritten
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
// Anomaly Detection Graphs: Find top nodes marked as "Authority" including their incoming and outgoing dependencies, sizes based on PageRank and thick outline for nodes with high Page Rank to Article Rank difference in Graphviz format.
2+
3+
// Step 1: Query overall statistics, e.g. min/max weight for later normalization
4+
MATCH (sourceForStatistics)-[dependencyForStatistics:DEPENDS_ON]->(targetForStatistics)
5+
WHERE $projection_node_label IN labels(sourceForStatistics)
6+
AND $projection_node_label IN labels(targetForStatistics)
7+
WITH max(coalesce(dependencyForStatistics.weight25PercentInterfaces, dependencyForStatistics.weight)) AS maxWeight
8+
,percentileDisc(sourceForStatistics.centralityPageRankToArticleRankDifference, 0.80) AS pageToArticleRankThreshold
9+
,percentileDisc(targetForStatistics.centralityPageRankNormalized, 0.80) AS pageRankThreshold
10+
// Step 2: Query selected central node
11+
MATCH (central)
12+
WHERE $projection_node_label IN labels(central)
13+
AND central.anomalyAuthorityRank = toInteger($projection_node_rank)
14+
WITH maxWeight
15+
,pageToArticleRankThreshold
16+
,pageRankThreshold
17+
,central
18+
,"Top Rank #" + $projection_node_rank + " " + $projection_language + " " + $projection_node_label + " Authority\\n" AS graphLabel
19+
,coalesce(central.fqn, central.globalFqn, central.fileName, central.signature, central.name) AS targetName
20+
WITH *, "\\n\\ndark nodes: incoming dependencies (limited max. 40)\\n" AS graphLegend
21+
WITH *, graphLegend + "bright nodes: outgoing dependencies (limited max. 40)\\n" AS graphLegend
22+
WITH *, graphLegend + "node value: Page Rank (normalized)\\n" AS graphLegend
23+
WITH *, graphLegend + "large circle: > 80% percentile of Page Rank\\n" AS graphLegend
24+
WITH *, graphLegend + "thick outline: > 80% percentile of Page Rank to Article Rank Difference\\n" AS graphLegend
25+
WITH *, ["graph [label=\"" + graphLabel + targetName + graphLegend + "\\n\"];"] AS graphVizOutput
26+
WITH *, "🏛️ authority #" + central.anomalyAuthorityRank + "\\n" + central.name AS centralNodeLabel
27+
WITH *, graphVizOutput + ["central [label=\"" + centralNodeLabel + "\"];"] AS graphVizOutput
28+
// Step 3: Query direct incoming dependencies to the central node
29+
MATCH (source)-[dependency:DEPENDS_ON]->(central)
30+
WHERE $projection_node_label IN labels(source)
31+
AND source.outgoingDependencies > 0
32+
ORDER BY dependency.weight DESC, source.name ASC
33+
LIMIT 40
34+
WITH *, coalesce(dependency.weight25PercentInterfaces, dependency.weight, 1) AS weight
35+
WITH *, round((toFloat(weight) / toFloat(maxWeight) * 2.5) + 0.4, 1.0) AS penWidth
36+
WITH *, "label=" + weight + "; weight=" + weight + "; penwidth=" + penWidth AS edgeAttributes
37+
WITH *, CASE WHEN source.centralityPageRankToArticleRankDifference >= pageToArticleRankThreshold
38+
THEN 5 ELSE 2 END AS scaledNodeBorder
39+
WITH *, CASE WHEN source.centralityPageRankNormalized >= pageRankThreshold
40+
THEN "shape = \"circle\"; height=2; " ELSE "" END AS nodeEmphasis
41+
WITH *, round(source.centralityPageRankNormalized * 100.0, 2) + "%" AS labelValue
42+
// Add the last part of the element id to the node name to make it unique.
43+
WITH *, source.name + "_" + split(elementId(source), ':')[-1] AS sourceId
44+
WITH *, "penwidth = " + scaledNodeBorder + "; " AS directInBorder
45+
// Split long names like inner classes identified by a dollar sign ($)
46+
WITH *, replace(source.name, '$', '$\\n') AS sourceNameSplit
47+
WITH *, "label = \"" + sourceNameSplit + "\\n(" + labelValue + ")\"; " AS directInLabel
48+
WITH *, " [" + nodeEmphasis + directInLabel + directInBorder + "]; " AS directInNodeProperties
49+
WITH *, "\"" + sourceId + "\" " + directInNodeProperties AS directInNode
50+
WITH maxWeight
51+
,pageToArticleRankThreshold
52+
,pageRankThreshold
53+
,central
54+
,graphVizOutput
55+
,collect(source) AS incomingDependencyNodes
56+
,collect(directInNode + "\"" + sourceId + "\" -> central [" + edgeAttributes + "];") AS directInEdges
57+
WITH *, graphVizOutput + directInEdges AS graphVizOutput
58+
// Step 4: Query direct outgoing dependencies from the central node
59+
MATCH (source)<-[dependency:DEPENDS_ON]-(central)
60+
WHERE $projection_node_label IN labels(source)
61+
AND source.incomingDependencies > 0
62+
ORDER BY dependency.weight DESC, source.name ASC
63+
LIMIT 40
64+
WITH *, coalesce(dependency.weight25PercentInterfaces, dependency.weight, 1) AS weight
65+
WITH *, round((toFloat(weight) / toFloat(maxWeight) * 2.5) + 0.4, 1.0) AS penWidth
66+
WITH *, "label=" + weight + "; weight=" + weight + "; penwidth=" + penWidth AS edgeAttributes
67+
// Use a lighter color for the target nodes of outgoing dependencies from the central node and their edges
68+
WITH *, edgeAttributes + "; color = 5" AS edgeAttributes
69+
WITH *, "color = 5; fillcolor = 1; " AS directOutColor
70+
WITH *, CASE WHEN source.centralityPageRankToArticleRankDifference >= pageToArticleRankThreshold
71+
THEN 5 ELSE 2 END AS scaledNodeBorder
72+
WITH *, CASE WHEN source.centralityPageRankNormalized >= pageRankThreshold
73+
THEN "shape = \"circle\"; height=2; " ELSE "" END AS nodeEmphasis
74+
WITH *, round(source.centralityPageRankNormalized * 100.0, 2) + "%" AS labelValue
75+
// Add the last part of the element id to the node name to make it unique.
76+
WITH *, source.name + "_" + split(elementId(source), ':')[-1] AS sourceId
77+
WITH *, "penwidth = " + scaledNodeBorder + "; " AS directOutBorder
78+
// Split long names like inner classes identified by a dollar sign ($)
79+
WITH *, replace(source.name, '$', '$\\n') AS sourceNameSplit
80+
WITH *, "label = \"" + sourceNameSplit + "\\n(" + labelValue + ")\"; " AS directOutLabel
81+
WITH *, " [" + nodeEmphasis + directOutLabel + directOutBorder + directOutColor + "]; " AS directOutNodeProperties
82+
WITH *, "\"" + sourceId + "\" " + directOutNodeProperties AS directOutNode
83+
WITH maxWeight
84+
,central
85+
,graphVizOutput
86+
,incomingDependencyNodes
87+
,collect(source) AS outgoingDependencyNodes
88+
,collect(directOutNode + "central -> \"" + sourceId + "\" [" + edgeAttributes + "];") AS directOutEdges
89+
WITH *, graphVizOutput + directOutEdges AS graphVizOutput
90+
WITH *, incomingDependencyNodes + outgoingDependencyNodes AS directDependentNodes
91+
// Step 5: Query dependencies between direct dependencies outside the central node
92+
UNWIND directDependentNodes AS directDependentNode
93+
MATCH (directDependentNode)-[dependency:DEPENDS_ON]->(anotherDirectDependentNode)
94+
WHERE anotherDirectDependentNode IN directDependentNodes
95+
AND anotherDirectDependentNode <> directDependentNode
96+
ORDER BY dependency.weight DESC, directDependentNode.name ASC
97+
WITH graphVizOutput
98+
,directDependentNode
99+
,dependency
100+
,collect(anotherDirectDependentNode)[0] AS firstLinkedDependentNode
101+
LIMIT 140
102+
WITH *, coalesce(dependency.weight25PercentInterfaces, dependency.weight, 1) AS weight
103+
// Use a fixed small pen width for secondary dependencies for better visibility of the more important direct dependency
104+
WITH *, "label=" + weight + "; weight=" + weight + "; penwidth=0.3" AS edgeAttributes
105+
// Use an even lighter color for secondary dependency edges
106+
WITH *, edgeAttributes + "; color = 3" AS edgeAttributes
107+
// Add the last part of the element id to the node name to make it unique.
108+
WITH *, directDependentNode.name + "_" + split(elementId(directDependentNode), ':')[-1] AS directDependentNodeId
109+
WITH *, firstLinkedDependentNode.name + "_" + split(elementId(firstLinkedDependentNode), ':')[-1] AS firstLinkedDependentNodeId
110+
WITH *, "\"" + directDependentNodeId + "\" -> \"" + firstLinkedDependentNodeId + "\"" AS directDependenciesEdge
111+
WITH *, collect(directDependenciesEdge + " [" + edgeAttributes + "]") AS directDependenciesEdges
112+
WITH *, graphVizOutput + directDependenciesEdges AS graphVizOutput
113+
UNWIND graphVizOutput AS graphVizOutputLine
114+
RETURN DISTINCT graphVizOutputLine

0 commit comments

Comments
 (0)