From f66a1158f90abfbf448ae21bf42bdfb983bdfc0c Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Fri, 5 Sep 2025 18:39:29 +0200 Subject: [PATCH 01/13] Refine description and threshold variable names --- ...nomalyDetectionDependencyHungryOrchestrators.cypher | 10 +++++----- .../AnomalyDetectionFragileStructuralBridges.cypher | 10 +++++----- .../queries/AnomalyDetectionHiddenBridgeNodes.cypher | 10 +++++----- .../AnomalyDetectionOverReferencesUtilities.cypher | 10 +++++----- .../queries/AnomalyDetectionPopularBottlenecks.cypher | 10 +++++----- .../AnomalyDetectionPotentialImbalancedRoles.cypher | 4 ++-- ...malyDetectionPotentialOverEngineerOrIsolated.cypher | 10 +++++----- .../queries/AnomalyDetectionSilentCoordinators.cypher | 10 +++++----- .../AnomalyDetectionUnexpectedCentralNodes.cypher | 10 +++++----- 9 files changed, 42 insertions(+), 42 deletions(-) diff --git a/domains/anomaly-detection/queries/AnomalyDetectionDependencyHungryOrchestrators.cypher b/domains/anomaly-detection/queries/AnomalyDetectionDependencyHungryOrchestrators.cypher index e3fb4a543..2ec73e5bb 100644 --- a/domains/anomaly-detection/queries/AnomalyDetectionDependencyHungryOrchestrators.cypher +++ b/domains/anomaly-detection/queries/AnomalyDetectionDependencyHungryOrchestrators.cypher @@ -1,4 +1,4 @@ -// Anomaly Detection Query: Find dependency hungry orchestrators by listing the top 20 entries with the highest Article Rank >= 90% percentile and a Betweeenness centrality >= 90% percentile. +// Anomaly Detection Query: Find dependency hungry orchestrators by listing the top (at most) 20 entries with the highest Article Rank >= 90% percentile and a Betweeenness centrality >= 90% percentile. // Shows key code that depend on many others and also controls flow — likely orchestrators or managers. MATCH (codeUnit) @@ -8,12 +8,12 @@ AND codeUnit.incomingDependencies IS NOT NULL AND codeUnit.outgoingDependencies IS NOT NULL WITH collect(codeUnit) AS codeUnits - ,percentileDisc(codeUnit.centralityArticleRank, 0.90) AS articleRank90Percentile - ,percentileDisc(codeUnit.centralityBetweenness, 0.90) AS betweenness90Percentile + ,percentileDisc(codeUnit.centralityArticleRank, 0.90) AS articleRankThreshold + ,percentileDisc(codeUnit.centralityBetweenness, 0.90) AS betweennessThreshold UNWIND codeUnits AS codeUnit WITH *, codeUnit.incomingDependencies + codeUnit.outgoingDependencies AS degree - WHERE codeUnit.centralityArticleRank >= articleRank90Percentile - AND codeUnit.centralityBetweenness >= betweenness90Percentile + WHERE codeUnit.centralityArticleRank >= articleRankThreshold + AND codeUnit.centralityBetweenness >= betweennessThreshold OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit) WITH *, artifact.name AS artifactName OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit) diff --git a/domains/anomaly-detection/queries/AnomalyDetectionFragileStructuralBridges.cypher b/domains/anomaly-detection/queries/AnomalyDetectionFragileStructuralBridges.cypher index f9766c6d3..aae9a3e51 100644 --- a/domains/anomaly-detection/queries/AnomalyDetectionFragileStructuralBridges.cypher +++ b/domains/anomaly-detection/queries/AnomalyDetectionFragileStructuralBridges.cypher @@ -1,4 +1,4 @@ -// Anomaly Detection Query: Find fragile structural bridges, potential boundary-spanning modules and cohesion violations by listing the top 20 entries with the highest Betweeenness centrality >= 90% percentile and a local clustering coefficient <= 10% percentile. 
+// Anomaly Detection Query: Find fragile structural bridges, potential boundary-spanning modules and cohesion violations by listing the (at most) top 20 entries with the highest Betweeenness centrality >= 90% percentile and a local clustering coefficient <= 10% percentile. // Shows code that connects otherwise unrelated parts of the graph — potential architectural risks. MATCH (codeUnit) @@ -8,12 +8,12 @@ AND codeUnit.incomingDependencies IS NOT NULL AND codeUnit.outgoingDependencies IS NOT NULL WITH collect(codeUnit) AS codeUnits - ,percentileDisc(codeUnit.communityLocalClusteringCoefficient, 0.10) AS localClusteringCoefficient10Percentile - ,percentileDisc(codeUnit.centralityBetweenness, 0.90) AS betweenness90Percentile + ,percentileDisc(codeUnit.communityLocalClusteringCoefficient, 0.10) AS localClusteringCoefficientThreshold + ,percentileDisc(codeUnit.centralityBetweenness, 0.90) AS betweennessThreshold UNWIND codeUnits AS codeUnit WITH *, codeUnit.incomingDependencies + codeUnit.outgoingDependencies AS degree - WHERE codeUnit.communityLocalClusteringCoefficient <= localClusteringCoefficient10Percentile - AND codeUnit.centralityBetweenness >= betweenness90Percentile + WHERE codeUnit.communityLocalClusteringCoefficient <= localClusteringCoefficientThreshold + AND codeUnit.centralityBetweenness >= betweennessThreshold OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit) WITH *, artifact.name AS artifactName OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit) diff --git a/domains/anomaly-detection/queries/AnomalyDetectionHiddenBridgeNodes.cypher b/domains/anomaly-detection/queries/AnomalyDetectionHiddenBridgeNodes.cypher index e93b8d1a0..95d3bd038 100644 --- a/domains/anomaly-detection/queries/AnomalyDetectionHiddenBridgeNodes.cypher +++ b/domains/anomaly-detection/queries/AnomalyDetectionHiddenBridgeNodes.cypher @@ -1,4 +1,4 @@ -// Anomaly Detection Query: Find hidden bridge code or misplaced responsibilities by listing the top 20 entries with the highest Betweeenness centrality >= 90% percentile and a Page Rank <= 10% percentile. +// Anomaly Detection Query: Find hidden bridge code or misplaced responsibilities by listing the (at most) top 20 entries with the highest Betweeenness centrality >= 90% percentile and a Page Rank <= 10% percentile. // Shows code that mediates flow, but isn’t highly depended on — structural surprise. 
MATCH (codeUnit) @@ -8,12 +8,12 @@ AND codeUnit.incomingDependencies IS NOT NULL AND codeUnit.outgoingDependencies IS NOT NULL WITH collect(codeUnit) AS codeUnits - ,percentileDisc(codeUnit.centralityPageRank, 0.10) AS pageRank10Percentile - ,percentileDisc(codeUnit.centralityBetweenness, 0.90) AS betweenness90Percentile + ,percentileDisc(codeUnit.centralityPageRank, 0.10) AS pageRankThreshold + ,percentileDisc(codeUnit.centralityBetweenness, 0.90) AS betweennessThreshold UNWIND codeUnits AS codeUnit WITH *, codeUnit.incomingDependencies + codeUnit.outgoingDependencies AS degree - WHERE codeUnit.centralityPageRank <= pageRank10Percentile - AND codeUnit.centralityBetweenness >= betweenness90Percentile + WHERE codeUnit.centralityPageRank <= pageRankThreshold + AND codeUnit.centralityBetweenness >= betweennessThreshold OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit) WITH *, artifact.name AS artifactName OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit) diff --git a/domains/anomaly-detection/queries/AnomalyDetectionOverReferencesUtilities.cypher b/domains/anomaly-detection/queries/AnomalyDetectionOverReferencesUtilities.cypher index 876ba74e9..72ece11f2 100644 --- a/domains/anomaly-detection/queries/AnomalyDetectionOverReferencesUtilities.cypher +++ b/domains/anomaly-detection/queries/AnomalyDetectionOverReferencesUtilities.cypher @@ -1,4 +1,4 @@ -// Anomaly Detection Query: Find over-referenced utility code by listing the top 20 entries with the highest Page Rank >= 90% percentile and a low local clustering coefficient below the 10% percentile. +// Anomaly Detection Query: Find over-referenced utility code by listing the (at most) top 20 entries with the highest Page Rank >= 90% percentile and a low local clustering coefficient below the 10% percentile. // Shows code that is widely referenced, but loosely coupled in neighborhood — could be over-generalized or abused. 
MATCH (codeUnit) @@ -8,12 +8,12 @@ AND codeUnit.incomingDependencies IS NOT NULL AND codeUnit.outgoingDependencies IS NOT NULL WITH collect(codeUnit) AS codeUnits - ,percentileDisc(codeUnit.communityLocalClusteringCoefficient, 0.10) AS localClusteringCoefficient10PercentPercentile - ,percentileDisc(codeUnit.centralityPageRank, 0.90) AS pageRank90PercentPercentile + ,percentileDisc(codeUnit.communityLocalClusteringCoefficient, 0.10) AS localClusteringCoefficientThreshold + ,percentileDisc(codeUnit.centralityPageRank, 0.90) AS pageRankThreshold UNWIND codeUnits AS codeUnit WITH *, codeUnit.incomingDependencies + codeUnit.outgoingDependencies AS degree - WHERE codeUnit.communityLocalClusteringCoefficient <= localClusteringCoefficient10PercentPercentile - AND codeUnit.centralityPageRank >= pageRank90PercentPercentile + WHERE codeUnit.communityLocalClusteringCoefficient <= localClusteringCoefficientThreshold + AND codeUnit.centralityPageRank >= pageRankThreshold OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit) WITH *, artifact.name AS artifactName OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit) diff --git a/domains/anomaly-detection/queries/AnomalyDetectionPopularBottlenecks.cypher b/domains/anomaly-detection/queries/AnomalyDetectionPopularBottlenecks.cypher index 9b66b27c6..d7d5a7b16 100644 --- a/domains/anomaly-detection/queries/AnomalyDetectionPopularBottlenecks.cypher +++ b/domains/anomaly-detection/queries/AnomalyDetectionPopularBottlenecks.cypher @@ -1,4 +1,4 @@ -// Anomaly Detection Query: Find popular bottlenecks by listing the top 20 entries with the highest Betweeenness centrality >= 90% percentile and a Page Rank >= 90% percentile. +// Anomaly Detection Query: Find popular bottlenecks by listing the (at most) top 20 entries with the highest Betweeenness centrality >= 90% percentile and a Page Rank >= 90% percentile. // Shows key code that is both heavily depended on and control flow — critical hubs. 
MATCH (codeUnit) @@ -8,12 +8,12 @@ AND codeUnit.incomingDependencies IS NOT NULL AND codeUnit.outgoingDependencies IS NOT NULL WITH collect(codeUnit) AS codeUnits - ,percentileDisc(codeUnit.centralityPageRank, 0.90) AS pageRank90Percentile - ,percentileDisc(codeUnit.centralityBetweenness, 0.90) AS betweenness90Percentile + ,percentileDisc(codeUnit.centralityPageRank, 0.90) AS pageRankThreshold + ,percentileDisc(codeUnit.centralityBetweenness, 0.90) AS betweennessThreshold UNWIND codeUnits AS codeUnit WITH *, codeUnit.incomingDependencies + codeUnit.outgoingDependencies AS degree - WHERE codeUnit.centralityPageRank >= pageRank90Percentile - AND codeUnit.centralityBetweenness >= betweenness90Percentile + WHERE codeUnit.centralityPageRank >= pageRankThreshold + AND codeUnit.centralityBetweenness >= betweennessThreshold OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit) WITH *, artifact.name AS artifactName OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit) diff --git a/domains/anomaly-detection/queries/AnomalyDetectionPotentialImbalancedRoles.cypher b/domains/anomaly-detection/queries/AnomalyDetectionPotentialImbalancedRoles.cypher index fd7cd6a30..62fbafc08 100644 --- a/domains/anomaly-detection/queries/AnomalyDetectionPotentialImbalancedRoles.cypher +++ b/domains/anomaly-detection/queries/AnomalyDetectionPotentialImbalancedRoles.cypher @@ -1,4 +1,4 @@ -// Anomaly Detection Query: Find potential imbalanced roles in the codebase by listing the top 40 most significant Page Rank to Article Rank differences. +// Anomaly Detection Query: Find potential imbalanced roles in the codebase by listing the (at most) top 20 most significant Page Rank to Article Rank differences. MATCH (codeUnit) WHERE $projection_node_label IN labels(codeUnit) @@ -31,4 +31,4 @@ OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS //,pageToArticleRankDifferenceMean //,pageToArticleRankDifferenceStandardDeviation ORDER BY abs(pageToArticleRankDifferenceZScore) DESC - LIMIT 40 \ No newline at end of file + LIMIT 20 \ No newline at end of file diff --git a/domains/anomaly-detection/queries/AnomalyDetectionPotentialOverEngineerOrIsolated.cypher b/domains/anomaly-detection/queries/AnomalyDetectionPotentialOverEngineerOrIsolated.cypher index 74e9f886e..5d044683f 100644 --- a/domains/anomaly-detection/queries/AnomalyDetectionPotentialOverEngineerOrIsolated.cypher +++ b/domains/anomaly-detection/queries/AnomalyDetectionPotentialOverEngineerOrIsolated.cypher @@ -1,4 +1,4 @@ -// Anomaly Detection Query: Find potential over-engineered or isolated code unit by listing the top 20 entries with the highest local clustering coefficient and a Page Rank below the 5% percentile. +// Anomaly Detection Query: Find potential over-engineered or isolated code unit by listing the (at most) top 20 entries with the highest local clustering coefficient and a Page Rank below the 5% percentile. 
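As an aside on AnomalyDetectionPotentialImbalancedRoles.cypher above: it ranks code units by the absolute z-score of the difference between Page Rank and Article Rank, so that only statistically significant imbalances surface. The same computation as a minimal pandas sketch, with hypothetical values that are not part of the patch:

import pandas as pd

# Hypothetical stand-in for the queried code units and their centrality scores.
nodes = pd.DataFrame({
    "codeUnitName": ["a.A", "b.B", "c.C", "d.D"],
    "pageRank":     [0.85, 0.30, 0.20, 0.90],
    "articleRank":  [0.20, 0.28, 0.60, 0.88],
})

# Difference between the two rank measures, as in the Cypher query.
difference = nodes["pageRank"] - nodes["articleRank"]

# Standard z-score of each difference relative to the whole population.
z_score = (difference - difference.mean()) / difference.std()

# Most significant absolute deviations first, at most 20 entries.
top = (nodes.assign(pageToArticleRankDifferenceZScore=z_score)
            .sort_values("pageToArticleRankDifferenceZScore", key=abs, ascending=False)
            .head(20))
print(top)

Lowering the limit from 40 to 20 also keeps this report in line with the other top-20 anomaly queries of this commit.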
MATCH (codeUnit)
 WHERE $projection_node_label IN labels(codeUnit)
@@ -7,12 +7,12 @@
 AND codeUnit.incomingDependencies IS NOT NULL
 AND codeUnit.outgoingDependencies IS NOT NULL
 WITH collect(codeUnit) AS codeUnits
- ,percentileDisc(codeUnit.centralityPageRank, 0.10) AS pageRank10PercentPercentile
- ,percentileDisc(codeUnit.communityLocalClusteringCoefficient, 0.90) AS localClusteringCoefficient90PercentPercentile
+ ,percentileDisc(codeUnit.centralityPageRank, 0.10) AS pageRankThreshold
+ ,percentileDisc(codeUnit.communityLocalClusteringCoefficient, 0.90) AS localClusteringCoefficientThreshold
 UNWIND codeUnits AS codeUnit
 WITH *, codeUnit.incomingDependencies + codeUnit.outgoingDependencies AS degree
- WHERE codeUnit.centralityPageRank <= pageRank10PercentPercentile
- AND codeUnit.communityLocalClusteringCoefficient >= localClusteringCoefficient90PercentPercentile
+ WHERE codeUnit.centralityPageRank <= pageRankThreshold
+ AND codeUnit.communityLocalClusteringCoefficient >= localClusteringCoefficientThreshold
 OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit)
 WITH *, artifact.name AS artifactName
 OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit)
diff --git a/domains/anomaly-detection/queries/AnomalyDetectionSilentCoordinators.cypher b/domains/anomaly-detection/queries/AnomalyDetectionSilentCoordinators.cypher
index 945b73e54..f1f00ad35 100644
--- a/domains/anomaly-detection/queries/AnomalyDetectionSilentCoordinators.cypher
+++ b/domains/anomaly-detection/queries/AnomalyDetectionSilentCoordinators.cypher
@@ -1,4 +1,4 @@
-// Anomaly Detection Query: Find silent coordinators by listing the top 20 entries with the highest betweeenness >= 90% percentile and a in-degree <= 10% percentile.
+// Anomaly Detection Query: Find silent coordinators by listing the (at most) top 20 entries with the highest betweeenness >= 90% percentile and a in-degree <= 10% percentile.
// Shows code that controls lots of interactions, yet not many modules depend on it — hidden complexity

MATCH (codeUnit)
@@ -7,12 +7,12 @@
 AND codeUnit.incomingDependencies IS NOT NULL
 AND codeUnit.outgoingDependencies IS NOT NULL
 WITH collect(codeUnit) AS codeUnits
- ,percentileDisc(codeUnit.incomingDependencies, 0.10) AS incomingDependencies10Percentile
- ,percentileDisc(codeUnit.centralityBetweenness, 0.90) AS betweenness90Percentile
+ ,percentileDisc(codeUnit.incomingDependencies, 0.10) AS incomingDependenciesThreshold
+ ,percentileDisc(codeUnit.centralityBetweenness, 0.90) AS betweennessThreshold
 UNWIND codeUnits AS codeUnit
 WITH *, codeUnit.incomingDependencies + codeUnit.outgoingDependencies AS degree
- WHERE codeUnit.incomingDependencies <= incomingDependencies10Percentile
- AND codeUnit.centralityBetweenness <= betweenness90Percentile
+ WHERE codeUnit.incomingDependencies <= incomingDependenciesThreshold
+ AND codeUnit.centralityBetweenness <= betweennessThreshold
 OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit)
 WITH *, artifact.name AS artifactName
 OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit)
diff --git a/domains/anomaly-detection/queries/AnomalyDetectionUnexpectedCentralNodes.cypher b/domains/anomaly-detection/queries/AnomalyDetectionUnexpectedCentralNodes.cypher
index a1fdd6b5b..ca059244e 100644
--- a/domains/anomaly-detection/queries/AnomalyDetectionUnexpectedCentralNodes.cypher
+++ b/domains/anomaly-detection/queries/AnomalyDetectionUnexpectedCentralNodes.cypher
@@ -1,4 +1,4 @@
-// Anomaly Detection Query: Find hidden bottlenecks or hubs by listing the top 20 entries with the highest betweeenness >= 90% percentile and a degree <= 10% percentile.
+// Anomaly Detection Query: Find hidden bottlenecks or hubs by listing the (at most) top 20 entries with the highest betweeenness >= 90% percentile and a degree <= 10% percentile.
 // Shows code with high structural importance and only a few incoming and outgoing dependencies — often unexpected.
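All queries in this commit share one selection pattern: first compute discrete percentile thresholds over the whole projection, then keep the code units that sit on the extreme side of both. A minimal pandas sketch of the selection described by the two comments above (high betweenness despite a low degree); the DataFrame and its values are hypothetical:

import pandas as pd

# Hypothetical stand-in for the projected code units.
nodes = pd.DataFrame({
    "codeUnitName": ["a.A", "b.B", "c.C", "d.D", "e.E"],
    "betweenness":  [120.0, 3.0, 250.0, 1.0, 8.0],
    "incomingDependencies": [1, 4, 2, 9, 3],
    "outgoingDependencies": [1, 5, 0, 7, 2],
})
nodes["degree"] = nodes["incomingDependencies"] + nodes["outgoingDependencies"]

# percentileDisc in Cypher returns an actual observed value (discrete percentile);
# quantile with interpolation="lower" is a close pandas analogue.
betweenness_threshold = nodes["betweenness"].quantile(0.90, interpolation="lower")
degree_threshold = nodes["degree"].quantile(0.10, interpolation="lower")

# High structural importance (betweenness) despite few direct dependencies (degree).
unexpected_central = nodes[
    (nodes["betweenness"] >= betweenness_threshold)
    & (nodes["degree"] <= degree_threshold)
].sort_values("betweenness", ascending=False).head(20)
print(unexpected_central)  # a.A and c.C in this toy data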
MATCH (codeUnit) @@ -8,12 +8,12 @@ AND codeUnit.outgoingDependencies IS NOT NULL WITH *, codeUnit.incomingDependencies + codeUnit.outgoingDependencies AS degree WITH collect(codeUnit) AS codeUnits - ,percentileDisc(degree, 0.10) AS degree10Percentile - ,percentileDisc(codeUnit.centralityBetweenness, 0.90) AS betweenness90Percentile + ,percentileDisc(degree, 0.10) AS degreeThreshold + ,percentileDisc(codeUnit.centralityBetweenness, 0.90) AS betweennessThreshold UNWIND codeUnits AS codeUnit WITH *, codeUnit.incomingDependencies + codeUnit.outgoingDependencies AS degree - WHERE degree <= degree10Percentile - AND codeUnit.centralityBetweenness <= betweenness90Percentile + WHERE degree <= degreeThreshold + AND codeUnit.centralityBetweenness <= betweennessThreshold OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit) WITH *, artifact.name AS artifactName OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit) From 693001c23356a80fbf788d507e6782e4e001626c Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Mon, 8 Sep 2025 16:07:17 +0200 Subject: [PATCH 02/13] Refine description of MARKDOWN_SCRIPTS_DIR --- scripts/markdown/embedMarkdownIncludes.sh | 2 +- scripts/markdown/formatQueryResultAsMarkdownTable.sh | 2 +- scripts/markdown/testEmbedMarkdownIncludes.sh | 2 +- scripts/markdown/testFormatQueryResultAsMarkdownTable.sh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/markdown/embedMarkdownIncludes.sh b/scripts/markdown/embedMarkdownIncludes.sh index e54a58315..c1fe16f19 100755 --- a/scripts/markdown/embedMarkdownIncludes.sh +++ b/scripts/markdown/embedMarkdownIncludes.sh @@ -9,7 +9,7 @@ set -o errexit -o pipefail # Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution. # CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes. # This way non-standard tools like readlink aren't needed. -MARKDOWN_SCRIPTS_DIR=${MARKDOWN_SCRIPTS_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} # Repository directory containing the shell scripts +MARKDOWN_SCRIPTS_DIR=${MARKDOWN_SCRIPTS_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} # Repository directory containing the shell scripts for markdown #echo "embedMarkdownIncludes: MARKDOWN_SCRIPTS_DIR=${MARKDOWN_SCRIPTS_DIR}" >&2 template_markdown_file="$1" diff --git a/scripts/markdown/formatQueryResultAsMarkdownTable.sh b/scripts/markdown/formatQueryResultAsMarkdownTable.sh index f418e4bc6..a0be6e136 100755 --- a/scripts/markdown/formatQueryResultAsMarkdownTable.sh +++ b/scripts/markdown/formatQueryResultAsMarkdownTable.sh @@ -9,7 +9,7 @@ set -o errexit -o pipefail # Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution. # CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes. # This way non-standard tools like readlink aren't needed. -MARKDOWN_SCRIPTS_DIR=${MARKDOWN_SCRIPTS_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} # Repository directory containing the shell scripts +MARKDOWN_SCRIPTS_DIR=${MARKDOWN_SCRIPTS_DIR:-$( CDPATH=. 
cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} # Repository directory containing the shell scripts for markdown #echo "formatQueryResultAsMarkdownTable: MARKDOWN_SCRIPTS_DIR=${MARKDOWN_SCRIPTS_DIR}" >&2 echo "formatQueryResultAsMarkdownTable: Will output in Markdown Table Format" >&2 diff --git a/scripts/markdown/testEmbedMarkdownIncludes.sh b/scripts/markdown/testEmbedMarkdownIncludes.sh index 17661554b..a16d24db5 100755 --- a/scripts/markdown/testEmbedMarkdownIncludes.sh +++ b/scripts/markdown/testEmbedMarkdownIncludes.sh @@ -11,7 +11,7 @@ set -o errexit -o pipefail # Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution. # CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes. # This way non-standard tools like readlink aren't needed. -MARKDOWN_SCRIPTS_DIR=${MARKDOWN_SCRIPTS_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} # Repository directory containing the shell scripts +MARKDOWN_SCRIPTS_DIR=${MARKDOWN_SCRIPTS_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} # Repository directory containing the shell scripts for markdown echo "testEmbedMarkdownIncludes: MARKDOWN_SCRIPTS_DIR=${MARKDOWN_SCRIPTS_DIR}" >&2 tearDown() { diff --git a/scripts/markdown/testFormatQueryResultAsMarkdownTable.sh b/scripts/markdown/testFormatQueryResultAsMarkdownTable.sh index 2d20983b3..6a5786573 100755 --- a/scripts/markdown/testFormatQueryResultAsMarkdownTable.sh +++ b/scripts/markdown/testFormatQueryResultAsMarkdownTable.sh @@ -11,7 +11,7 @@ set -o errexit -o pipefail # Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution. # CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes. # This way non-standard tools like readlink aren't needed. -MARKDOWN_SCRIPTS_DIR=${MARKDOWN_SCRIPTS_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} # Repository directory containing the shell scripts +MARKDOWN_SCRIPTS_DIR=${MARKDOWN_SCRIPTS_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} # Repository directory containing the shell scripts for markdown #echo "testFormatQueryResultAsMarkdownTable: MARKDOWN_SCRIPTS_DIR=${MARKDOWN_SCRIPTS_DIR}" >&2 tearDown() { From fba241127b8c9756e5724940a467f3fdc2598a6b Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Mon, 15 Sep 2025 17:31:09 +0200 Subject: [PATCH 03/13] Remove redundant markdown generation logging --- scripts/markdown/formatQueryResultAsMarkdownTable.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/markdown/formatQueryResultAsMarkdownTable.sh b/scripts/markdown/formatQueryResultAsMarkdownTable.sh index a0be6e136..2fe88341e 100755 --- a/scripts/markdown/formatQueryResultAsMarkdownTable.sh +++ b/scripts/markdown/formatQueryResultAsMarkdownTable.sh @@ -12,8 +12,6 @@ set -o errexit -o pipefail MARKDOWN_SCRIPTS_DIR=${MARKDOWN_SCRIPTS_DIR:-$( CDPATH=. 
cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} # Repository directory containing the shell scripts for markdown #echo "formatQueryResultAsMarkdownTable: MARKDOWN_SCRIPTS_DIR=${MARKDOWN_SCRIPTS_DIR}" >&2 -echo "formatQueryResultAsMarkdownTable: Will output in Markdown Table Format" >&2 - # Read all input (including multiline) into cypher_query_result cypher_query_result=$(cat) From 7137bf6affb957b6feacfd0b911b54f23ebd83f9 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Thu, 25 Sep 2025 08:27:08 +0200 Subject: [PATCH 04/13] Make markdown includes directory configurable and use sysin for template --- scripts/executeQuery.sh | 7 +++-- scripts/markdown/embedMarkdownIncludes.sh | 23 +++++++++++---- scripts/markdown/testEmbedMarkdownIncludes.sh | 28 +++++++++++++++---- 3 files changed, 45 insertions(+), 13 deletions(-) diff --git a/scripts/executeQuery.sh b/scripts/executeQuery.sh index e5a1468dd..04b53766c 100755 --- a/scripts/executeQuery.sh +++ b/scripts/executeQuery.sh @@ -26,6 +26,9 @@ set -o errexit -o pipefail SCRIPTS_DIR=${SCRIPTS_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} # Repository directory containing the shell scripts #echo "executeQuery: SCRIPTS_DIR=$SCRIPTS_DIR" >&2 +MARKDOWN_SCRIPTS_DIR=${MARKDOWN_SCRIPTS_DIR:-"${SCRIPTS_DIR}/markdown"} +#echo "executeQuery: MARKDOWN_SCRIPTS_DIR=${MARKDOWN_SCRIPTS_DIR}" >&2 + # Overrideable Defaults NEO4J_HTTP_PORT=${NEO4J_HTTP_PORT:-"7474"} # Neo4j HTTP API port for executing queries NEO4J_HTTP_TRANSACTION_ENDPOINT=${NEO4J_HTTP_TRANSACTION_ENDPOINT:-"db/neo4j/tx/commit"} # Since Neo4j v5: "db//tx/commit", Neo4j v4: "db/data/transaction/commit" @@ -149,8 +152,8 @@ if [[ -n "${error_message}" ]]; then fi if [ "${output_markdown_table}" = "true" ] ; then - echo "executeQuery: Will output in Markdown Table Format" >&2 - echo -n "${cypher_query_result}" | "${SCRIPTS_DIR}/markdown/formatQueryResultAsMarkdownTable.sh" + #echo "executeQuery: Will output in Markdown Table Format" >&2 + echo -n "${cypher_query_result}" | "${MARKDOWN_SCRIPTS_DIR}/formatQueryResultAsMarkdownTable.sh" else # Output results in CSV format if [ "${no_source_reference}" = true ] ; then diff --git a/scripts/markdown/embedMarkdownIncludes.sh b/scripts/markdown/embedMarkdownIncludes.sh index c1fe16f19..12bc40106 100755 --- a/scripts/markdown/embedMarkdownIncludes.sh +++ b/scripts/markdown/embedMarkdownIncludes.sh @@ -1,6 +1,7 @@ #!/usr/bin/env bash -# Processes a template_markdown_file markdown file, replacing placeholders like "" with the contents of the specified markdown files. The files to include needs to be in the "includes" subdirectory. +# Processes template markdown (sysin) replacing placeholders like "" with the contents of the specified markdown files. The files to include needs to be in the "includes" subdirectory. +# Can take an optional input for the directory that contains the markdown files to be included/embedded (defaults to "includes"). # Fail on any error ("-e" = exit on first error, "-o pipefail" exist on errors within piped commands) set -o errexit -o pipefail @@ -12,10 +13,20 @@ set -o errexit -o pipefail MARKDOWN_SCRIPTS_DIR=${MARKDOWN_SCRIPTS_DIR:-$( CDPATH=. 
cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} # Repository directory containing the shell scripts for markdown #echo "embedMarkdownIncludes: MARKDOWN_SCRIPTS_DIR=${MARKDOWN_SCRIPTS_DIR}" >&2 -template_markdown_file="$1" -include_directory="includes" +# Read all input (including multiline) into markdown_template +markdown_template=$(cat) -awk -v include_directory="${include_directory}" ' +includes_directory="$1" +if [ -z "${includes_directory}" ] ; then + includes_directory="./includes" + echo "embedMarkdownIncludes: Using default include directory ${includes_directory}." >&2 +fi +if [ ! -d "${includes_directory}" ] ; then + echo "embedMarkdownIncludes: Couldn't find include directory ${includes_directory}." >&2 + exit 2 +fi + +echo -n "${markdown_template}" | awk -v includes_directory="${includes_directory}" ' # Check if the filename is safe function is_safe(path) { if (substr(path, 1, 1) == "/") return 0 @@ -24,7 +35,7 @@ awk -v include_directory="${include_directory}" ' } function include_file(path, fullpath, line) { - fullpath = include_directory "/" path + fullpath = includes_directory "/" path if (!is_safe(path)) { print "ERROR: illegal include path: " path > "/dev/stderr" @@ -56,6 +67,6 @@ awk -v include_directory="${include_directory}" ' print } } -' "${template_markdown_file}" +' #echo "embedMarkdownIncludes: $(date +'%Y-%m-%dT%H:%M:%S%z') Successfully finished." >&2 \ No newline at end of file diff --git a/scripts/markdown/testEmbedMarkdownIncludes.sh b/scripts/markdown/testEmbedMarkdownIncludes.sh index a16d24db5..3396b180a 100755 --- a/scripts/markdown/testEmbedMarkdownIncludes.sh +++ b/scripts/markdown/testEmbedMarkdownIncludes.sh @@ -61,7 +61,7 @@ expected_test_include_content="This is the included content for the test." echo "${expected_test_include_content}" > "${temporaryTestDirectory}/${testIncludeFile}" # - Execute script under test -embeddedContent=$(cd "${temporaryTestDirectory}"; "${MARKDOWN_SCRIPTS_DIR}/embedMarkdownIncludes.sh" "${testMarkdownTemplate}" ) +embeddedContent=$(cat "${testMarkdownTemplate}" | "${MARKDOWN_SCRIPTS_DIR}/embedMarkdownIncludes.sh" "${temporaryTestDirectory}/includes") # - Verify results if [ "${embeddedContent}" != "${expected_test_include_content}" ]; then @@ -71,15 +71,33 @@ fi # ------------------------------------------------------------ # Test case -- # ------------------------------------------------------------ -echo "testEmbedMarkdownIncludes: 2.) A missing include file results in an error." +echo "testEmbedMarkdownIncludes: 2.) An existing include file in the DEFAULT directory is correctly embedded." # - Setup -testMarkdownTemplateMissingInclude="testMarkdownTemplateMissingInclude.md" -echo "" > "${temporaryTestDirectory}/${testMarkdownTemplateMissingInclude}" +testIncludeFile="includes/testInclude.md" +expected_test_include_content="This is the included content for the test." +echo "${expected_test_include_content}" > "${temporaryTestDirectory}/${testIncludeFile}" + +# - Execute script under test +embeddedContent=$(cd "${temporaryTestDirectory}"; cat "${testMarkdownTemplate}" | "${MARKDOWN_SCRIPTS_DIR}/embedMarkdownIncludes.sh") + +# - Verify results +if [ "${embeddedContent}" != "${expected_test_include_content}" ]; then + fail "2.) Test failed: Expected embedded content to be '${expected_test_include_content}', but got '${embeddedContent}'." 
+fi + +# ------------------------------------------------------------ +# Test case -- +# ------------------------------------------------------------ +echo "testEmbedMarkdownIncludes: 3.) A missing include file results in an error." + +# - Setup +testMarkdownTemplateMissingInclude="${temporaryTestDirectory}/testMarkdownTemplateMissingInclude.md" +echo "" > "${testMarkdownTemplateMissingInclude}" # - Execute script under test set +o errexit -errorOutput=$(cd "${temporaryTestDirectory}"; { "${MARKDOWN_SCRIPTS_DIR}/embedMarkdownIncludes.sh" "${testMarkdownTemplateMissingInclude}" 2>&1 1>/dev/null; } ) +errorOutput=$( { cat "${testMarkdownTemplateMissingInclude}" | "${MARKDOWN_SCRIPTS_DIR}/embedMarkdownIncludes.sh" "${temporaryTestDirectory}/includes" 2>&1 1>/dev/null; } ) exitCode=$? set -o errexit From 6ea6e9ca1980dc030e1b9f62056bb6cb9734e227 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Mon, 29 Sep 2025 16:46:30 +0200 Subject: [PATCH 05/13] Support fallback markdown includes for missing files --- scripts/markdown/embedMarkdownIncludes.sh | 36 +++++++++++++------ scripts/markdown/testEmbedMarkdownIncludes.sh | 24 ++++++++++++- 2 files changed, 49 insertions(+), 11 deletions(-) diff --git a/scripts/markdown/embedMarkdownIncludes.sh b/scripts/markdown/embedMarkdownIncludes.sh index 12bc40106..eb37a27fe 100755 --- a/scripts/markdown/embedMarkdownIncludes.sh +++ b/scripts/markdown/embedMarkdownIncludes.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -# Processes template markdown (sysin) replacing placeholders like "" with the contents of the specified markdown files. The files to include needs to be in the "includes" subdirectory. +# Processes template markdown (sysin) replacing placeholders like "" or "" with the contents of the specified markdown files. The files to include needs to be in the "includes" subdirectory. # Can take an optional input for the directory that contains the markdown files to be included/embedded (defaults to "includes"). 
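The fallback behaviour works as follows: a placeholder may name several pipe-separated candidate files, and the first one that can be read wins. The literal marker syntax does not survive in the comments above (it was stripped as HTML-comment markup), so the `<!-- include: a.md | b.md -->` form, the file names, and the directory in this illustrative Python sketch are assumptions rather than the script's confirmed format:

import re
from pathlib import Path

# Assumed placeholder syntax; the real marker text is not recoverable from this patch.
INCLUDE_PATTERN = re.compile(r"<!--\s*include:\s*(?P<spec>[^>]+?)\s*-->")

def embed_includes(template: str, includes_directory: Path) -> str:
    def resolve(match: re.Match) -> str:
        # Pipe-separated candidates: embed the first file that exists (fallback logic).
        for candidate in (part.strip() for part in match.group("spec").split("|")):
            if candidate and (includes_directory / candidate).is_file():
                return (includes_directory / candidate).read_text()
        raise FileNotFoundError(f"missing include file(s): {match.group('spec')}")
    return INCLUDE_PATTERN.sub(resolve, template)

# Hypothetical usage, mirroring the shell tests further below:
# embed_includes('<!-- include: missing.md | testFallbackInclude.md -->', Path("includes"))

The awk implementation that follows has the same shape: split the spec on "|", trim each candidate, include the first readable file, and fail only when none of the candidates resolve.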
# Fail on any error ("-e" = exit on first error, "-o pipefail" exist on errors within piped commands) @@ -34,17 +34,14 @@ echo -n "${markdown_template}" | awk -v includes_directory="${includes_directory return 1 } - function include_file(path, fullpath, line) { - fullpath = includes_directory "/" path - + function try_include(path, fullpath, line) { if (!is_safe(path)) { print "ERROR: illegal include path: " path > "/dev/stderr" exit 1 } - + fullpath = includes_directory "/" path if ((getline test < fullpath) < 0) { - print "ERROR: missing file " fullpath > "/dev/stderr" - exit 1 + return 0 # not found } close(fullpath) @@ -52,15 +49,33 @@ echo -n "${markdown_template}" | awk -v includes_directory="${includes_directory print line } close(fullpath) + return 1 + } + + function include_file(spec, n, parts, i, success) { + n = split(spec, parts, /\|/) + success = 0 + for (i = 1; i <= n; i++) { + gsub(/^[ \t]+|[ \t]+$/, "", parts[i]) # trim + if (parts[i] == "") continue + if (try_include(parts[i])) { + success = 1 + break + } + } + if (!success) { + print "ERROR: missing include file(s): " spec > "/dev/stderr" + exit 1 + } } { - # Look for the include marker using index+substr (portable) - if ($0 ~ /") fname = substr($0, start, end - start) - gsub(/^[ \t]+|[ \t]+$/, "", fname) # trim spaces + gsub(/^[ \t]+|[ \t]+$/, "", fname) include_file(fname) } else { @@ -69,4 +84,5 @@ echo -n "${markdown_template}" | awk -v includes_directory="${includes_directory } ' + #echo "embedMarkdownIncludes: $(date +'%Y-%m-%dT%H:%M:%S%z') Successfully finished." >&2 \ No newline at end of file diff --git a/scripts/markdown/testEmbedMarkdownIncludes.sh b/scripts/markdown/testEmbedMarkdownIncludes.sh index 3396b180a..ad4482ea2 100755 --- a/scripts/markdown/testEmbedMarkdownIncludes.sh +++ b/scripts/markdown/testEmbedMarkdownIncludes.sh @@ -105,9 +105,31 @@ set -o errexit if [ ${exitCode} -eq 0 ]; then fail "2.) Test failed: Expected an error due to missing include file, but the script succeeded." fi -if [[ "${errorOutput}" != *"ERROR: missing file"* ]]; then +if [[ "${errorOutput}" != *"ERROR: missing include file"* ]]; then fail "2.) Test failed: Expected error message to contain 'ERROR: missing file', but got '${errorOutput}'." fi +# ------------------------------------------------------------ +# Test case -- +# ------------------------------------------------------------ +echo "testEmbedMarkdownIncludes: 4.) The fallback include is used when the main include is missing" + +# - Setup +testFallbackIncludeFileName="testFallbackInclude.md" +echo "" > "${testMarkdownTemplate}" + +testFallbackIncludeFile="includes/${testFallbackIncludeFileName}" +expected_test_include_content="This is the included content from the fallback include." +echo "${expected_test_include_content}" > "${temporaryTestDirectory}/${testFallbackIncludeFile}" + +# - Execute script under test +embeddedContent=$(cd "${temporaryTestDirectory}"; cat "${testMarkdownTemplate}" | "${MARKDOWN_SCRIPTS_DIR}/embedMarkdownIncludes.sh") + +# - Verify results +if [ "${embeddedContent}" != "${expected_test_include_content}" ]; then + fail "4.) Test failed: Expected embedded content to be '${expected_test_include_content}', but got '${embeddedContent}'." 
+fi + + successful return 0 \ No newline at end of file From 2b972778508c3c7fe54bbf383f1d882466b85dd2 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Sat, 6 Sep 2025 11:11:39 +0200 Subject: [PATCH 06/13] Add sum of node embeddings contributing to the anomaly score (SHAP value) --- ...yDetectionIsolationForestExploration.ipynb | 54 ++++++++ .../tunedAnomalyDetectionExplained.py | 127 +++++++++++++++++- 2 files changed, 180 insertions(+), 1 deletion(-) diff --git a/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb b/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb index e158835ed..757966892 100644 --- a/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb +++ b/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb @@ -1644,6 +1644,55 @@ " return anomaly_detected_features\n" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "486948e3", + "metadata": {}, + "outputs": [], + "source": [ + "def add_node_embedding_shap_sum(\n", + " shap_anomaly_values: np.ndarray,\n", + " feature_names: list[str],\n", + " anomaly_detected_features: pd.DataFrame,\n", + " anomaly_label_column: str = \"anomalyLabel\",\n", + " output_column_name: str = \"anomalyNodeEmbeddingSHAPSum\"\n", + ") -> pd.DataFrame:\n", + " \"\"\"\n", + " Adds a column with the sum of SHAP values for all features that start with 'nodeEmbedding'.\n", + " The sum is signed, so that negative values contributing to an anomaly are reduced by positive numbers indicating \"normal\" tendencies.\n", + "\n", + " Parameters:\n", + " - shap_anomaly_values: SHAP values array with shape (n_samples, n_features).\n", + " - feature_names: List of names corresponding to the features.\n", + " - anomaly_detected_features: Original DataFrame containing anomaly labels.\n", + " - anomaly_label_column: Name of the column indicating anomalies (1 = anomaly).\n", + " - output_column_name: Name of the new column to store the SHAP sum.\n", + "\n", + " Returns:\n", + " - DataFrame with an additional column containing the summed SHAP values for nodeEmbedding features.\n", + " \"\"\"\n", + " # Convert SHAP values into a DataFrame for easier manipulation\n", + " shap_values_dataframe = pd.DataFrame(shap_anomaly_values, columns=feature_names)\n", + "\n", + " # Identify all features whose names start with \"nodeEmbedding\"\n", + " node_embedding_features = [name for name in feature_names if name.startswith(\"nodeEmbedding\")]\n", + "\n", + " # Default initialize new column\n", + " anomaly_detected_features[output_column_name] = 0.0\n", + "\n", + " # Get indices of rows marked as anomalies\n", + " anomaly_indices = anomaly_detected_features[anomaly_detected_features[anomaly_label_column] == 1].index\n", + "\n", + " # Compute raw signed sum of SHAP values for each anomaly row\n", + " for row_index in anomaly_indices:\n", + " row_shap_values = shap_values_dataframe.loc[row_index, node_embedding_features]\n", + " shap_sum = row_shap_values.sum() # signed sum\n", + " anomaly_detected_features.at[row_index, output_column_name] = shap_sum\n", + "\n", + " return anomaly_detected_features" + ] + }, { "cell_type": "code", "execution_count": null, @@ -1656,6 +1705,11 @@ " feature_names=java_package_anomaly_detection_feature_names,\n", " anomaly_detected_features=java_package_anomaly_detection_features\n", ")\n", + "add_node_embedding_shap_sum(\n", + " 
shap_anomaly_values=java_package_anomalies_explanation_results.shap_anomaly_values,\n", + " feature_names=java_package_anomaly_detection_feature_names,\n", + " anomaly_detected_features=java_package_anomaly_detection_features \n", + ")\n", "display(java_package_anomaly_detection_features[java_package_anomaly_detection_features[\"anomalyLabel\"] == 1].sort_values(by='anomalyScore', ascending=False).head(10))" ] }, diff --git a/domains/anomaly-detection/tunedAnomalyDetectionExplained.py b/domains/anomaly-detection/tunedAnomalyDetectionExplained.py index 4760e5f4a..74d5a9b5c 100755 --- a/domains/anomaly-detection/tunedAnomalyDetectionExplained.py +++ b/domains/anomaly-detection/tunedAnomalyDetectionExplained.py @@ -260,7 +260,7 @@ def query_data(input_parameters: Parameters = Parameters.example()) -> pd.DataFr ,codeUnit.centralityArticleRank AS articleRank ,codeUnit.centralityPageRank - codeUnit.centralityArticleRank AS pageToArticleRankDifference ,codeUnit.centralityBetweenness AS betweenness - ,codeUnit.communityLocalClusteringCoefficient AS locallusteringCoefficient + ,codeUnit.communityLocalClusteringCoefficient AS localClusteringCoefficient ,1.0 - codeUnit.clusteringHDBSCANProbability AS clusterApproximateOutlierScore ,codeUnit.clusteringHDBSCANNoise AS clusterNoise ,codeUnit.clusteringHDBSCANRadiusAverage AS clusterRadiusAverage @@ -754,6 +754,73 @@ def plot_shap_explained_beeswarm( plot.close() +def plot_shap_explained_local_feature_importance( + index_to_explain, + anomalies_explanation_results: AnomaliesExplanationResults, + prepared_features: np.ndarray, + feature_names: list[str], + title: str, + plot_file_path: str, + rounding_precision: int = 4, +): + """ + Uses the SHAP values for anomalies to visualize the local feature importance for a specific anomaly. + This function generates a force plot showing how each feature contributes to the anomaly score for a specific anomaly instance. + The force plot is a powerful visualization that helps to understand the impact of each feature for each as anomaly classified data point. + Visual breakdown of how each feature contributes to the score. + Highly interpretable for debugging single nodes. 
+ """ + shap_anomaly_values = anomalies_explanation_results.shap_anomaly_values + expected_anomaly_value = anomalies_explanation_results.shap_expected_anomaly_value + + shap_values_rounded = np.round(shap_anomaly_values[index_to_explain], rounding_precision) + prepared_features_rounded = prepared_features[index_to_explain].round(rounding_precision) + base_value_rounded = np.round(expected_anomaly_value, rounding_precision) + + shap.force_plot( + base_value_rounded, + shap_values_rounded, + prepared_features_rounded, + feature_names=feature_names, + matplotlib=True, + show=False, + contribution_threshold=0.06 + ) + current_figure = plot.gcf() + + # Resize fonts manually (best effort, affects all text) + for text in current_figure.findobj(match=plot.Text): + text.set_fontsize(10) # Set smaller font + + plot.title(title, fontsize=16, loc='left', y=0.05) + plot.savefig(plot_file_path) + plot.close() + + +def plot_all_shap_explained_local_feature_importance( + data: pd.DataFrame, + explanation_results: AnomaliesExplanationResults, + prepared_features: np.ndarray, + feature_names: list[str], + parameters: Parameters, + title_prefix: str = "", + code_unit_name_column: str = "codeUnitName" + ) -> None: + + index=0 + for row_index, row in data.iterrows(): + row_index = typing.cast(int, row_index) + index=index+1 + plot_shap_explained_local_feature_importance( + index_to_explain=row_index, + anomalies_explanation_results=explanation_results, + prepared_features=prepared_features, + feature_names=feature_names, + title=f"{title_prefix} \"{row[code_unit_name_column]}\" anomaly #{index} explained", + plot_file_path=get_file_path(f"{title_prefix}_Anomaly_{index}_shap_explanation", parameters), + ) + + def plot_shap_explained_top_10_feature_dependence( shap_anomaly_values: np.ndarray, prepared_features: np.ndarray, @@ -838,6 +905,48 @@ def add_top_shap_features_to_anomalies( return anomaly_detected_features +def add_node_embedding_shap_sum( + shap_anomaly_values: np.ndarray, + feature_names: list[str], + anomaly_detected_features: pd.DataFrame, + anomaly_label_column: str = "anomalyLabel", + output_column_name: str = "anomalyNodeEmbeddingSHAPSum" +) -> pd.DataFrame: + """ + Adds a column with the sum of SHAP values for all features that start with 'nodeEmbedding'. + The sum is signed, so that negative values contributing to an anomaly are reduced by positive numbers indicating "normal" tendencies. + + Parameters: + - shap_anomaly_values: SHAP values array with shape (n_samples, n_features). + - feature_names: List of names corresponding to the features. + - anomaly_detected_features: Original DataFrame containing anomaly labels. + - anomaly_label_column: Name of the column indicating anomalies (1 = anomaly). + - output_column_name: Name of the new column to store the SHAP sum. + + Returns: + - DataFrame with an additional column containing the summed SHAP values for nodeEmbedding features. 
+ """ + # Convert SHAP values into a DataFrame for easier manipulation + shap_values_dataframe = pd.DataFrame(shap_anomaly_values, columns=feature_names) + + # Identify all features whose names start with "nodeEmbedding" + node_embedding_features = [name for name in feature_names if name.startswith("nodeEmbedding")] + + # Default initialize new column + anomaly_detected_features[output_column_name] = 0.0 + + # Get indices of rows marked as anomalies + anomaly_indices = anomaly_detected_features[anomaly_detected_features[anomaly_label_column] == 1].index + + # Compute raw signed sum of SHAP values for each anomaly row + for row_index in anomaly_indices: + row_shap_values = shap_values_dataframe.loc[row_index, node_embedding_features] + shap_sum = row_shap_values.sum() # signed sum + anomaly_detected_features.at[row_index, output_column_name] = shap_sum + + return anomaly_detected_features + + # ------------------------------------------------------------------------------------------------------------ # MAIN # ------------------------------------------------------------------------------------------------------------ @@ -913,6 +1022,15 @@ def add_top_shap_features_to_anomalies( plot_file_path=get_file_path(f"{plot_prefix}_Anomaly_feature_importance_explained", parameters) ) +plot_all_shap_explained_local_feature_importance( + data=get_top_10_anomalies(features), + explanation_results=explanation_results, + prepared_features=features_prepared, + feature_names=feature_names, + parameters=parameters, + title_prefix=plot_prefix +) + plot_shap_explained_top_10_feature_dependence( shap_anomaly_values=explanation_results.shap_anomaly_values, prepared_features=features_prepared, @@ -927,6 +1045,12 @@ def add_top_shap_features_to_anomalies( anomaly_detected_features=features ) +add_node_embedding_shap_sum( + shap_anomaly_values=explanation_results.shap_anomaly_values, + feature_names=feature_names, + anomaly_detected_features=features +) + if parameters.is_verbose(): print("tunedAnomalyDetectionExplained: Features with added anomaly score explanation columns:") print(features[features["anomalyLabel"] == 1].sort_values(by='anomalyScore', ascending=False).head(10)) @@ -941,6 +1065,7 @@ def add_top_shap_features_to_anomalies( 'anomalyTopFeatureSHAPValue1': features['anomalyTopFeatureSHAPValue_1'], 'anomalyTopFeatureSHAPValue2': features['anomalyTopFeatureSHAPValue_2'], 'anomalyTopFeatureSHAPValue3': features['anomalyTopFeatureSHAPValue_3'], + 'anomalyNodeEmbeddingSHAPSum': features['anomalyNodeEmbeddingSHAPSum'], }) write_batch_data_into_database(data_to_write, parameters.get_projection_node_label(), verbose=parameters.is_verbose()) From b5843db682530f9c481a5090b347b16d02edf174 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Sat, 27 Sep 2025 12:44:29 +0200 Subject: [PATCH 07/13] Output Markdown table with shapley explained top features --- .../tunedAnomalyDetectionExplained.py | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/domains/anomaly-detection/tunedAnomalyDetectionExplained.py b/domains/anomaly-detection/tunedAnomalyDetectionExplained.py index 74d5a9b5c..5210e97c7 100755 --- a/domains/anomaly-detection/tunedAnomalyDetectionExplained.py +++ b/domains/anomaly-detection/tunedAnomalyDetectionExplained.py @@ -10,6 +10,7 @@ # - Provide the password for Neo4j in the environment variable "NEO4J_INITIAL_PASSWORD". 
# - Requires "tunedLeidenCommunityDetection.py", "tunedNodeEmbeddingClustering.py" and "umap2dNodeEmbedding.py" to be executed before this script to provide the necessary data. +from re import M import typing import numpy.typing as numpy_typing @@ -947,6 +948,58 @@ def add_node_embedding_shap_sum( return anomaly_detected_features +def output_top_shap_explained_global_features_as_markdown_table( + shap_anomaly_values: np.ndarray, + feature_names: list[str], + output_file_path: str, + top_n_features: int = 10 +): + # Compute mean absolute shap value across all samples for each feature (importance ranking) + mean_absolute_shap_values = np.abs(shap_anomaly_values).mean(axis=0) + + # Create DataFrame with feature names and mean shap values + feature_importance = pd.DataFrame({ + "Feature": feature_names, + "Mean absolute SHAP value": mean_absolute_shap_values + }) + + # Aggregate all nodeEmbedding* features + mask = feature_importance["Feature"].str.startswith("nodeEmbedding") + node_embedding_sum = feature_importance.loc[mask, "Mean absolute SHAP value"].sum() + + # Append aggregated feature + feature_importance = pd.concat([ + feature_importance, + pd.DataFrame([{ + "Feature": "*Node embeddings aggregated*", + "Mean absolute SHAP value": node_embedding_sum + }]) + ]) + + # Sort by importance + top_features = feature_importance.sort_values("Mean absolute SHAP value", ascending=False).head(top_n_features + 1) + + # Build markdown table manually using column names + headers = list(top_features.columns) + rows = top_features.values.tolist() + + markdown_header_row = "| " + " | ".join(headers) + " |\n" + markdown_table = markdown_header_row + + markdown_header_separator_row = "| " + " | ".join(["---"] * len(headers)) + " |\n" + markdown_table += markdown_header_separator_row + + for row in rows: + markdown_data_row = "| " + " | ".join([str(row[0]), f"{row[1]:.6f}"]) + " |\n" + markdown_table += markdown_data_row + + # Save to file + with open(output_file_path, "w") as f: + f.write(markdown_table) + + print(f"tunedAnomalyDetectionExplained: Markdown table with top {top_n_features} SHAP explained features saved to {output_file_path}") + + # ------------------------------------------------------------------------------------------------------------ # MAIN # ------------------------------------------------------------------------------------------------------------ @@ -1051,6 +1104,12 @@ def add_node_embedding_shap_sum( anomaly_detected_features=features ) +output_top_shap_explained_global_features_as_markdown_table( + shap_anomaly_values=explanation_results.shap_anomaly_values, + feature_names=feature_names, + output_file_path=get_file_path(f"{plot_prefix}_Top_anomaly_features", parameters, 'md') +) + if parameters.is_verbose(): print("tunedAnomalyDetectionExplained: Features with added anomaly score explanation columns:") print(features[features["anomalyLabel"] == 1].sort_values(by='anomalyScore', ascending=False).head(10)) From 3e7f1f3863f2a3da10b4781114d796f8e5e514ca Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Sun, 7 Sep 2025 09:00:34 +0200 Subject: [PATCH 08/13] Add anomaly archetype labels and reports --- .../anomaly-detection/anomalyDetectionCsv.sh | 27 +- .../anomalyDetectionPython.sh | 23 ++ .../explore/AnomalyDetectionExploration.ipynb | 366 +++++++++++++++++- .../AnomalyDetectionArchetypeAuthority.cypher | 58 +++ ...AnomalyDetectionArchetypeBottleneck.cypher | 32 ++ .../AnomalyDetectionArchetypeBridge.cypher | 26 ++ 
.../AnomalyDetectionArchetypeHub.cypher | 41 ++ .../AnomalyDetectionArchetypeOutlier.cypher | 37 ++ ...omalyDetectionArchetypeRemoveLabels.cypher | 9 + 9 files changed, 615 insertions(+), 4 deletions(-) create mode 100644 domains/anomaly-detection/labels/AnomalyDetectionArchetypeAuthority.cypher create mode 100644 domains/anomaly-detection/labels/AnomalyDetectionArchetypeBottleneck.cypher create mode 100644 domains/anomaly-detection/labels/AnomalyDetectionArchetypeBridge.cypher create mode 100644 domains/anomaly-detection/labels/AnomalyDetectionArchetypeHub.cypher create mode 100644 domains/anomaly-detection/labels/AnomalyDetectionArchetypeOutlier.cypher create mode 100644 domains/anomaly-detection/labels/AnomalyDetectionArchetypeRemoveLabels.cypher diff --git a/domains/anomaly-detection/anomalyDetectionCsv.sh b/domains/anomaly-detection/anomalyDetectionCsv.sh index ea57f0f30..1d4452135 100755 --- a/domains/anomaly-detection/anomalyDetectionCsv.sh +++ b/domains/anomaly-detection/anomalyDetectionCsv.sh @@ -25,8 +25,9 @@ SCRIPTS_DIR=${SCRIPTS_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/../../scripts"} # Re # Get the "cypher" query directory for gathering features. ANOMALY_DETECTION_FEATURE_CYPHER_DIR=${ANOMALY_DETECTION_FEATURE_CYPHER_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/features"} ANOMALY_DETECTION_QUERY_CYPHER_DIR=${ANOMALY_DETECTION_QUERY_CYPHER_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/queries"} +ANOMALY_DETECTION_LABEL_CYPHER_DIR=${ANOMALY_DETECTION_LABEL_CYPHER_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/labels"} -# Define functions to execute a cypher query from within a given file (first and only argument) like "execute_cypher" +# Define functions to execute a cypher query from within a given file (first and only argument) like "execute_cypher" and "execute_cypher_summarized" source "${SCRIPTS_DIR}/executeQueryFunctions.sh" # Define functions to create and delete Graph Projections like "createUndirectedDependencyProjection" @@ -60,6 +61,7 @@ anomaly_detection_features() { execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-ArticleRank-Exists.cypher" \ "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-ArticleRank-Write.cypher" "${@}" } + # Run queries to find anomalies in the graph. # # Required Parameters: @@ -85,6 +87,28 @@ anomaly_detection_queries() { execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionUnexpectedCentralNodes.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_UnexpectedCentralNodes.csv" } +# Label code units with top anomalies by archetype. +# +# Required Parameters: +# - projection_node_label=... +# Label of the nodes that will be used for the projection. Example: "Package" +anomaly_detection_labels() { + local nodeLabel + nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" ) + + local language + language=$( extractQueryParameter "projection_language" "${@}" ) + + echo "anomalyDetectionCsv: $(date +'%Y-%m-%dT%H:%M:%S%z') Labelling ${language} ${nodeLabel} anomalies..." 
+ execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeRemoveLabels.cypher" "${@}" + execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeAuthority.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyArchetypeTopAuthority.csv" + execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBottleneck.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyArchetypeTopBottleneck.csv" + execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeHub.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyArchetypeTopHub.csv" + # The following two label types require Python scripts to run first and are skipped here intentionally: + # execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBridge.cypher" "${@}" + # execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeOutlier.cypher" "${@}" +} + # Run the anomaly detection pipeline. # # Required Parameters: @@ -97,6 +121,7 @@ anomaly_detection_queries() { anomaly_detection_csv_reports() { time anomaly_detection_features "${@}" time anomaly_detection_queries "${@}" + time anomaly_detection_labels "${@}" } # Create report directory diff --git a/domains/anomaly-detection/anomalyDetectionPython.sh b/domains/anomaly-detection/anomalyDetectionPython.sh index bd12c2f87..358fcae8e 100755 --- a/domains/anomaly-detection/anomalyDetectionPython.sh +++ b/domains/anomaly-detection/anomalyDetectionPython.sh @@ -25,6 +25,7 @@ SCRIPTS_DIR=${SCRIPTS_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/../../scripts"} # Re # Get the "cypher" query directory for gathering features. ANOMALY_DETECTION_FEATURE_CYPHER_DIR=${ANOMALY_DETECTION_FEATURE_CYPHER_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/features"} ANOMALY_DETECTION_QUERY_CYPHER_DIR=${ANOMALY_DETECTION_QUERY_CYPHER_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/queries"} +ANOMALY_DETECTION_LABEL_CYPHER_DIR=${ANOMALY_DETECTION_LABEL_CYPHER_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/labels"} # Function to display script usage usage() { @@ -138,6 +139,27 @@ anomaly_detection_using_python() { execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeatures.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}AnomalyDetection_Features.csv" } +# Label code units with top anomalies by archetype. +# +# Required Parameters: +# - projection_node_label=... +# Label of the nodes that will be used for the projection. Example: "Package" +anomaly_detection_labels() { + local nodeLabel + nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" ) + + local language + language=$( extractQueryParameter "projection_language" "${@}" ) + + echo "anomalyDetectionPython: $(date +'%Y-%m-%dT%H:%M:%S%z') Labelling ${language} ${nodeLabel} anomalies..." 
+ execute_cypher_summarized "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeRemoveLabels.cypher" "${@}" + execute_cypher_summarized "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeAuthority.cypher" "${@}" + execute_cypher_summarized "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBottleneck.cypher" "${@}" + execute_cypher_summarized "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeHub.cypher" "${@}" + execute_cypher_summarized "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBridge.cypher" "${@}" + execute_cypher_summarized "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeOutlier.cypher" "${@}" +} + # Run the anomaly detection pipeline. # # Required Parameters: @@ -150,6 +172,7 @@ anomaly_detection_using_python() { anomaly_detection_python_reports() { time anomaly_detection_features "${@}" anomaly_detection_using_python "${@}" + time anomaly_detection_labels "${@}" } # Create report directory diff --git a/domains/anomaly-detection/explore/AnomalyDetectionExploration.ipynb b/domains/anomaly-detection/explore/AnomalyDetectionExploration.ipynb index 06989e6bd..3bf3531d2 100644 --- a/domains/anomaly-detection/explore/AnomalyDetectionExploration.ipynb +++ b/domains/anomaly-detection/explore/AnomalyDetectionExploration.ipynb @@ -167,10 +167,12 @@ " OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit)\n", " WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName \n", " WITH * \n", - " ,coalesce(codeUnit.incomingDependencies, 0) AS incomingDependencies\n", - " ,coalesce(codeUnit.outgoingDependencies, 0) AS outgoingDependencies\n", + " ,coalesce(codeUnit.incomingDependencies, 0) AS incomingDependencies\n", + " ,coalesce(codeUnit.outgoingDependencies, 0) AS outgoingDependencies\n", " ,coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName\n", - " ,coalesce(artifactName, projectName, \"\") AS projectName\n", + " ,coalesce(artifactName, projectName, \"\") AS projectName\n", + " ,coalesce(codeUnit.anomalyScore, 0.0) AS anomalyScore\n", + " ,coalesce(codeUnit.anomalyNodeEmbeddingSHAPSum, 0.0) AS anomalyNodeEmbeddingSHAPSum\n", " RETURN DISTINCT \n", " codeUnitName\n", " ,codeUnit.name AS shortCodeUnitName\n", @@ -195,6 +197,8 @@ " ,codeUnit.clusteringHDBSCANSize AS clusterSize\n", " ,codeUnit.clusteringHDBSCANLabel AS clusterLabel\n", " ,codeUnit.clusteringHDBSCANMedoid AS clusterMedoid\n", + " ,CASE WHEN anomalyScore < 0.0 THEN 0.0 ELSE anomalyScore END AS anomalyScore\n", + " ,anomalyNodeEmbeddingSHAPSum * -1.0 AS negatedAnomalyNodeEmbeddingSHAPSum\n", " ,codeUnit.embeddingsFastRandomProjectionTunedForClusteringVisualizationX AS embeddingVisualizationX\n", " ,codeUnit.embeddingsFastRandomProjectionTunedForClusteringVisualizationY AS embeddingVisualizationY\n", " \"\"\"\n", @@ -1540,6 +1544,362 @@ " color_column_name='betweenness'\n", ")" ] + }, + { + "cell_type": "markdown", + "id": "19552eea", + "metadata": {}, + "source": [ + "### 2.4 Best Pareto Frontier tradeoff feature combinations and archetypes\n", + "\n", + "Multi objective optimization for anomaly detection. Combining multiple metrics to identify anomalies that may not be apparent when considering each metric in isolation." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8ba89c3b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def add_rank_column(\n",
+    "    data: pd.DataFrame,\n",
+    "    value_column_name: str,\n",
+    "    ascending: bool = False\n",
+    ") -> pd.DataFrame:\n",
+    "    \"\"\"\n",
+    "    Adds a ranking column to the DataFrame based on the specified value column.\n",
+    "    \n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    data : pd.DataFrame\n",
+    "        The input DataFrame.\n",
+    "    value_column_name : str\n",
+    "        The name of the column based on which the ranking is computed.\n",
+    "    ascending : bool, optional\n",
+    "        If True, ranks in ascending order (default is False for descending order).\n",
+    "    \n",
+    "    Returns\n",
+    "    -------\n",
+    "    pd.DataFrame\n",
+    "        The DataFrame with the new ranking column added.\n",
+    "    \"\"\"\n",
+    "    if value_column_name not in data.columns:\n",
+    "        raise ValueError(f\"Column '{value_column_name}' does not exist in the DataFrame.\")\n",
+    "    if data.empty:\n",
+    "        print(\"DataFrame is empty. No ranking column added.\")\n",
+    "        return data\n",
+    "    if value_column_name + '_ranking' in data.columns:\n",
+    "        print(f\"Ranking column '{value_column_name}_ranking' already exists. No new column added.\")\n",
+    "        return data\n",
+    "    data[value_column_name + '_ranking'] = data[value_column_name].rank(ascending=ascending, method='dense').astype(int)\n",
+    "    return data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9533f2b6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "code_unit_columns = ['projectName', 'codeUnitName']\n",
+    "features_to_rank = ['anomalyScore', 'degree', 'pageRank', 'articleRank', 'pageToArticleRankDifference', 'betweenness', 'negatedAnomalyNodeEmbeddingSHAPSum',\n",
+    "                    'inverseClusteringCoefficient', 'clusterApproximateOutlierScore', 'clusterRadiusAverage', 'clusterSize', 'clusterDistanceToMedoid']\n",
+    "\n",
+    "for feature in features_to_rank:\n",
+    "    java_type_features = add_rank_column(java_type_features, feature, ascending=False)\n",
+    "\n",
+    "# display(java_type_features.sort_values(by='anomalyScore', ascending=False)[code_unit_columns + features_to_rank].head(20))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "64071a2e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def pareto_frontier(input_data, metrics, maximize=True):\n",
+    "    \"\"\"\n",
+    "    Extracts the Pareto frontier (skyline) from a DataFrame.\n",
+    "\n",
+    "    input_data: DataFrame\n",
+    "    metrics: list of column names to consider\n",
+    "    maximize: True if higher is better for all metrics\n",
+    "    \"\"\"\n",
+    "    data = input_data[metrics].to_numpy()\n",
+    "    if not maximize:\n",
+    "        data = -data # flip sign if minimizing\n",
+    "    \n",
+    "    # Keep track of which rows are dominated (start with none)\n",
+    "    is_dominated = np.zeros(len(data), dtype=bool)\n",
+    "    for i, point in enumerate(data):\n",
+    "        # Skip rows that are already known to be dominated\n",
+    "        if is_dominated[i]:\n",
+    "            continue\n",
+    "        # Mark every row that this point dominates: at least as good in\n",
+    "        # all metrics and strictly better in at least one of them\n",
+    "        dominated = np.all(point >= data, axis=1) & np.any(point > data, axis=1)\n",
+    "        is_dominated |= dominated\n",
+    "    \n",
+    "    # Keep only non-dominated rows = Pareto frontier\n",
+    "    return input_data[~is_dominated].reset_index(drop=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9e81641f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_best_feature_tradeoff_code_units(\n",
+    "    data: pd.DataFrame,\n",
+    "    feature_names: list[str],\n",
+    "    code_unit_columns: list[str] = ['projectName', 'shortCodeUnitName', 'codeUnitName'],\n",
+    "    top_n: int = 10\n",
+    ") -> pd.DataFrame:\n",
+    "    \"\"\"\n",
+    "    Identifies code units that represent the best trade-offs across multiple features using the Pareto frontier.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    data : pd.DataFrame\n",
+    "        The input DataFrame containing code unit features.\n",
+    "    feature_names : list of str\n",
+    "        List of feature column names to consider for the Pareto frontier.\n",
+    "    code_unit_columns : list of str\n",
+    "        List of columns that identify the code units (e.g., name, project) (default is ['projectName', 'shortCodeUnitName', 'codeUnitName']).\n",
+    "    top_n : int, optional\n",
+    "        Number of top code units to return from the Pareto frontier (default is 10).\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    pd.DataFrame\n",
+    "        DataFrame containing the top code units on the Pareto frontier with their features.\n",
+    "    \"\"\"\n",
+    "    if data.empty:\n",
+    "        print(\"DataFrame is empty. No Pareto frontier can be computed.\")\n",
+    "        return data\n",
+    "\n",
+    "    features_rank_columns = [feature + '_ranking' for feature in feature_names]\n",
+    "    selected_columns = code_unit_columns + feature_names + features_rank_columns\n",
+    "    # Assumes all supplied metrics are oriented so that higher values are more anomalous\n",
+    "    pareto_best_feature_tradeoffs = pareto_frontier(data, feature_names, maximize=True)\n",
+    "    return pareto_best_feature_tradeoffs[selected_columns].head(top_n)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9e8d270b",
+   "metadata": {},
+   "source": [
+    "#### 2.4.0 Pareto best trade-offs of all features"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f8445365",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "display(get_best_feature_tradeoff_code_units(java_type_features, features_to_rank))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "39b3adf7",
+   "metadata": {},
+   "source": [
+    "#### 2.4.1 Hub (High degree, low clustering coefficient) - Best Pareto feature trade-offs\n",
+    "\n",
+    "**Definition:**\n",
+    "A node with unusually high **degree centrality** (many direct connections) compared to its peers, often with **low clustering coefficient** (its neighbors are not connected to each other).\n",
+    "\n",
+    "**In software:**\n",
+    "\n",
+    "* A class/package/module that is used **everywhere** → often “God classes” or utility-heavy components.\n",
+    "* Can indicate **violation of modularity** or **overgeneralization** (too many responsibilities).\n",
+    "\n",
+    "**Implications:**\n",
+    "\n",
+    "* Increases **coupling**, reduces maintainability.\n",
+    "* Single point of failure: refactoring or breaking changes ripple through the system.\n",
+    "\n",
+    "**Variants:**\n",
+    "\n",
+    "* In-degree hub (high fan-in): Many other code units depend on this one. Indicates re-use, but also high coupling. Classic sign of God Class / Utility Class (everywhere referenced).\n",
+    "* Out-degree hub (high fan-out): This code unit depends on many others. Indicates broad knowledge of the system. Often suggests Feature Envy or Controller classes (too many responsibilities).\n",
+    "\n",
+    "**References:**\n",
+    "\n",
+    "* Lanza & Marinescu, *Object-Oriented Metrics in Practice* (Springer, 2006) – “God Class” anti-pattern.\n",
+    "* Barabási, *Network Science* (Cambridge, 2016) – scale-free networks, hub nodes.",
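+    "\n",
+    "Note for the cell below: `degree`, as computed in the feature queries, is the sum of incoming and outgoing dependencies (fan-in + fan-out) and therefore does not distinguish the two variants; inspect the `incomingDependencies` and `outgoingDependencies` columns of `java_type_features` to tell them apart."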
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7ff7935", + "metadata": {}, + "outputs": [], + "source": [ + "hub_focussed_features = ['anomalyScore', 'degree', 'inverseClusteringCoefficient']\n", + "display(get_best_feature_tradeoff_code_units(java_type_features, hub_focussed_features))" + ] + }, + { + "cell_type": "markdown", + "id": "a5cfd235", + "metadata": {}, + "source": [ + "#### 2.4.2 Bottleneck (High betweenness, low redundancy) best Pareto feature trade-offs\n", + "\n", + "**Definition:**\n", + "A node with very high **betweenness centrality** – it lies on many shortest paths between other nodes.\n", + "\n", + "**In software:**\n", + "\n", + "* A package/module that acts as a **bridge between subsystems**.\n", + "* Often an **unintended dependency concentration**: if removed, communication between modules breaks.\n", + "\n", + "**Implications:**\n", + "\n", + "* Scalability risk: changes here affect many modules.\n", + "* Architectural smell: “concentration of control.”\n", + "\n", + "**References:**\n", + "\n", + "* MacCormack et al., *Exploring the Structure of Complex Software Designs* (Management Science, 2006) – dependency bottlenecks in software.\n", + "* Freeman, *Centrality in Social Networks* (Social Networks, 1977) – betweenness centrality.\n", + "* Valverde & Solé (2003): \"Hierarchical small worlds in software architecture\" → showed how real software dependency graphs often lack redundancy and thus create fragile bottlenecks." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1ea7fb2", + "metadata": {}, + "outputs": [], + "source": [ + "bottleneck_focussed_features = ['anomalyScore', 'betweenness', 'clusterApproximateOutlierScore']\n", + "display(get_best_feature_tradeoff_code_units(java_type_features, bottleneck_focussed_features))" + ] + }, + { + "cell_type": "markdown", + "id": "2cc86c29", + "metadata": {}, + "source": [ + "#### 2.4.3 Outlier (High cluster distance, small cluster size) best Pareto feature trade-offs\n", + "\n", + "**Definition:**\n", + "A node that is **structurally far away** from its assigned cluster/community (large distance to medoid, very small cluster size).\n", + "\n", + "**In software:**\n", + "\n", + "* A class/module that doesn’t fit into any architectural layer cleanly.\n", + "* Example: a utility hidden inside a domain-specific cluster, or a feature with **no clear dependencies**.\n", + "\n", + "**Implications:**\n", + "\n", + "* Possible **code smell**: “orphan” or “misplaced class.”\n", + "* Hard to reason about, maintain, or assign ownership.\n", + "* Unusual dependency pattern\n", + "* Architectural mismatch when approximate outlier score is also high\n", + "\n", + "**References:**\n", + "\n", + "* Koschke, *Software Clustering: Extracting Structure from Source Code* (IEEE TSE, 2006).\n", + "* Hinneburg & Keim, *Optimal Grid-Clustering* (VLDB, 1999) – cluster outliers in high-dimensional data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73daaa7d", + "metadata": {}, + "outputs": [], + "source": [ + "outlier_focussed_features = ['anomalyScore', 'clusterDistanceToMedoid', 'clusterApproximateOutlierScore']\n", + "display(get_best_feature_tradeoff_code_units(java_type_features, outlier_focussed_features))" + ] + }, + { + "cell_type": "markdown", + "id": "c4df5cce", + "metadata": {}, + "source": [ + "#### 2.4.4 Authority (High PageRank, low articleRank) best Pareto feature trade-offs\n", + "\n", + "**Definition:**\n", + "A node with **high PageRank** but relatively **low ArticleRank** or similar ranking mismatch → suggests **influence disproportionate to usage context**.\n", + "\n", + "**In software:**\n", + "\n", + "* A module referenced widely but not strongly contributing back (utility libraries, framework entry points).\n", + "* Could indicate **monopoly dependencies** (e.g., logging frameworks, base classes).\n", + "\n", + "**Implications:**\n", + "\n", + "* Central authority role can be intended (core library), but in anomaly context, it may indicate **over-centralization**.\n", + "* A design smell: one class \"knows too much\" or others depend on it excessively.\n", + "* Over-relied utility with few reverse connections.\n", + "\n", + "**References:**\n", + "\n", + "* Kleinberg, *Authoritative Sources in a Hyperlinked Environment* (JACM, 1999) – HITS algorithm.\n", + "* Page et al., *The PageRank Citation Ranking* (Stanford Tech Report, 1999)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e3c53e58", + "metadata": {}, + "outputs": [], + "source": [ + "authority_focussed_features = ['anomalyScore', 'degree', 'pageRank', 'pageToArticleRankDifference']\n", + "display(get_best_feature_tradeoff_code_units(java_type_features, authority_focussed_features))" + ] + }, + { + "cell_type": "markdown", + "id": "179f9732", + "metadata": {}, + "source": [ + "#### 2.4.5 Bridge (Embedding-driven anomaly, cross-cluster) best Pareto feature trade-offs\n", + "\n", + "**Definition:**\n", + "A node whose embedding or SHAP contribution comes from **latent dimensions** (e.g., PCA components) rather than raw structural metrics → meaning it connects across **otherwise unrelated clusters**.\n", + "\n", + "**In software:**\n", + "\n", + "* A class/module that integrates concepts from multiple subsystems.\n", + "* May appear in embeddings as a “boundary object” that doesn’t belong to just one cluster.\n", + "\n", + "**Implications:**\n", + "\n", + "* Can be **legitimate integrators** (e.g., API facades) or **architecture violations** (tangled dependencies).\n", + "* Increases coupling between modules that should be independent.\n", + "* Connects unrelated domains, risky coupling\n", + "\n", + "**References:**\n", + "\n", + "* Conway’s Law (Conway, 1968) – bridges often mirror organizational seams.\n", + "* Borgatti & Everett, *Models of Core/Periphery Structures* (Social Networks, 2000)." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "902f3008",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "bridge_focussed_features = ['anomalyScore', 'negatedAnomalyNodeEmbeddingSHAPSum']\n",
+    "display(get_best_feature_tradeoff_code_units(java_type_features, bridge_focussed_features))"
+   ]
+  }
 ],
 "metadata": {
diff --git a/domains/anomaly-detection/labels/AnomalyDetectionArchetypeAuthority.cypher b/domains/anomaly-detection/labels/AnomalyDetectionArchetypeAuthority.cypher
new file mode 100644
index 000000000..efd080462
--- /dev/null
+++ b/domains/anomaly-detection/labels/AnomalyDetectionArchetypeAuthority.cypher
@@ -0,0 +1,58 @@
+// Anomaly Detection Labels: Label code units of archetype "Authority" by looking for the (at most) top 10 entries with a high PageRank >= 90% percentile and a high PageRank to ArticleRank difference >= 90% percentile. Requires features/*.cypher to be run first.
+// Shows code that is referenced widely but not strongly contributing back (utility libraries, framework entry points).
+
+ MATCH (codeUnit)
+ WHERE $projection_node_label IN labels(codeUnit)
+   AND codeUnit.centralityPageRank    IS NOT NULL
+   AND codeUnit.centralityArticleRank IS NOT NULL
+  WITH collect(codeUnit) AS codeUnits
+      ,min(codeUnit.centralityPageRank)    AS minPageRank
+      ,max(codeUnit.centralityPageRank)    AS maxPageRank
+      ,min(codeUnit.centralityArticleRank) AS minArticleRank
+      ,max(codeUnit.centralityArticleRank) AS maxArticleRank
+      ,percentileDisc(codeUnit.centralityPageRank, 0.90) AS pageRankThreshold
+UNWIND codeUnits AS codeUnit
+  WITH *
+ WHERE codeUnit.centralityPageRank >= pageRankThreshold
+  WITH *
+      ,(codeUnit.centralityPageRank - minPageRank) / (maxPageRank - minPageRank) AS normalizedPageRank
+      ,(codeUnit.centralityArticleRank - minArticleRank) / (maxArticleRank - minArticleRank) AS normalizedArticleRank
+  WITH *
+      ,normalizedPageRank - normalizedArticleRank AS normalizedPageRankToArticleRankDifference
+  WITH collect(codeUnit) AS codeUnits
+      ,minPageRank, maxPageRank, minArticleRank, maxArticleRank
+      ,percentileDisc(normalizedPageRankToArticleRankDifference, 0.90) AS pageToArticleRankDifferenceThreshold
+UNWIND codeUnits AS codeUnit
+  WITH *
+      ,(codeUnit.centralityPageRank - minPageRank) / (maxPageRank - minPageRank) AS normalizedPageRank
+      ,(codeUnit.centralityArticleRank - minArticleRank) / (maxArticleRank - minArticleRank) AS normalizedArticleRank
+  WITH *
+      ,normalizedPageRank - normalizedArticleRank AS normalizedPageRankToArticleRankDifference
+ WHERE normalizedPageRankToArticleRankDifference >= pageToArticleRankDifferenceThreshold
+OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit)
+  WITH *, artifact.name AS artifactName
+OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit)
+  WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName
+  WITH *, coalesce(artifactName, projectName) AS projectName
+ ORDER BY codeUnit.centralityPageRank DESC, codeUnit.centralityArticleRank ASC
+ LIMIT 10
+  WITH collect([codeUnit, projectName, normalizedPageRank, normalizedArticleRank, normalizedPageRankToArticleRankDifference]) AS results
+UNWIND range(0, size(results) - 1) AS codeUnitIndex
+  WITH codeUnitIndex + 1 AS codeUnitIndex
+      ,results[codeUnitIndex][0] AS codeUnit
+      ,results[codeUnitIndex][1] AS projectName
+      ,results[codeUnitIndex][2] AS normalizedPageRank
+      ,results[codeUnitIndex][3] AS normalizedArticleRank
+      ,results[codeUnitIndex][4] AS normalizedPageRankToArticleRankDifference
+   SET codeUnit:Mark4TopAnomalyAuthority
+      ,codeUnit.anomalyAuthorityRank = codeUnitIndex
+RETURN DISTINCT 
+       projectName
+      ,codeUnit.name AS shortCodeUnitName
+      ,coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName
+      ,codeUnit.centralityPageRank AS pageRank
+      ,codeUnit.centralityArticleRank AS articleRank
+      ,codeUnit.anomalyAuthorityRank AS rank
+      ,normalizedPageRank
+      ,normalizedArticleRank
+      ,normalizedPageRankToArticleRankDifference
\ No newline at end of file
diff --git a/domains/anomaly-detection/labels/AnomalyDetectionArchetypeBottleneck.cypher b/domains/anomaly-detection/labels/AnomalyDetectionArchetypeBottleneck.cypher
new file mode 100644
index 000000000..9dde4582d
--- /dev/null
+++ b/domains/anomaly-detection/labels/AnomalyDetectionArchetypeBottleneck.cypher
@@ -0,0 +1,32 @@
+// Anomaly Detection Labels: Label code units of archetype "Bottleneck" by looking for the (at most) top 10 entries with the highest Betweenness centrality >= 90% percentile. Requires features/*.cypher to be run first.
+// Shows key code that is heavily depended on and also controls flow — critical bottlenecks.
+// Potentially an unintended dependency concentration: if removed, communication between modules breaks.
+
+ MATCH (codeUnit)
+ WHERE $projection_node_label IN labels(codeUnit)
+   AND codeUnit.centralityBetweenness IS NOT NULL
+  WITH collect(codeUnit) AS codeUnits
+      ,percentileDisc(codeUnit.centralityBetweenness, 0.90) AS betweennessThreshold
+UNWIND codeUnits AS codeUnit
+  WITH *
+ WHERE codeUnit.centralityBetweenness >= betweennessThreshold
+OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit)
+  WITH *, artifact.name AS artifactName
+OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit)
+  WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName
+  WITH *, coalesce(artifactName, projectName) AS projectName
+ ORDER BY codeUnit.centralityBetweenness DESC
+ LIMIT 10
+  WITH collect([codeUnit, projectName]) AS results
+UNWIND range(0, size(results) - 1) AS codeUnitIndex
+  WITH codeUnitIndex + 1 AS codeUnitIndex
+      ,results[codeUnitIndex][0] AS codeUnit
+      ,results[codeUnitIndex][1] AS projectName
+   SET codeUnit:Mark4TopAnomalyBottleneck
+      ,codeUnit.anomalyBottleneckRank = codeUnitIndex
+RETURN DISTINCT 
+       projectName
+      ,codeUnit.name AS shortCodeUnitName
+      ,coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName
+      ,codeUnit.centralityBetweenness AS betweenness
+      ,codeUnit.anomalyBottleneckRank AS rank
\ No newline at end of file
diff --git a/domains/anomaly-detection/labels/AnomalyDetectionArchetypeBridge.cypher b/domains/anomaly-detection/labels/AnomalyDetectionArchetypeBridge.cypher
new file mode 100644
index 000000000..0d067130a
--- /dev/null
+++ b/domains/anomaly-detection/labels/AnomalyDetectionArchetypeBridge.cypher
@@ -0,0 +1,28 @@
+// Anomaly Detection Labels: Label code units of archetype "Bridge" by looking for the (at most) top 10 entries with the lowest (most negative) sum of the "nodeEmbedding" anomaly detection feature SHAP (explainable AI) values. Requires tunedAnomalyDetectionExplained.py to be run first.
+// Shows code that integrates various layers or boundaries (e.g., API facades) or violates architecture (tangled dependencies).
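+// Note: "anomalyNodeEmbeddingSHAPSum" sums the SHAP contributions of the node embedding dimensions to the anomaly score.
+// The most negative sums are listed first, on the assumption (following the SHAP convention used in tunedAnomalyDetectionExplained.py) that negative contributions push the score towards "anomalous".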
+
+ MATCH (codeUnit)
+ WHERE $projection_node_label IN labels(codeUnit)
+   AND codeUnit.anomalyNodeEmbeddingSHAPSum < 0
+OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit)
+  WITH *, artifact.name AS artifactName
+OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit)
+  WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName
+  WITH *, coalesce(artifactName, projectName) AS projectName
+ ORDER BY codeUnit.anomalyNodeEmbeddingSHAPSum ASC
+ LIMIT 10
+  WITH collect([codeUnit, projectName]) AS results
+UNWIND range(0, size(results) - 1) AS codeUnitIndex
+  WITH codeUnitIndex + 1 AS codeUnitIndex
+      ,results[codeUnitIndex][0] AS codeUnit
+      ,results[codeUnitIndex][1] AS projectName
+   SET codeUnit:Mark4TopAnomalyBridge
+      ,codeUnit.anomalyBridgeRank = codeUnitIndex
+RETURN DISTINCT 
+       projectName
+      ,codeUnit.name AS shortCodeUnitName
+      ,coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName
+      ,codeUnit.anomalyNodeEmbeddingSHAPSum AS nodeEmbeddingTop3SHAPValueSum
+      ,codeUnit.anomalyBridgeRank AS rank
\ No newline at end of file
diff --git a/domains/anomaly-detection/labels/AnomalyDetectionArchetypeHub.cypher b/domains/anomaly-detection/labels/AnomalyDetectionArchetypeHub.cypher
new file mode 100644
index 000000000..388b87317
--- /dev/null
+++ b/domains/anomaly-detection/labels/AnomalyDetectionArchetypeHub.cypher
@@ -0,0 +1,43 @@
+// Anomaly Detection Labels: Label code units of archetype "Hub" by looking for the (at most) top 10 entries with the highest degree >= 90% percentile and a local clustering coefficient <= 10% percentile. Requires features/*.cypher to be run first.
+// Shows code with many connections that are not well integrated into a cluster.
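+// Degree is computed as fan-in plus fan-out ("incomingDependencies" + "outgoingDependencies").
+// A low local clustering coefficient means the direct neighbors are barely connected to each other, so the hub is the main thing holding them together.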
+
+ MATCH (codeUnit)
+ WHERE $projection_node_label IN labels(codeUnit)
+   AND codeUnit.communityLocalClusteringCoefficient IS NOT NULL
+   AND codeUnit.incomingDependencies IS NOT NULL
+   AND codeUnit.outgoingDependencies IS NOT NULL
+  WITH *, codeUnit.incomingDependencies + codeUnit.outgoingDependencies AS degree
+  WITH collect(codeUnit) AS codeUnits
+      ,percentileDisc(degree, 0.90) AS degreeThreshold
+      ,percentileDisc(codeUnit.communityLocalClusteringCoefficient, 0.10) AS localClusteringCoefficientThreshold
+UNWIND codeUnits AS codeUnit
+  WITH *, codeUnit.incomingDependencies + codeUnit.outgoingDependencies AS degree
+ WHERE degree >= degreeThreshold
+   AND codeUnit.communityLocalClusteringCoefficient <= localClusteringCoefficientThreshold
+OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit)
+  WITH *, artifact.name AS artifactName
+OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit)
+  WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName
+  WITH *, coalesce(artifactName, projectName) AS projectName
+ ORDER BY codeUnit.communityLocalClusteringCoefficient ASC, degree DESC
+ LIMIT 10
+  WITH collect([codeUnit, projectName]) AS results
+UNWIND range(0, size(results) - 1) AS codeUnitIndex
+  WITH codeUnitIndex + 1 AS codeUnitIndex
+      ,results[codeUnitIndex][0] AS codeUnit
+      ,results[codeUnitIndex][1] AS projectName
+  WITH *
+      ,codeUnit.incomingDependencies + codeUnit.outgoingDependencies AS degree
+   SET codeUnit:Mark4TopAnomalyHub
+      ,codeUnit.anomalyHubRank = codeUnitIndex
+RETURN DISTINCT 
+       projectName
+      ,codeUnit.name AS shortCodeUnitName
+      ,coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName
+      ,codeUnit.communityLocalClusteringCoefficient AS localClusteringCoefficient
+      ,degree
+      ,codeUnit.incomingDependencies AS incomingDependencies
+      ,codeUnit.outgoingDependencies AS outgoingDependencies
+      ,codeUnit.anomalyHubRank AS rank
\ No newline at end of file
diff --git a/domains/anomaly-detection/labels/AnomalyDetectionArchetypeOutlier.cypher b/domains/anomaly-detection/labels/AnomalyDetectionArchetypeOutlier.cypher
new file mode 100644
index 000000000..380e4f015
--- /dev/null
+++ b/domains/anomaly-detection/labels/AnomalyDetectionArchetypeOutlier.cypher
@@ -0,0 +1,37 @@
+// Anomaly Detection Labels: Label code units of archetype "Outlier" by looking for the (at most) top 10 entries with a normalized distance to the cluster medoid (center) >= 90% percentile and a clustering probability (1.0 - approximate cluster outlier score) <= 30% percentile. Requires tunedNodeEmbeddingClustering.py to be run first.
+// Shows code that doesn't fit cleanly into any architectural layer or domain boundary.
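+// Note: "clusteringHDBSCANProbability" is the HDBSCAN cluster membership probability; 1.0 minus this probability is reported as an approximate outlier score.
+// Combining it with the normalized distance to the cluster medoid favors code units that are both uncertain members and far from their cluster center.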
+
+ MATCH (codeUnit)
+ WHERE $projection_node_label IN labels(codeUnit)
+   AND codeUnit.clusteringHDBSCANNormalizedDistanceToMedoid IS NOT NULL
+   AND codeUnit.clusteringHDBSCANProbability IS NOT NULL
+  WITH collect(codeUnit) AS codeUnits
+      ,percentileDisc(codeUnit.clusteringHDBSCANNormalizedDistanceToMedoid, 0.90) AS distanceToMedoidThreshold
+      ,percentileDisc(codeUnit.clusteringHDBSCANProbability, 0.30) AS clusteringProbabilityThreshold
+UNWIND codeUnits AS codeUnit
+  WITH *
+ WHERE codeUnit.clusteringHDBSCANNormalizedDistanceToMedoid >= distanceToMedoidThreshold
+   AND codeUnit.clusteringHDBSCANProbability <= clusteringProbabilityThreshold
+OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit)
+  WITH *, artifact.name AS artifactName
+OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit)
+  WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName
+  WITH *, coalesce(artifactName, projectName) AS projectName
+ ORDER BY codeUnit.clusteringHDBSCANNormalizedDistanceToMedoid DESC, codeUnit.clusteringHDBSCANProbability ASC
+ LIMIT 10
+  WITH collect([codeUnit, projectName]) AS results
+UNWIND range(0, size(results) - 1) AS codeUnitIndex
+  WITH codeUnitIndex + 1 AS codeUnitIndex
+      ,results[codeUnitIndex][0] AS codeUnit
+      ,results[codeUnitIndex][1] AS projectName
+   SET codeUnit:Mark4TopAnomalyOutlier
+      ,codeUnit.anomalyOutlierRank = codeUnitIndex
+RETURN DISTINCT 
+       projectName
+      ,codeUnit.name AS shortCodeUnitName
+      ,coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName
+      ,codeUnit.clusteringHDBSCANNormalizedDistanceToMedoid AS clusteringNormalizedDistanceToMedoid
+      ,1.0 - codeUnit.clusteringHDBSCANProbability AS clusterApproximateOutlierScore
+      ,codeUnit.anomalyOutlierRank AS rank
\ No newline at end of file
diff --git a/domains/anomaly-detection/labels/AnomalyDetectionArchetypeRemoveLabels.cypher b/domains/anomaly-detection/labels/AnomalyDetectionArchetypeRemoveLabels.cypher
new file mode 100644
index 000000000..fc3dcc263
--- /dev/null
+++ b/domains/anomaly-detection/labels/AnomalyDetectionArchetypeRemoveLabels.cypher
@@ -0,0 +1,11 @@
+// Anomaly Detection Labels: Remove all anomaly archetype marker labels. Intended to be run before (re)setting them so that every run starts from a clean state.
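+// Removing the labels first keeps re-runs idempotent: previously labelled code units do not keep stale archetype labels.
+// REMOVE is a no-op for nodes that do not carry the label, so this query is safe to run repeatedly.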
+
+ MATCH (codeUnit)
+ WHERE $projection_node_label IN labels(codeUnit)
+REMOVE codeUnit:Mark4TopAnomalyAuthority
+REMOVE codeUnit:Mark4TopAnomalyBottleneck
+REMOVE codeUnit:Mark4TopAnomalyBridge
+REMOVE codeUnit:Mark4TopAnomalyHub
+REMOVE codeUnit:Mark4TopAnomalyOutlier
\ No newline at end of file

From 88710fa240fa4943859d151ca398554e0dd12ff1 Mon Sep 17 00:00:00 2001
From: JohT <7671054+JohT@users.noreply.github.com>
Date: Sat, 4 Oct 2025 20:10:00 +0200
Subject: [PATCH 09/13] Add betweenness centrality distribution

---
 .../anomalyDetectionFeaturePlots.py           | 49 ++++++----
 .../explore/AnomalyDetectionExploration.ipynb | 89 +++++++++++++++----
 2 files changed, 104 insertions(+), 34 deletions(-)

diff --git a/domains/anomaly-detection/anomalyDetectionFeaturePlots.py b/domains/anomaly-detection/anomalyDetectionFeaturePlots.py
index a2972a92c..da1c05b1f 100755
--- a/domains/anomaly-detection/anomalyDetectionFeaturePlots.py
+++ b/domains/anomaly-detection/anomalyDetectionFeaturePlots.py
@@ -378,37 +378,40 @@ def annotate_outliers(outliers: pd.DataFrame) -> None:
     plot.savefig(plot_file_path)
 
 
-def plot_clustering_coefficient_distribution(clustering_coefficients: pd.Series, title: str, plot_file_path: str) -> None:
+def plot_feature_distribution(feature_values: pd.Series, feature_name: str, title: str, plot_file_path: str) -> None:
     """
-    Plots the distribution of clustering coefficients.
-    
+    Plots the distribution of a feature's values.
+    
     Parameters
     ----------
-    clustering_coefficients : pd.Series
-        Series containing clustering coefficient values.
+    feature_values : pd.Series
+        Series containing feature values.
+    feature_name : str
+        Name of the feature, used as x-axis label.
     """
-    if clustering_coefficients.empty:
+    if feature_values.empty:
        print("No data available to plot.")
        return
 
     plot.figure(figsize=(10, 6))
     plot.figure(figsize=(10, 6))
-    plot.hist(clustering_coefficients, bins=40, color='blue', alpha=0.7, edgecolor='black')
+    plot.hist(feature_values, bins=40, color='blue', alpha=0.7, edgecolor='black')
     plot.title(title, pad=20)
-    plot.xlabel('Clustering Coefficient')
+    plot.xlabel(feature_name)
     plot.ylabel('Frequency')
-    plot.xlim(left=clustering_coefficients.min(), right=clustering_coefficients.max())
+    plot.xlim(left=feature_values.min(), right=feature_values.max())
     # plot.yscale('log') # Use logarithmic scale for better visibility of differences
     plot.grid(True)
-    plot.tight_layout()
 
-    mean = clustering_coefficients.mean()
-    standard_deviation = clustering_coefficients.std()
+    mean = feature_values.mean()
+    standard_deviation = feature_values.std()
 
     # Vertical line for the mean
     plot_standard_deviation_lines('red', mean, standard_deviation, standard_deviation_factor=0)
     # Vertical line for 1 x standard deviations + mean (=z-score of 1)
-    plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=1)
+    plot_standard_deviation_lines('orange', mean, standard_deviation, standard_deviation_factor=1)
+    # Vertical line for 2 x standard deviations + mean (=z-score of 2)
+    plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=2)
 
     plot.tight_layout()
     plot.savefig(plot_file_path)
@@ -829,10 +832,18 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
     plot_file_path=get_file_path(f"{plot_prefix}_PageRank_Minus_ArticleRank_Distribution", parameters)
 )
 
-plot_clustering_coefficient_distribution(
-    data['clusteringCoefficient'],
-    title=f"{plot_prefix} distribution of clustering coefficients",
-    
plot_file_path=get_file_path(f"{plot_prefix}_ClusteringCoefficient_distribution", parameters) +plot_feature_distribution( + feature_values=data['clusteringCoefficient'], + feature_name='Clustering Coefficient', + title=f"{plot_prefix} clustering coefficient distribution", + plot_file_path=get_file_path("ClusteringCoefficient_distribution", parameters) +) + +plot_feature_distribution( + feature_values=data['betweenness'], + feature_name='Betweenness', + title=f"{plot_prefix} betweenness centrality distribution", + plot_file_path=get_file_path("BetweennessCentrality_distribution", parameters) ) plot_clustering_coefficient_vs_page_rank( @@ -848,8 +859,8 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict: print(f"anomalyDetectionFeaturePlots: Less than 20 clusters: {overall_cluster_count}. Only one plot containing all clusters will be created.") plot_clusters( clustering_visualization_dataframe=data, - title=f"{plot_prefix} all clusters overall (less than 20)", - plot_file_path=get_file_path(f"{plot_prefix}_Clusters_Overall", parameters) + title=f"{plot_prefix} all clusters overall", + plot_file_path=get_file_path("Clusters_Overall", parameters) ) else: print(f"anomalyDetectionFeaturePlots: More than 20 clusters: {overall_cluster_count}. Different plots focussing on different features like cluster size will be created.") diff --git a/domains/anomaly-detection/explore/AnomalyDetectionExploration.ipynb b/domains/anomaly-detection/explore/AnomalyDetectionExploration.ipynb index 3bf3531d2..e7511b277 100644 --- a/domains/anomaly-detection/explore/AnomalyDetectionExploration.ipynb +++ b/domains/anomaly-detection/explore/AnomalyDetectionExploration.ipynb @@ -640,47 +640,70 @@ { "cell_type": "code", "execution_count": null, - "id": "ed900c59", + "id": "d7b587c9", "metadata": {}, "outputs": [], "source": [ - "def plot_clustering_coefficient_distribution(clustering_coefficients: pd.Series, title_prefix: str) -> None:\n", + "def plot_feature_distribution(feature_values: pd.Series, feature_name: str, title_prefix: str) -> None:\n", " \"\"\"\n", - " Plots the distribution of clustering coefficients.\n", + " Plots the distribution of feature's values.\n", " \n", " Parameters\n", " ----------\n", - " clustering_coefficients : pd.Series\n", - " Series containing clustering coefficient values.\n", + " feature_values : pd.Series\n", + " Series containing feature values.\n", " text_prefix: str\n", " Text at the beginning of the title\n", " \"\"\"\n", - " if clustering_coefficients.empty:\n", + " if feature_values.empty:\n", " print(\"No data available to plot.\")\n", " return\n", "\n", " plot.figure(figsize=(10, 6))\n", " plot.figure(figsize=(10, 6))\n", - " plot.hist(clustering_coefficients, bins=40, color='blue', alpha=0.7, edgecolor='black')\n", - " plot.title(f\"{title_prefix} Distribution of Clustering Coefficients\", pad=20)\n", - " plot.xlabel('Clustering Coefficient')\n", + " plot.hist(feature_values, bins=40, color='blue', alpha=0.7, edgecolor='black')\n", + " plot.title(f\"{title_prefix} Distribution of the feature '{feature_name}'\", pad=20)\n", + " plot.xlabel(feature_name)\n", " plot.ylabel('Frequency')\n", - " plot.xlim(left=clustering_coefficients.min(), right=clustering_coefficients.max())\n", + " plot.xlim(left=feature_values.min(), right=feature_values.max())\n", " # plot.yscale('log') # Use logarithmic scale for better visibility of differences\n", " plot.grid(True)\n", " plot.tight_layout()\n", "\n", - " mean = clustering_coefficients.mean()\n", - " standard_deviation = 
clustering_coefficients.std()\n", + " mean = feature_values.mean()\n", + " standard_deviation = feature_values.std()\n", "\n", " # Vertical line for the mean\n", " plot_standard_deviation_lines('red', mean, standard_deviation, standard_deviation_factor=0)\n", " # Vertical line for 1 x standard deviations + mean (=z-score of 1)\n", - " plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=1)\n", + " plot_standard_deviation_lines('orange', mean, standard_deviation, standard_deviation_factor=1)\n", + " # Vertical line for 2 x standard deviations + mean (=z-score of 2)\n", + " plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=2)\n", "\n", " plot.show()" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed900c59", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_clustering_coefficient_distribution(clustering_coefficients: pd.Series, title_prefix: str) -> None:\n", + " \"\"\"\n", + " Plots the distribution of clustering coefficients.\n", + " \n", + " Parameters\n", + " ----------\n", + " clustering_coefficients : pd.Series\n", + " Series containing clustering coefficient values.\n", + " text_prefix: str\n", + " Text at the beginning of the title\n", + " \"\"\"\n", + " plot_feature_distribution(clustering_coefficients, 'Clustering Coefficient', title_prefix)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -688,7 +711,7 @@ "metadata": {}, "outputs": [], "source": [ - "plot_clustering_coefficient_distribution(java_package_features['clusteringCoefficient'], title_prefix=\"Java Package\")" + "plot_feature_distribution(java_package_features['clusteringCoefficient'], 'Clustering Coefficient', title_prefix=\"Java Package\")" ] }, { @@ -798,6 +821,24 @@ ")" ] }, + { + "cell_type": "markdown", + "id": "38cad9cb", + "metadata": {}, + "source": [ + "### 1.2b Betweenness Distribution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01c0ea0d", + "metadata": {}, + "outputs": [], + "source": [ + "plot_feature_distribution(java_package_features['betweenness'], 'Betweenness', title_prefix=\"Java Package\")" + ] + }, { "cell_type": "markdown", "id": "630f5e4b", @@ -1402,7 +1443,7 @@ "metadata": {}, "outputs": [], "source": [ - "plot_clustering_coefficient_distribution(java_type_features['clusteringCoefficient'], title_prefix=\"Java Package\")" + "plot_feature_distribution(java_type_features['clusteringCoefficient'], 'Clustering Coefficient', title_prefix=\"Java Type\")" ] }, { @@ -1421,6 +1462,24 @@ ")" ] }, + { + "cell_type": "markdown", + "id": "dfb7560d", + "metadata": {}, + "source": [ + "### 2.2b Betweenness Distribution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1082ef81", + "metadata": {}, + "outputs": [], + "source": [ + "plot_feature_distribution(java_type_features['betweenness'], 'Betweenness', title_prefix=\"Java Type\")" + ] + }, { "cell_type": "markdown", "id": "69256999", From d075eafc75b261365238fcf2d16ab7ec5586cbdc Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Mon, 29 Sep 2025 16:53:30 +0200 Subject: [PATCH 10/13] Create sub directories for each anomaly detected code unit --- .../anomaly-detection/anomalyDetectionCsv.sh | 34 ++++++++------ .../anomalyDetectionFeaturePlots.py | 46 +++++++++---------- .../anomalyDetectionPython.sh | 10 ++-- .../tunedAnomalyDetectionExplained.py | 34 +++++++------- 4 files changed, 67 insertions(+), 57 deletions(-) diff --git 
a/domains/anomaly-detection/anomalyDetectionCsv.sh b/domains/anomaly-detection/anomalyDetectionCsv.sh index 1d4452135..ca4be0a2b 100755 --- a/domains/anomaly-detection/anomalyDetectionCsv.sh +++ b/domains/anomaly-detection/anomalyDetectionCsv.sh @@ -73,18 +73,22 @@ anomaly_detection_queries() { local language language=$( extractQueryParameter "projection_language" "${@}" ) - + + # Within the absolute (full) report directory for anomaly detection, create a sub directory for every detailed type (Java_Package, Java_Type,...) + local detail_report_directory="${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}" + mkdir -p "${detail_report_directory}" + echo "anomalyDetectionCsv: $(date +'%Y-%m-%dT%H:%M:%S%z') Executing Queries for ${nodeLabel} nodes..." - execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPotentialImbalancedRoles.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_PotentialImbalancedRoles.csv" - execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPotentialOverEngineerOrIsolated.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_PotentialOverEngineerOrIsolated.csv" + execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPotentialImbalancedRoles.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_PotentialImbalancedRoles.csv" + execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPotentialOverEngineerOrIsolated.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_PotentialOverEngineerOrIsolated.csv" - execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionHiddenBridgeNodes.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_HiddenBridgeNodes.csv" - execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPopularBottlenecks.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_PopularBottlenecks.csv" - execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionSilentCoordinators.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_SilentCoordinators.csv" - execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionOverReferencesUtilities.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_OverReferencesUtilities.csv" - execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionFragileStructuralBridges.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_FragileStructuralBridges.csv" - execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionDependencyHungryOrchestrators.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_DependencyHungryOrchestrators.csv" - execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionUnexpectedCentralNodes.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_UnexpectedCentralNodes.csv" + execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionHiddenBridgeNodes.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_HiddenBridgeNodes.csv" + execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPopularBottlenecks.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_PopularBottlenecks.csv" + execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionSilentCoordinators.cypher" "${@}" > 
"${detail_report_directory}/AnomalyDetection_SilentCoordinators.csv" + execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionOverReferencesUtilities.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_OverReferencesUtilities.csv" + execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionFragileStructuralBridges.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_FragileStructuralBridges.csv" + execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionDependencyHungryOrchestrators.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_DependencyHungryOrchestrators.csv" + execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionUnexpectedCentralNodes.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_UnexpectedCentralNodes.csv" } # Label code units with top anomalies by archetype. @@ -99,11 +103,15 @@ anomaly_detection_labels() { local language language=$( extractQueryParameter "projection_language" "${@}" ) + # Within the absolute (full) report directory for anomaly detection, create a sub directory for every detailed type (Java_Package, Java_Type,...) + local detail_report_directory="${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}" + mkdir -p "${detail_report_directory}" + echo "anomalyDetectionCsv: $(date +'%Y-%m-%dT%H:%M:%S%z') Labelling ${language} ${nodeLabel} anomalies..." execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeRemoveLabels.cypher" "${@}" - execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeAuthority.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyArchetypeTopAuthority.csv" - execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBottleneck.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyArchetypeTopBottleneck.csv" - execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeHub.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyArchetypeTopHub.csv" + execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeAuthority.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTopAuthority.csv" + execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBottleneck.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTopBottleneck.csv" + execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeHub.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTopHub.csv" # The following two label types require Python scripts to run first and are skipped here intentionally: # execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBridge.cypher" "${@}" # execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeOutlier.cypher" "${@}" diff --git a/domains/anomaly-detection/anomalyDetectionFeaturePlots.py b/domains/anomaly-detection/anomalyDetectionFeaturePlots.py index da1c05b1f..3517832c3 100755 --- a/domains/anomaly-detection/anomalyDetectionFeaturePlots.py +++ b/domains/anomaly-detection/anomalyDetectionFeaturePlots.py @@ -98,7 +98,7 @@ def __is_code_language_available(self) -> bool: def __get_projection_language(self) -> str: return self.query_parameters_["projection_language"] if self.__is_code_language_available() else "" - def get_plot_prefix(self) -> str: + def get_title_prefix(self) -> str: if self.__is_code_language_available(): return self.__get_projection_language() + " " + 
self.__get_projection_node_label() return self.__get_projection_node_label() @@ -815,7 +815,7 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict: # ------------------------------------------------------------------------------------------------------------ parameters = parse_input_parameters() -plot_prefix = parameters.get_plot_prefix() +title_prefix = parameters.get_title_prefix() report_directory = parameters.get_report_directory() driver = get_graph_database_driver() @@ -828,21 +828,21 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict: data['pageRank'], data['articleRank'], data['shortCodeUnitName'], - title=f"{plot_prefix} distribution of PageRank - ArticleRank differences", - plot_file_path=get_file_path(f"{plot_prefix}_PageRank_Minus_ArticleRank_Distribution", parameters) + title=f"{title_prefix} distribution of PageRank - ArticleRank differences", + plot_file_path=get_file_path("PageRank_Minus_ArticleRank_Distribution", parameters) ) plot_feature_distribution( feature_values=data['clusteringCoefficient'], feature_name='Clustering Coefficient', - title=f"{plot_prefix} clustering coefficient distribution", + title=f"{title_prefix} clustering coefficient distribution", plot_file_path=get_file_path("ClusteringCoefficient_distribution", parameters) ) plot_feature_distribution( feature_values=data['betweenness'], feature_name='Betweenness', - title=f"{plot_prefix} betweenness centrality distribution", + title=f"{title_prefix} betweenness centrality distribution", plot_file_path=get_file_path("BetweennessCentrality_distribution", parameters) ) @@ -851,15 +851,15 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict: data['pageRank'], data['shortCodeUnitName'], data['clusterNoise'], - title=f"{plot_prefix} clustering coefficient versus PageRank", - plot_file_path=get_file_path(f"{plot_prefix}_ClusteringCoefficient_versus_PageRank", parameters) + title=f"{title_prefix} clustering coefficient versus PageRank", + plot_file_path=get_file_path("ClusteringCoefficient_versus_PageRank", parameters) ) if (overall_cluster_count < 20): print(f"anomalyDetectionFeaturePlots: Less than 20 clusters: {overall_cluster_count}. 
Only one plot containing all clusters will be created.") plot_clusters( clustering_visualization_dataframe=data, - title=f"{plot_prefix} all clusters overall", + title=f"{title_prefix} all clusters overall", plot_file_path=get_file_path("Clusters_Overall", parameters) ) else: @@ -869,8 +869,8 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict: ) plot_clusters( clustering_visualization_dataframe=clusters_by_largest_size, - title=f"{plot_prefix} clusters with the largest size", - plot_file_path=get_file_path(f"{plot_prefix}_Clusters_largest_size", parameters) + title=f"{title_prefix} clusters with the largest size", + plot_file_path=get_file_path("Clusters_largest_size", parameters) ) clusters_by_largest_max_radius = get_clusters_by_criteria( @@ -878,8 +878,8 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict: ) plot_clusters( clustering_visualization_dataframe=clusters_by_largest_max_radius, - title=f"{plot_prefix} clusters with the largest max radius", - plot_file_path=get_file_path(f"{plot_prefix}_Clusters_largest_max_radius", parameters) + title=f"{title_prefix} clusters with the largest max radius", + plot_file_path=get_file_path("Clusters_largest_max_radius", parameters) ) clusters_by_largest_average_radius = get_clusters_by_criteria( @@ -887,39 +887,39 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict: ) plot_clusters( clustering_visualization_dataframe=clusters_by_largest_average_radius, - title=f"{plot_prefix} clusters with the largest average radius", - plot_file_path=get_file_path(f"{plot_prefix}_Clusters_largest_average_radius", parameters) + title=f"{title_prefix} clusters with the largest average radius", + plot_file_path=get_file_path("Clusters_largest_average_radius", parameters) ) plot_clusters_probabilities( clustering_visualization_dataframe=data, - title=f"{plot_prefix} clustering probabilities (red=high uncertainty)", - plot_file_path=get_file_path(f"{plot_prefix}_Cluster_probabilities", parameters) + title=f"{title_prefix} clustering probabilities (red=high uncertainty)", + plot_file_path=get_file_path("Cluster_probabilities", parameters) ) plot_cluster_noise( clustering_visualization_dataframe=data, - title=f"{plot_prefix} clustering noise points that are surprisingly central (red) or popular (size)", + title=f"{title_prefix} clustering noise points that are surprisingly central (red) or popular (size)", size_column_name='degree', color_column_name='pageRank', - plot_file_path=get_file_path(f"{plot_prefix}_ClusterNoise_highly_central_and_popular", parameters) + plot_file_path=get_file_path("ClusterNoise_highly_central_and_popular", parameters) ) plot_cluster_noise( clustering_visualization_dataframe=data, - title=f"{plot_prefix} clustering noise points that bridge flow (red) and are poorly integrated (size)", + title=f"{title_prefix} clustering noise points that bridge flow (red) and are poorly integrated (size)", size_column_name='inverseClusteringCoefficient', color_column_name='betweenness', - plot_file_path=get_file_path(f"{plot_prefix}_ClusterNoise_poorly_integrated_bridges", parameters), + plot_file_path=get_file_path("ClusterNoise_poorly_integrated_bridges", parameters), downscale_normal_sizes=0.4 ) plot_cluster_noise( clustering_visualization_dataframe=data, - title=f"{plot_prefix} clustering noise points with role inversion (size) possibly violating layering or dependency direction (red)", + title=f"{title_prefix} clustering noise points with role inversion (size) possibly violating layering or dependency direction (red)", 
size_column_name='pageToArticleRankDifference', color_column_name='betweenness', - plot_file_path=get_file_path(f"{plot_prefix}_ClusterNoise_role_inverted_bridges", parameters) + plot_file_path=get_file_path("ClusterNoise_role_inverted_bridges", parameters) ) driver.close() diff --git a/domains/anomaly-detection/anomalyDetectionPython.sh b/domains/anomaly-detection/anomalyDetectionPython.sh index 358fcae8e..ef4b60f3d 100755 --- a/domains/anomaly-detection/anomalyDetectionPython.sh +++ b/domains/anomaly-detection/anomalyDetectionPython.sh @@ -125,6 +125,10 @@ anomaly_detection_using_python() { echo "anomalyDetectionPipeline: $(date +'%Y-%m-%dT%H:%M:%S%z') Executing Python scripts for ${language} ${nodeLabel} nodes..." + # Within the absolute (full) report directory for anomaly detection, create a sub directory for every detailed type (Java_Package, Java_Type,...) + local detail_report_directory="${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}" + mkdir -p "${detail_report_directory}" + # Get tuned Leiden communities as a reference to tune clustering time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedLeidenCommunityDetection.py" "${@}" ${verboseMode} # Tuned Fast Random Projection and tuned HDBSCAN clustering @@ -132,11 +136,11 @@ anomaly_detection_using_python() { # Reduce the dimensionality of the node embeddings down to 2D for visualization using UMAP time "${ANOMALY_DETECTION_SCRIPT_DIR}/umap2dNodeEmbeddings.py" "${@}" ${verboseMode} # Plot the results with clustering and UMAP embeddings to reveal anomalies in rare feature combinations - time "${ANOMALY_DETECTION_SCRIPT_DIR}/anomalyDetectionFeaturePlots.py" "${@}" "--report_directory" "${FULL_REPORT_DIRECTORY}" ${verboseMode} + time "${ANOMALY_DETECTION_SCRIPT_DIR}/anomalyDetectionFeaturePlots.py" "${@}" "--report_directory" "${detail_report_directory}" ${verboseMode} # Run an unsupervised anomaly detection algorithm including tuning and explainability - time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedAnomalyDetectionExplained.py" "${@}" "--report_directory" "${FULL_REPORT_DIRECTORY}" ${verboseMode} + time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedAnomalyDetectionExplained.py" "${@}" "--report_directory" "${detail_report_directory}" ${verboseMode} # Query Results: Output all collected features into a CSV file. - execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeatures.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}AnomalyDetection_Features.csv" + execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeatures.cypher" "${@}" > "${detail_report_directory}/Anomaly_Features.csv" } # Label code units with top anomalies by archetype. diff --git a/domains/anomaly-detection/tunedAnomalyDetectionExplained.py b/domains/anomaly-detection/tunedAnomalyDetectionExplained.py index 5210e97c7..722aa997d 100755 --- a/domains/anomaly-detection/tunedAnomalyDetectionExplained.py +++ b/domains/anomaly-detection/tunedAnomalyDetectionExplained.py @@ -10,7 +10,6 @@ # - Provide the password for Neo4j in the environment variable "NEO4J_INITIAL_PASSWORD". # - Requires "tunedLeidenCommunityDetection.py", "tunedNodeEmbeddingClustering.py" and "umap2dNodeEmbedding.py" to be executed before this script to provide the necessary data. 
-from re import M
 import typing
 
 import numpy.typing as numpy_typing
@@ -95,9 +94,9 @@ def from_input_parameters(cls, input_parameters: typing.Dict[str, str], report_d
 
     @classmethod
     def example(cls):
-        return cls(dict(
-            projection_node_label="Package",
-        ))
+        return cls({
+            "projection_node_label": "Package",
+        })
 
     def get_query_parameters(self) -> typing.Dict[str, str]:
         return self.query_parameters_.copy() # copy enforces immutability
@@ -111,7 +110,7 @@ def __is_code_language_available(self) -> bool:
     def __get_projection_language(self) -> str:
         return self.query_parameters_["projection_language"] if self.__is_code_language_available() else ""
 
-    def get_plot_prefix(self) -> str:
+    def get_title_prefix(self) -> str:
         if self.__is_code_language_available():
             return self.__get_projection_language() + " " + self.get_projection_node_label()
         return self.get_projection_node_label()
@@ -484,7 +483,7 @@ def objective(trial) -> float:
     study.optimize(objective, n_trials=number_of_trials, timeout=optimization_timeout_in_seconds)
 
     # Output tuning results
-    print(f"Best Isolation & Random Forest parameters for {parameters.get_plot_prefix()} after {len(study.trials)}/{number_of_trials} trials with best #{study.best_trial.number} (Optuna):", study.best_params)
+    print(f"Best Isolation & Random Forest parameters for {parameters.get_title_prefix()} after {len(study.trials)}/{number_of_trials} trials with best #{study.best_trial.number} (Optuna):", study.best_params)
     if parameters.is_verbose():
         output_optuna_tuning_results(study, study.study_name)
 
@@ -818,7 +817,7 @@ def plot_all_shap_explained_local_feature_importance(
             prepared_features=prepared_features,
             feature_names=feature_names,
             title=f"{title_prefix} \"{row[code_unit_name_column]}\" anomaly #{index} explained",
-            plot_file_path=get_file_path(f"{title_prefix}_Anomaly_{index}_shap_explanation", parameters),
+            plot_file_path=get_file_path(f"Anomaly_{index}_shap_explanation", parameters),
         )
 
 
@@ -1018,17 +1017,17 @@ def output_top_shap_explained_global_features_as_markdown_table(
 ]
 
 parameters = parse_input_parameters()
-plot_prefix = parameters.get_plot_prefix()
+title_prefix = parameters.get_title_prefix()
 driver = get_graph_database_driver()
 
 features = query_data(parameters)
 if parameters.is_verbose():
-    print("tunedAnomalyDetectionExplained: Features for anomaly detection of {plot_type} (first 5 rows):", features.head(5))
+    print(f"tunedAnomalyDetectionExplained: Features for anomaly detection of {title_prefix} (first 5 rows):", features.head(5))
 
 validate_data(features)
 
 if features.empty:
-    print(f"tunedAnomalyDetectionExplained: Warning: No data. Skipping Anomaly Detection for {plot_prefix}.")
+    print(f"tunedAnomalyDetectionExplained: Warning: No data. 
Skipping Anomaly Detection for {title_prefix}.") sys.exit(0) features_to_standardize = features.columns.drop(features_for_visualization_to_exclude_from_training + ['embedding']).to_list() @@ -1044,7 +1043,6 @@ def output_top_shap_explained_global_features_as_markdown_table( features = add_anomaly_detection_results_to_features(features, anomaly_detection_results) if parameters.is_verbose(): - # TODO Output CSV with anomaly detection results print("tunedAnomalyDetectionExplained: Top 10 anomalies:") print(get_top_10_anomalies(features).reset_index(drop=True)) print("tunedAnomalyDetectionExplained: Top 10 non-anomalies:") @@ -1053,7 +1051,7 @@ def output_top_shap_explained_global_features_as_markdown_table( plot_anomalies( features_to_visualize=features, title_prefix="Java Package Anomalies", - plot_file_path=get_file_path(f"{plot_prefix}_Anomalies", parameters) + plot_file_path=get_file_path("Anomalies", parameters) ) if parameters.is_verbose(): @@ -1071,8 +1069,8 @@ def output_top_shap_explained_global_features_as_markdown_table( shap_anomaly_values=explanation_results.shap_anomaly_values, prepared_features=features_prepared, feature_names=feature_names, - title_prefix=plot_prefix, - plot_file_path=get_file_path(f"{plot_prefix}_Anomaly_feature_importance_explained", parameters) + title_prefix=title_prefix, + plot_file_path=get_file_path("Anomaly_feature_importance_explained", parameters) ) plot_all_shap_explained_local_feature_importance( @@ -1081,15 +1079,15 @@ def output_top_shap_explained_global_features_as_markdown_table( prepared_features=features_prepared, feature_names=feature_names, parameters=parameters, - title_prefix=plot_prefix + title_prefix=title_prefix ) plot_shap_explained_top_10_feature_dependence( shap_anomaly_values=explanation_results.shap_anomaly_values, prepared_features=features_prepared, feature_names=feature_names, - title_prefix=plot_prefix, - plot_file_path=get_file_path(f"{plot_prefix}_Anomaly_feature_dependence_explained", parameters) + title_prefix=title_prefix, + plot_file_path=get_file_path("Anomaly_feature_dependence_explained", parameters) ) add_top_shap_features_to_anomalies( @@ -1107,7 +1105,7 @@ def output_top_shap_explained_global_features_as_markdown_table( output_top_shap_explained_global_features_as_markdown_table( shap_anomaly_values=explanation_results.shap_anomaly_values, feature_names=feature_names, - output_file_path=get_file_path(f"{plot_prefix}_Top_anomaly_features", parameters, 'md') + output_file_path=get_file_path("Top_anomaly_features", parameters, 'md') ) if parameters.is_verbose(): From 41971852eeebfaf1c31f2ba2554c3e6ad00636a1 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Mon, 6 Oct 2025 16:09:22 +0200 Subject: [PATCH 11/13] Fix duplicate dash character in report artifact name --- .github/workflows/public-analyze-code-graph.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/public-analyze-code-graph.yml b/.github/workflows/public-analyze-code-graph.yml index fc45775a5..91105c4b2 100644 --- a/.github/workflows/public-analyze-code-graph.yml +++ b/.github/workflows/public-analyze-code-graph.yml @@ -92,7 +92,7 @@ jobs: run: echo "Please specify either the input parameter 'artifacts-upload-name' or 'sources-upload-name'."; exit 1 - name: Assemble ENVIRONMENT_INFO - run: echo "ENVIRONMENT_INFO=-java-${{ matrix.java }}-python-${{ matrix.python }}-miniforge-${{ matrix.miniforge }}" >> $GITHUB_ENV + run: echo "ENVIRONMENT_INFO=java-${{ matrix.java }}-python-${{ matrix.python 
}}-miniforge-${{ matrix.miniforge }}" >> $GITHUB_ENV

       - name: (Code Analysis Setup) Checkout code-graph-analysis-pipeline
         uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5

From 2ff41da403ad01b192755892c564ef99cfed5fb5 Mon Sep 17 00:00:00 2001
From: JohT <7671054+JohT@users.noreply.github.com>
Date: Sun, 5 Oct 2025 20:48:43 +0200
Subject: [PATCH 12/13] Support Markdown reports

---
 .../anomaly-detection/anomalyDetectionCsv.sh  |  1 +
 .../anomalyDetectionMarkdown.sh               | 27 ++++++++++
 .../anomalyDetectionPython.sh                 |  1 +
 scripts/reports/compilations/AllReports.sh    |  3 +-
 .../reports/compilations/MarkdownReports.sh   | 52 +++++++++++++++++++
 5 files changed, 83 insertions(+), 1 deletion(-)
 create mode 100755 domains/anomaly-detection/anomalyDetectionMarkdown.sh
 create mode 100755 scripts/reports/compilations/MarkdownReports.sh

diff --git a/domains/anomaly-detection/anomalyDetectionCsv.sh b/domains/anomaly-detection/anomalyDetectionCsv.sh
index ca4be0a2b..c49ef2936 100755
--- a/domains/anomaly-detection/anomalyDetectionCsv.sh
+++ b/domains/anomaly-detection/anomalyDetectionCsv.sh
@@ -3,6 +3,7 @@
 # Pipeline that coordinates anomaly detection using the Graph Data Science Library of Neo4j.
 # It requires an already running Neo4j graph database with already scanned and analyzed artifacts.
 # The results will be written into the sub directory reports/anomaly-detection.
+# Dynamically triggered by "CsvReports.sh".

 # Note that "scripts/prepareAnalysis.sh" is required to run prior to this script.

diff --git a/domains/anomaly-detection/anomalyDetectionMarkdown.sh b/domains/anomaly-detection/anomalyDetectionMarkdown.sh
new file mode 100755
index 000000000..2aef586c0
--- /dev/null
+++ b/domains/anomaly-detection/anomalyDetectionMarkdown.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash

+
+# This script is dynamically triggered by "MarkdownReports.sh" when the report type "All" or "Markdown" is enabled.
+# It is designed as an entry point and delegates the execution to the dedicated "anomalyDetectionSummary.sh" script that does the "heavy lifting".
+
+# Note that "scripts/prepareAnalysis.sh" is required to run prior to this script.
+
+# Requires anomalyDetectionSummary.sh
+
+# Fail on any error ("-e" = exit on first error, "-o pipefail" exits on errors within piped commands)
+set -o errexit -o pipefail
+
+# Overrideable Constants (defaults also defined in sub scripts)
+REPORTS_DIRECTORY=${REPORTS_DIRECTORY:-"reports"}
+
+## Get this "domains/anomaly-detection" directory if not already set
+# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution.
+# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
+# This way non-standard tools like readlink aren't needed.
+ANOMALY_DETECTION_SCRIPT_DIR=${ANOMALY_DETECTION_SCRIPT_DIR:-$(CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)}
+# echo "anomalyDetectionMarkdown: ANOMALY_DETECTION_SCRIPT_DIR=${ANOMALY_DETECTION_SCRIPT_DIR}"
+
+# Get the "summary" directory by taking the path of this script and selecting "summary".
+ANOMALY_DETECTION_SUMMARY_DIR=${ANOMALY_DETECTION_SUMMARY_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/summary"} # Contains everything (scripts, queries, templates) to create the Markdown summary report for anomaly detection
+
+# Delegate the execution to the responsible script. 
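+# Sourcing (instead of executing) the summary script below runs it in the current shell,
+# so overridable constants like REPORTS_DIRECTORY stay shared between both scripts.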
+source "${ANOMALY_DETECTION_SUMMARY_DIR}/anomalyDetectionSummary.sh" \ No newline at end of file diff --git a/domains/anomaly-detection/anomalyDetectionPython.sh b/domains/anomaly-detection/anomalyDetectionPython.sh index ef4b60f3d..c4216a628 100755 --- a/domains/anomaly-detection/anomalyDetectionPython.sh +++ b/domains/anomaly-detection/anomalyDetectionPython.sh @@ -3,6 +3,7 @@ # Pipeline that coordinates anomaly detection using the Graph Data Science Library of Neo4j. # It requires an already running Neo4j graph database with already scanned and analyzed artifacts. # The results will be written into the sub directory reports/anomaly-detection. +# Dynamically triggered by "PythonReports.sh". # Note that "scripts/prepareAnalysis.sh" is required to run prior to this script. diff --git a/scripts/reports/compilations/AllReports.sh b/scripts/reports/compilations/AllReports.sh index d195c9de7..75e5d3d17 100755 --- a/scripts/reports/compilations/AllReports.sh +++ b/scripts/reports/compilations/AllReports.sh @@ -26,4 +26,5 @@ echo "AllReports: REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DI source "${REPORT_COMPILATIONS_SCRIPT_DIR}/CsvReports.sh" source "${REPORT_COMPILATIONS_SCRIPT_DIR}/JupyterReports.sh" source "${REPORT_COMPILATIONS_SCRIPT_DIR}/PythonReports.sh" -source "${REPORT_COMPILATIONS_SCRIPT_DIR}/VisualizationReports.sh" \ No newline at end of file +source "${REPORT_COMPILATIONS_SCRIPT_DIR}/VisualizationReports.sh" +source "${REPORT_COMPILATIONS_SCRIPT_DIR}/MarkdownReports.sh" \ No newline at end of file diff --git a/scripts/reports/compilations/MarkdownReports.sh b/scripts/reports/compilations/MarkdownReports.sh new file mode 100755 index 000000000..2b079258e --- /dev/null +++ b/scripts/reports/compilations/MarkdownReports.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash + +# Runs all Markdown report scripts (no Chromium required, no Python required). +# It only considers scripts in the "reports" and "domains" directories and their sub directories (overridable with REPORTS_SCRIPT_DIR and DOMAINS_DIRECTORY). + +# Requires reports/*.sh + +# Fail on any error ("-e" = exit on first error, "-o pipefail" exist on errors within piped commands) +set -o errexit -o pipefail + +# Overrideable Constants (defaults also defined in sub scripts) +LOG_GROUP_START=${LOG_GROUP_START:-"::group::"} # Prefix to start a log group. Defaults to GitHub Actions log group start command. +LOG_GROUP_END=${LOG_GROUP_END:-"::endgroup::"} # Prefix to end a log group. Defaults to GitHub Actions log group end command. + +## Get this "scripts/reports/compilations" directory if not already set. +# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution. +# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes. +# This way non-standard tools like readlink aren't needed. +REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} +echo "MarkdownReports: REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR}" + +REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR:-$(dirname -- "${REPORT_COMPILATIONS_SCRIPT_DIR}")} +echo "MarkdownReports: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}" + +SCRIPTS_DIR=${SCRIPTS_DIR:-$(dirname -- "${REPORTS_SCRIPT_DIR}")} +echo "MarkdownReports: SCRIPTS_DIR=${SCRIPTS_DIR}" + +# Get the "domains" directory that contains analysis and report scripts by functionality. 
+DOMAINS_DIRECTORY=${DOMAINS_DIRECTORY:-"${REPORTS_SCRIPT_DIR}/../../domains"}
+echo "MarkdownReports: DOMAINS_DIRECTORY=${DOMAINS_DIRECTORY}"
+
+# Run all Markdown report scripts (filename ending with "Markdown.sh") in the REPORTS_SCRIPT_DIR and DOMAINS_DIRECTORY directories.
+for directory in "${REPORTS_SCRIPT_DIR}" "${DOMAINS_DIRECTORY}"; do
+    if [ ! -d "${directory}" ]; then
+        echo "MarkdownReports: Error: Directory ${directory} does not exist. Please check your REPORTS_SCRIPT_DIR and DOMAINS_DIRECTORY settings."
+        exit 1
+    fi
+
+    # Run all Markdown report scripts for the selected directory.
+    find "${directory}" -type f -name "*Markdown.sh" | sort | while read -r report_script_file; do
+        report_script_filename=$(basename -- "${report_script_file}");
+        report_script_filename="${report_script_filename%.*}" # Remove file extension
+
+        echo "${LOG_GROUP_START}Create Markdown Report ${report_script_filename}";
+        echo "MarkdownReports: $(date +'%Y-%m-%dT%H:%M:%S%z') Starting ${report_script_filename}...";
+
+        source "${report_script_file}"
+
+        echo "MarkdownReports: $(date +'%Y-%m-%dT%H:%M:%S%z') Finished ${report_script_filename}";
+        echo "${LOG_GROUP_END}";
+    done
+done

From c545b3863d0bdeb682a57d18def172460e9edd43 Mon Sep 17 00:00:00 2001
From: JohT <7671054+JohT@users.noreply.github.com>
Date: Mon, 8 Sep 2025 16:09:16 +0200
Subject: [PATCH 13/13] Add anomaly detection Markdown summary report

---
 .../anomaly-detection/anomalyDetectionCsv.sh  |   6 +
 .../AnomaliesDeepDiveArchetypes.cypher        |  27 ++
 .../summary/AnomaliesDeepDiveOverview.cypher  |  20 ++
 .../summary/AnomaliesInTotal.cypher           |  22 ++
 .../AnomaliesPerAbstractionLayer.cypher       |  30 +++
 .../AnomalyDeepDiveTopAnomalies.cypher        |  41 +++
 .../summary/anomalyDetectionSummary.sh        | 197 ++++++++++++++
 .../summary/report.template.md                | 247 ++++++++++++++++++
 .../summary/report_deep_dive.template.md      |  33 +++
 ...report_deep_dive_anomaly_plots.template.md |  24 ++
 ..._deep_dive_cluster_focus_plots.template.md |   7 +
 ..._deep_dive_cluster_noise_plots.template.md |  13 +
 ...deep_dive_cluster_overall_plot.template.md |   5 +
 ...report_deep_dive_feature_plots.template.md |  13 +
 ...port_no_anomaly_detection_data.template.md |   1 +
 scripts/cleanupAfterReportGeneration.sh       |  10 +
 16 files changed, 696 insertions(+)
 create mode 100644 domains/anomaly-detection/summary/AnomaliesDeepDiveArchetypes.cypher
 create mode 100644 domains/anomaly-detection/summary/AnomaliesDeepDiveOverview.cypher
 create mode 100644 domains/anomaly-detection/summary/AnomaliesInTotal.cypher
 create mode 100644 domains/anomaly-detection/summary/AnomaliesPerAbstractionLayer.cypher
 create mode 100644 domains/anomaly-detection/summary/AnomalyDeepDiveTopAnomalies.cypher
 create mode 100755 domains/anomaly-detection/summary/anomalyDetectionSummary.sh
 create mode 100644 domains/anomaly-detection/summary/report.template.md
 create mode 100644 domains/anomaly-detection/summary/report_deep_dive.template.md
 create mode 100644 domains/anomaly-detection/summary/report_deep_dive_anomaly_plots.template.md
 create mode 100644 domains/anomaly-detection/summary/report_deep_dive_cluster_focus_plots.template.md
 create mode 100644 domains/anomaly-detection/summary/report_deep_dive_cluster_noise_plots.template.md
 create mode 100644 domains/anomaly-detection/summary/report_deep_dive_cluster_overall_plot.template.md
 create mode 100644 domains/anomaly-detection/summary/report_deep_dive_feature_plots.template.md
 create mode 100644 domains/anomaly-detection/summary/report_no_anomaly_detection_data.template.md

diff 
--git a/domains/anomaly-detection/anomalyDetectionCsv.sh b/domains/anomaly-detection/anomalyDetectionCsv.sh
index c49ef2936..73cb5b828 100755
--- a/domains/anomaly-detection/anomalyDetectionCsv.sh
+++ b/domains/anomaly-detection/anomalyDetectionCsv.sh
@@ -68,6 +68,8 @@ anomaly_detection_features() {
 # Required Parameters:
 # - projection_node_label=...
 #   Label of the nodes that will be used for the projection. Example: "Package"
+# - projection_language=...
+#   Name of the associated programming language. Examples: "Java", "Typescript"
 anomaly_detection_queries() {
     local nodeLabel
     nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )
@@ -97,6 +99,8 @@ anomaly_detection_queries() {
 # Required Parameters:
 # - projection_node_label=...
 #   Label of the nodes that will be used for the projection. Example: "Package"
+# - projection_language=...
+#   Name of the associated programming language. Examples: "Java", "Typescript"
 anomaly_detection_labels() {
     local nodeLabel
     nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )
@@ -127,6 +131,8 @@ anomaly_detection_labels() {
 #   Label of the nodes that will be used for the projection. Example: "Package"
 # - projection_weight_property=...
 #   Name of the node property that contains the dependency weight. Example: "weight"
+# - projection_language=...
+#   Name of the associated programming language. Examples: "Java", "Typescript"
 anomaly_detection_csv_reports() {
     time anomaly_detection_features "${@}"
     time anomaly_detection_queries "${@}"
diff --git a/domains/anomaly-detection/summary/AnomaliesDeepDiveArchetypes.cypher b/domains/anomaly-detection/summary/AnomaliesDeepDiveArchetypes.cypher
new file mode 100644
index 000000000..6db125df2
--- /dev/null
+++ b/domains/anomaly-detection/summary/AnomaliesDeepDiveArchetypes.cypher
@@ -0,0 +1,28 @@
+// Anomaly Detection Summary: Summarizes all labelled archetypes by their anomaly score including examples. Requires all other labels/*.cypher queries to run first. Variables: projection_language, projection_node_label
+
+   MATCH (codeUnit)
+   WHERE $projection_node_label IN labels(codeUnit)
+  UNWIND keys(codeUnit) AS codeUnitProperty
+    WITH *
+   WHERE codeUnitProperty STARTS WITH 'anomaly'
+     AND codeUnitProperty ENDS WITH 'Rank'
+    WITH *
+        ,coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName
+        ,split(split(codeUnitProperty, 'anomaly')[1], 'Rank')[0] AS archetype
+        ,codeUnit[codeUnitProperty] AS archetypeRank
+        ,codeUnit.anomalyScore AS anomalyScore
+ORDER BY anomalyScore DESC, archetypeRank ASC, codeUnitName ASC, archetype ASC
+    // Keep only the best ranked archetype per code unit. An explicit projection is needed here, since "WITH *" must not redeclare the already bound "archetype".
+    WITH codeUnit, codeUnitName, anomalyScore, collect(archetype)[0] AS archetype
+    WITH archetype
+        ,anomalyScore
+        ,CASE WHEN codeUnit.anomalyScore <= 0 THEN 'Typical'
+              WHEN codeUnit.anomalyTopFeature1 IS NULL THEN 'Undetermined'
+              ELSE 'Anomalous' END AS modelStatus
+        ,codeUnitName
+RETURN archetype AS `Archetype`
+      ,count(DISTINCT codeUnitName) AS `Count`
+      ,round(max(anomalyScore), 4, 'HALF_UP') AS `Max. Score`
+      ,modelStatus AS `Model Status`
+      ,apoc.text.join(collect(DISTINCT codeUnitName)[0..3], ', ') AS `Examples`
+ORDER BY modelStatus, archetype, `Max. 
Score` DESC
\ No newline at end of file
diff --git a/domains/anomaly-detection/summary/AnomaliesDeepDiveOverview.cypher b/domains/anomaly-detection/summary/AnomaliesDeepDiveOverview.cypher
new file mode 100644
index 000000000..7795b0872
--- /dev/null
+++ b/domains/anomaly-detection/summary/AnomaliesDeepDiveOverview.cypher
@@ -0,0 +1,20 @@
+// Anomaly Detection Deep Dive: Overview of the analyzed code units of one abstraction level and the number of anomalies detected. Requires all other labels/*.cypher queries to run first. Variables: projection_language, projection_node_label
+
+MATCH (codeUnit)
+WHERE $projection_node_label IN labels(codeUnit)
+  AND (codeUnit.incomingDependencies IS NOT NULL
+   OR codeUnit.outgoingDependencies IS NOT NULL)
+  WITH sum(codeUnit.anomalyLabel) AS anomalyCount
+      ,sum(sign(codeUnit.anomalyAuthorityRank)) AS authorityCount
+      ,sum(sign(codeUnit.anomalyBottleneckRank)) AS bottleNeckCount
+      ,sum(sign(codeUnit.anomalyBridgeRank)) AS bridgeCount
+      ,sum(sign(codeUnit.anomalyHubRank)) AS hubCount
+      ,sum(sign(codeUnit.anomalyOutlierRank)) AS outlierCount
+      //,collect(codeUnit.name)[0..4] AS exampleNames
+ RETURN anomalyCount AS `Anomalies`
+       ,authorityCount AS `Authorities`
+       ,bottleNeckCount AS `Bottlenecks`
+       ,bridgeCount AS `Bridges`
+       ,hubCount AS `Hubs`
+       ,outlierCount AS `Outliers`
+       //,exampleNames
\ No newline at end of file
diff --git a/domains/anomaly-detection/summary/AnomaliesInTotal.cypher b/domains/anomaly-detection/summary/AnomaliesInTotal.cypher
new file mode 100644
index 000000000..6ba1a78d9
--- /dev/null
+++ b/domains/anomaly-detection/summary/AnomaliesInTotal.cypher
@@ -0,0 +1,22 @@
+// Anomaly Detection Summary: Overview of all analyzed code units in total. Requires all other labels/*.cypher queries to run first. Variables: projection_language, projection_node_label
+
+MATCH (codeUnit)
+WHERE (codeUnit.incomingDependencies IS NOT NULL
+    OR codeUnit.outgoingDependencies IS NOT NULL)
+  WITH count(DISTINCT codeUnit) AS codeUnitCount
+      ,sum(codeUnit.anomalyLabel) AS anomalyCount
+      ,sum(sign(codeUnit.anomalyAuthorityRank)) AS authorityCount
+      ,sum(sign(codeUnit.anomalyBottleneckRank)) AS bottleNeckCount
+      ,sum(sign(codeUnit.anomalyBridgeRank)) AS bridgeCount
+      ,sum(sign(codeUnit.anomalyHubRank)) AS hubCount
+      ,sum(sign(codeUnit.anomalyOutlierRank)) AS outlierCount
+      //,collect(codeUnit.name)[0..4] AS exampleNames
+ RETURN codeUnitCount AS `Analyzed Units`
+       ,anomalyCount AS `Anomalies`
+       ,authorityCount AS `Authorities`
+       ,bottleNeckCount AS `Bottlenecks`
+       ,bridgeCount AS `Bridges`
+       ,hubCount AS `Hubs`
+       ,outlierCount AS `Outliers`
+       //,exampleNames
+ ORDER BY anomalyCount DESC, codeUnitCount DESC
\ No newline at end of file
diff --git a/domains/anomaly-detection/summary/AnomaliesPerAbstractionLayer.cypher b/domains/anomaly-detection/summary/AnomaliesPerAbstractionLayer.cypher
new file mode 100644
index 000000000..407eb5bee
--- /dev/null
+++ b/domains/anomaly-detection/summary/AnomaliesPerAbstractionLayer.cypher
@@ -0,0 +1,30 @@
+// Anomaly Detection Summary: Overview of the analyzed code units and the number of anomalies detected per abstraction layer (label combination). Requires all other labels/*.cypher queries to run first. 
Variables: projection_language, projection_node_label
+
+MATCH (codeUnit)
+WHERE (codeUnit.incomingDependencies IS NOT NULL
+    OR codeUnit.outgoingDependencies IS NOT NULL)
+UNWIND labels(codeUnit) AS codeUnitLabel
+  WITH *
+ WHERE NOT codeUnitLabel STARTS WITH 'Mark4'
+   AND NOT codeUnitLabel IN ['File', 'Directory', 'ByteCode', 'GenericDeclaration']
+  WITH collect(codeUnitLabel) AS codeUnitLabels
+      ,codeUnit
+  WITH apoc.text.join(codeUnitLabels, ',') AS codeUnitLabels
+      ,count(DISTINCT codeUnit) AS codeUnitCount
+      ,sum(codeUnit.anomalyLabel) AS anomalyCount
+      ,sum(sign(codeUnit.anomalyAuthorityRank)) AS authorityCount
+      ,sum(sign(codeUnit.anomalyBottleneckRank)) AS bottleNeckCount
+      ,sum(sign(codeUnit.anomalyBridgeRank)) AS bridgeCount
+      ,sum(sign(codeUnit.anomalyHubRank)) AS hubCount
+      ,sum(sign(codeUnit.anomalyOutlierRank)) AS outlierCount
+      //,collect(codeUnit.name)[0..4] AS exampleNames
+ RETURN codeUnitLabels AS `Abstraction Level`
+       ,codeUnitCount AS `Units`
+       ,anomalyCount AS `Anomalies`
+       ,authorityCount AS `Authorities`
+       ,bottleNeckCount AS `Bottlenecks`
+       ,bridgeCount AS `Bridges`
+       ,hubCount AS `Hubs`
+       ,outlierCount AS `Outliers`
+       //,exampleNames
+ ORDER BY anomalyCount DESC, codeUnitCount DESC
\ No newline at end of file
diff --git a/domains/anomaly-detection/summary/AnomalyDeepDiveTopAnomalies.cypher b/domains/anomaly-detection/summary/AnomalyDeepDiveTopAnomalies.cypher
new file mode 100644
index 000000000..2a08f0760
--- /dev/null
+++ b/domains/anomaly-detection/summary/AnomalyDeepDiveTopAnomalies.cypher
@@ -0,0 +1,44 @@
+// Anomaly Detection Summary: Lists the top anomalies (at most 20), the top 3 features that contributed to the decision, and the archetype classification(s) they are assigned to (if available). Requires all other labels/*.cypher queries to run first. 
Variables: projection_language, projection_node_label
+
+   MATCH (codeUnit)
+   WHERE $projection_node_label IN labels(codeUnit)
+     AND codeUnit.anomalyScore > 0
+    WITH codeUnit
+ORDER BY codeUnit.anomalyScore DESC
+  UNWIND keys(codeUnit) AS codeUnitProperty
+    WITH codeUnit
+        ,CASE WHEN codeUnitProperty STARTS WITH 'anomaly'
+               AND codeUnitProperty ENDS WITH 'Rank'
+              THEN split(split(codeUnitProperty, 'anomaly')[1], 'Rank')[0]
+          END AS archetype
+        ,CASE WHEN codeUnitProperty STARTS WITH 'anomaly'
+               AND codeUnitProperty ENDS WITH 'Rank'
+              THEN codeUnit[codeUnitProperty]
+          END AS archetypeRank
+ORDER BY codeUnit.anomalyScore DESC, archetypeRank ASC
+    WITH codeUnit
+        ,coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName
+        ,apoc.text.join(collect(DISTINCT archetype), ', ') AS archetypes
+OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit)
+    WITH *, artifact.name AS artifactName
+OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit)
+    WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName
+OPTIONAL MATCH (codeDirectory:File:Directory)-[:CONTAINS]->(codeUnit)
+    WITH *, split(replace(codeDirectory.fileName, './', ''), '/')[-2] AS directoryName
+    // A new name is needed here, since "WITH *" must not redeclare the already bound "projectName".
+    WITH *, coalesce(artifactName, projectName, directoryName, "") AS containedIn
+RETURN codeUnitName AS `Name`
+      ,containedIn AS `Contained in`
+      ,round(codeUnit.anomalyScore, 4, 'HALF_UP') AS `Anomaly Score`
+      ,collect(archetypes)[0] AS `Archetypes`
+      ,nullif(codeUnit.anomalyTopFeature1, "") AS `Top Feature 1`
+      ,nullif(round(codeUnit.anomalyTopFeatureSHAPValue1, 4, 'HALF_UP'), 0.0) AS `Top Feature 1 SHAP`
+      ,nullif(codeUnit.anomalyTopFeature2, "") AS `Top Feature 2`
+      ,nullif(round(codeUnit.anomalyTopFeatureSHAPValue2, 4, 'HALF_UP'), 0.0) AS `Top Feature 2 SHAP`
+      ,nullif(codeUnit.anomalyTopFeature3, "") AS `Top Feature 3`
+      ,nullif(round(codeUnit.anomalyTopFeatureSHAPValue3, 4, 'HALF_UP'), 0.0) AS `Top Feature 3 SHAP`
+      ,CASE WHEN codeUnit.anomalyScore <= 0 THEN 'Typical'
+            WHEN codeUnit.anomalyTopFeature1 IS NULL THEN 'Undetermined'
+            ELSE 'Anomalous' END AS `Model Status`
+ORDER BY `Anomaly Score` DESC
+ LIMIT 20
\ No newline at end of file
diff --git a/domains/anomaly-detection/summary/anomalyDetectionSummary.sh b/domains/anomaly-detection/summary/anomalyDetectionSummary.sh
new file mode 100755
index 000000000..852b400dd
--- /dev/null
+++ b/domains/anomaly-detection/summary/anomalyDetectionSummary.sh
@@ -0,0 +1,198 @@
+#!/usr/bin/env bash
+
+# Creates a Markdown report that contains the results of all anomaly detection methods.
+# It requires an already running Neo4j graph database with already scanned and analyzed artifacts.
+# The results will be written into the sub directory reports/anomaly-detection.
+# Dynamically triggered by "MarkdownReports.sh".
+
+# Note that "scripts/prepareAnalysis.sh" is required to run prior to this script.
+# Note that either "anomalyDetectionCsv.sh" or "anomalyDetectionPython.sh" is required to run prior to this script. 
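+# The assembled report is written to "reports/anomaly-detection/anomaly_detection_report.md",
+# with one deep dive section per analyzed language and node label combination (e.g. "Java_Package").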
+
+# Requires executeQueryFunctions.sh, cleanupAfterReportGeneration.sh
+
+# Fail on any error ("-e" = exit on first error, "-o pipefail" exits on errors within piped commands)
+set -o errexit -o pipefail
+
+# Overrideable Constants (defaults also defined in sub scripts)
+REPORTS_DIRECTORY=${REPORTS_DIRECTORY:-"reports"}
+MARKDOWN_INCLUDES_DIRECTORY=${MARKDOWN_INCLUDES_DIRECTORY:-"includes"}
+
+## Get this "domains/anomaly-detection/summary" directory if not already set
+# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution.
+# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
+# This way non-standard tools like readlink aren't needed.
+ANOMALY_DETECTION_SUMMARY_DIR=${ANOMALY_DETECTION_SUMMARY_DIR:-$(CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)}
+#echo "anomalyDetectionSummary: ANOMALY_DETECTION_SUMMARY_DIR=${ANOMALY_DETECTION_SUMMARY_DIR}"
+# Get the "scripts" directory by taking the path of this script and going three directories up.
+SCRIPTS_DIR=${SCRIPTS_DIR:-"${ANOMALY_DETECTION_SUMMARY_DIR}/../../../scripts"} # Repository directory containing the shell scripts
+
+MARKDOWN_SCRIPTS_DIR=${MARKDOWN_SCRIPTS_DIR:-"${SCRIPTS_DIR}/markdown"}
+#echo "anomalyDetectionSummary: MARKDOWN_SCRIPTS_DIR=${MARKDOWN_SCRIPTS_DIR}" >&2
+
+# Define functions to execute a cypher query from within a given file (first and only argument) like "execute_cypher" and "execute_cypher_summarized"
+source "${SCRIPTS_DIR}/executeQueryFunctions.sh"
+
+# Aggregates all results in a Markdown report.
+#
+# Required Parameters:
+# - projection_node_label=...
+#   Label of the nodes that will be used for the projection. Example: "Package"
+# - projection_language=...
+#   Name of the associated programming language. Examples: "Java", "Typescript"
+anomaly_detection_deep_dive_report() {
+    local nodeLabel
+    nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )
+
+    local language
+    language=$( extractQueryParameter "projection_language" "${@}" )
+
+    echo "anomalyDetectionSummary: $(date +'%Y-%m-%dT%H:%M:%S%z') Creating ${language} ${nodeLabel} anomaly summary Markdown report..."
+
+    local detail_report_directory_name="${language}_${nodeLabel}"
+    local detail_report_directory="${FULL_REPORT_DIRECTORY}/${detail_report_directory_name}"
+
+    # Skip the deep dive report part if there is no data available
+    if ! find "${detail_report_directory}" -mindepth 1 -print -quit 2>/dev/null | grep -q .; then
+        echo "anomalyDetectionSummary: $(date +'%Y-%m-%dT%H:%M:%S%z') Skipping ${language} ${nodeLabel} report..." 
>&2 + return 0 + fi + + report_number=$((report_number+1)) + + # Create the directory that contains the Markdown includes + local detail_report_include_directory="${detail_report_directory}/${MARKDOWN_INCLUDES_DIRECTORY}" + mkdir -p "${detail_report_include_directory}" + + # Collect dynamic Markdown includes + execute_cypher "${ANOMALY_DETECTION_SUMMARY_DIR}/AnomaliesDeepDiveOverview.cypher" "${@}" --output-markdown-table > "${detail_report_include_directory}/DeepDiveOverview.md" + execute_cypher "${ANOMALY_DETECTION_SUMMARY_DIR}/AnomaliesDeepDiveArchetypes.cypher" "${@}" --output-markdown-table > "${detail_report_include_directory}/DeepDiveArchetypes.md" + execute_cypher "${ANOMALY_DETECTION_SUMMARY_DIR}/AnomalyDeepDiveTopAnomalies.cypher" "${@}" --output-markdown-table > "${detail_report_include_directory}/DeepDiveTopAnomalies.md" + + # Remove empty Markdown includes + source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${detail_report_include_directory}" + + # Collect static Markdown includes (after cleanup to not remove one-liner) + echo "### 2.${report_number} ${language} ${nodeLabel}" > "${detail_report_include_directory}/DeepDiveSectionTitle.md" + echo "" > "${detail_report_include_directory}/empty.md" + cp -f "${ANOMALY_DETECTION_SUMMARY_DIR}/report_no_anomaly_detection_data.template.md" "${detail_report_include_directory}/report_no_anomaly_detection_data.template.md" + cp -f "${detail_report_directory}/Top_anomaly_features.md" "${detail_report_include_directory}" || true + + # Assemble Markdown-Includes containing plots depending on their availability (fallback empty.md) + if [ -f "${detail_report_directory}/Anomaly_feature_importance_explained.svg" ] ; then + cp -f "${ANOMALY_DETECTION_SUMMARY_DIR}/report_deep_dive_anomaly_plots.template.md" "${detail_report_include_directory}/report_deep_dive_anomaly_plots.md" + fi + + if [ -f "${detail_report_directory}/Clusters_Overall.svg" ] ; then + cp -f "${ANOMALY_DETECTION_SUMMARY_DIR}/report_deep_dive_cluster_overall_plot.template.md" "${detail_report_include_directory}/report_deep_dive_cluster_overall_plot.md" + fi + + if [ -f "${detail_report_directory}/Clusters_largest_average_radius.svg" ] ; then + cp -f "${ANOMALY_DETECTION_SUMMARY_DIR}/report_deep_dive_cluster_focus_plots.template.md" "${detail_report_include_directory}/report_deep_dive_cluster_focus_plots.md" + fi + + if [ -f "${detail_report_directory}/Cluster_probabilities.svg" ] ; then + cp -f "${ANOMALY_DETECTION_SUMMARY_DIR}/report_deep_dive_cluster_noise_plots.template.md" "${detail_report_include_directory}/report_deep_dive_cluster_noise_plots.md" + fi + + if [ -f "${detail_report_directory}/ClusteringCoefficient_distribution.svg" ] ; then + cp -f "${ANOMALY_DETECTION_SUMMARY_DIR}/report_deep_dive_feature_plots.template.md" "${detail_report_include_directory}/report_deep_dive_feature_plots.md" + fi + + # Use Markdown template to assemble the final deep dive section of the Markdown report and replace variables + cp -f "${ANOMALY_DETECTION_SUMMARY_DIR}/report_deep_dive.template.md" "${detail_report_directory}/report_deep_dive.template.md" + cat "${detail_report_directory}/report_deep_dive.template.md" | "${MARKDOWN_SCRIPTS_DIR}/embedMarkdownIncludes.sh" "${detail_report_include_directory}" > "${detail_report_directory}/report_deep_dive_with_vars.md" + sed "s/{{deep_dive_directory}}/${detail_report_directory_name}/g" "${detail_report_directory}/report_deep_dive_with_vars.md" > "${detail_report_directory}/report_deep_dive_${report_number}.md" + + rm -rf 
"${detail_report_directory}/report_deep_dive_with_vars.md" + rm -rf "${detail_report_directory}/report_deep_dive.template.md" + rm -rf "${detail_report_include_directory}" + + # Clean-up after report generation. Empty reports will be deleted. + source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${detail_report_directory}" +} + +# Run the anomaly detection report generation. +# +# Required Parameters: +# - projection_node_label=... +# Label of the nodes that will be used for the projection. Example: "Package" +# - projection_language=... +# Name of the associated programming language. Examples: "Java", "Typescript" +anomaly_detection_report() { + time anomaly_detection_deep_dive_report "${@}" +} + +anomaly_detection_front_matter_metadata_head() { + local current_date + current_date="$(date +'%Y-%m-%d')" + + local latest_tag + latest_tag="$(git ls-remote --tags origin | grep -v '\^{}' | tail -n1 | awk '{print $2}' | sed 's|refs/tags/||')" + + local analysis_directory + analysis_directory="${PWD##*/}" + + echo "---" + echo "title: \"Anomaly Detection Report\"" + echo "generated: \"${current_date}\"" + echo "model_version: \"${latest_tag}\"" + echo "dataset: \"${analysis_directory}\"" + echo "authors: [\"JohT/code-graph-analysis-pipeline\"]" + echo "---" +} + +# Finalize the anomaly detection report by taking the main template, applying includes and appending all deep dive reports +anomaly_detection_finalize_report() { + echo "anomalyDetectionSummary: $(date +'%Y-%m-%dT%H:%M:%S%z') Assembling main anomaly detection Markdown report..." + + local report_include_directory="${FULL_REPORT_DIRECTORY}/${MARKDOWN_INCLUDES_DIRECTORY}" + mkdir -p "${report_include_directory}" + + execute_cypher "${ANOMALY_DETECTION_SUMMARY_DIR}/AnomaliesPerAbstractionLayer.cypher" --output-markdown-table > "${report_include_directory}/AnomaliesPerAbstractionLayer.md" + execute_cypher "${ANOMALY_DETECTION_SUMMARY_DIR}/AnomaliesInTotal.cypher" --output-markdown-table > "${report_include_directory}/AnomaliesInTotal.md" + + # Write "front matter" metadata section + anomaly_detection_front_matter_metadata_head > "${report_include_directory}/AnomalyDetectionReportFrontMatter.md" + + # Concatenate all deep dive reports as Markdown include + rm -rf "${report_include_directory}/AnomalyDetectionDeepDive.md" + for markdown_file in $(find . 
-type f -name 'report_deep_dive_*.md' | sort); do + cat "${markdown_file}" >> "${report_include_directory}/AnomalyDetectionDeepDive.md" + echo "" >> "${report_include_directory}/AnomalyDetectionDeepDive.md" + rm -rf "${markdown_file}" + done + + # Remove empty Markdown includes + source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${report_include_directory}" + + cp -f "${ANOMALY_DETECTION_SUMMARY_DIR}/report.template.md" "${FULL_REPORT_DIRECTORY}/report.template.md" + cat "${FULL_REPORT_DIRECTORY}/report.template.md" | "${MARKDOWN_SCRIPTS_DIR}/embedMarkdownIncludes.sh" "${report_include_directory}" > "${FULL_REPORT_DIRECTORY}/anomaly_detection_report.md" + + rm -rf "${FULL_REPORT_DIRECTORY}/report.template.md" + rm -rf "${report_include_directory}" +} + +# Create report directory +REPORT_NAME="anomaly-detection" +FULL_REPORT_DIRECTORY="${REPORTS_DIRECTORY}/${REPORT_NAME}" +mkdir -p "${FULL_REPORT_DIRECTORY}" + +# Query Parameter key pairs for projection and algorithm side +ALGORITHM_NODE="projection_node_label" +ALGORITHM_LANGUAGE="projection_language" + +# -- Detail Reports for each code type ------------------------------- +report_number=0 + +anomaly_detection_report "${ALGORITHM_NODE}=Artifact" "${ALGORITHM_LANGUAGE}=Java" +anomaly_detection_report "${ALGORITHM_NODE}=Package" "${ALGORITHM_LANGUAGE}=Java" +anomaly_detection_report "${ALGORITHM_NODE}=Type" "${ALGORITHM_LANGUAGE}=Java" +anomaly_detection_report "${ALGORITHM_NODE}=Module" "${ALGORITHM_LANGUAGE}=Typescript" + +# --------------------------------------------------------------- + +anomaly_detection_finalize_report + +echo "anomalyDetectionSummary: $(date +'%Y-%m-%dT%H:%M:%S%z') Successfully finished." \ No newline at end of file diff --git a/domains/anomaly-detection/summary/report.template.md b/domains/anomaly-detection/summary/report.template.md new file mode 100644 index 000000000..4aacac7bf --- /dev/null +++ b/domains/anomaly-detection/summary/report.template.md @@ -0,0 +1,247 @@ + + +# 📊 Anomaly Detection Report + +## 1. Executive Overview + +This report analyzes structural and dependency anomalies across multiple abstraction levels of the codebase. +The goal is to detect potential **software quality, design, and architecture issues** using graph-based features, anomaly detection (Isolation Forest), and SHAP explainability. + +## 📚 Table of Contents + +1. [Executive Overview](#1-executive-overview) +1. [Deep Dives by Abstraction Level](#2-deep-dives-by-abstraction-level) +1. [Plot Interpretation Guide](#3-plot-interpretation-guide) +1. [Taxonomy of Anomaly Archetypes](#4-taxonomy-of-anomaly-archetypes) +1. [Recommendations](#5-recommendations) +1. [Appendix](#6-appendix) + +--- + +### 1.1 Anomalies in total + + + +### 1.2 Overview of Analyzed Structures + + + +## 2. Deep Dives by Abstraction Level + +Each abstraction level includes anomaly statistics, SHAP feature importance, archetype distribution, and example anomalies. + + + +## 3. Plot Interpretation Guide + +> **Purpose:** Understand each plot type’s diagnostic value. +> **Applies to:** All abstraction levels. 
+
+| Plot Type | Best For | Adds | Why It Matters |
+| --- | --- | --- | --- |
+| **Anomalies Plot** | Seeing distribution of anomalies in clusters | Context of clusters & outliers | Reveals isolation or cluster-based anomalies |
+| **SHAP Summary** | Global feature importance | Feature impact direction | Shows what drives anomalies overall |
+| **Local SHAP Force** | Explaining a single anomaly | Feature contribution breakdown | Useful for debugging individual outliers |
+| **Dependence Plot** | Understanding feature influence | Interaction visualization | Reveals nonlinear feature effects |
+| **Cluster Metrics** | Cluster characteristics | Radius, cohesion, noise | Identifies weakly defined or noisy clusters |
+
+The table above is a quick reference; the following subsections map every plot to its analytical meaning.
+
+> **Scope:** Applies to plots for *Java Type*, *Java Package*, and similar abstraction levels.
+> **Format:** Each entry includes `Best for`, `Adds`, and `Why`, matching the in-report descriptions.
+
+---
+
+### 📘 Main Plots
+
+| Plot | Description | Best For | Adds | Why |
+|------|--------------|----------|------|-----|
+| **Anomalies** | 2D visualization of all code units showing clusters and anomalies. | Understanding the overall distribution of anomalies in relation to clusters. | Context of clusters and outliers. | Reveals whether anomalies are isolated or cluster-based, guiding investigation. |
+| **Global Feature Importance (SHAP Summary)** | Mean absolute SHAP values ranking global feature impact. | Global understanding of which features drive anomalies. | Direction of impact (color shows feature value). | Explains which metrics consistently influence anomaly detection. |
+| **Feature Dependence (Top Important Features)** | Shows how specific feature values affect anomaly score; colored by interacting feature. | Understanding how one feature affects anomaly scores. | Color shows feature interaction or threshold effect. | Helps identify nonlinear relationships and feature interactions. |
+
+---
+
+### 📙 Local Explanation Plots
+
+| Plot | Description | Best For | Adds | Why |
+|------|--------------|----------|------|-----|
+| **Local SHAP Force Plots (Top Anomalies 1–6)** | Visualizes per-feature contributions to each anomaly’s score relative to baseline. | Explaining *why a specific data point* is anomalous. | Visual breakdown of how each feature contributes to anomaly score. | Enables debugging of individual anomalies through transparent explanation. |
+
+---
+
+### 📗 Cluster-Level Diagnostic Plots
+
+| Plot | Description | Best For | Adds | Why |
+|------|--------------|----------|------|-----|
+| **Clusters – Overall** | Shows all clusters in a single plot. | Gaining a holistic view of cluster characteristics in the dataset. | An overall summary of how all clusters are distributed and their key metrics. | Understanding the general structure and properties of clusters helps identify patterns and potential anomalies in the data. |
+| **Clusters – Largest Average Radius** | Ranks clusters by mean distance of members from their centroid. | Getting an overview of clusters that are more dispersed. | Identifies clusters with internal variability. | Large average radius suggests less cohesion and potential outliers. |
+| **Clusters – Largest Max Radius** | Shows clusters with the farthest outlying member. | Identifying clusters that have members farthest from cluster center. | Highlights clusters containing extreme outliers. 
| Indicates clusters that may contain hidden anomalies. | +| **Clusters – Largest Size** | Displays cluster membership counts. | Understanding which clusters contain the most code units. | Provides sense of frequency of code structures. | Large clusters may represent common design patterns; small clusters are specialized. | +| **Cluster Probabilities** | Distribution of HDBSCAN membership probabilities. | Detecting code units that don’t strongly belong to any cluster. | Measures how well-defined clusters are. | Highlights noisy or weakly defined clusters. | + +--- + +### 📒 Cluster Noise & Bridge Diagnostics + +| Plot | Description | Best For | Adds | Why | +|------|--------------|----------|------|-----| +| **Cluster Noise – Highly Central and Popular** | Central nodes that don’t fit any cluster. | Detecting code units that are highly connected but anomalous. | Reveals influential but misfit nodes. | Such nodes may be key but unstable integration points. | +| **Cluster Noise – Poorly Integrated Bridges** | Nodes connecting clusters but weakly integrated. | Detecting code units that bridge modules unusually. | Identifies cross-cutting or leaking dependencies. | May reveal architectural boundary violations. | +| **Cluster Noise – Role Inverted Bridges** | Bridges with reversed structural roles compared to expected topology. | Detecting code units connecting clusters in unexpected ways. | Highlights anomalous coupling roles. | Indicates architectural inversion or misuse of interfaces. | + +--- + +### 📙 Feature Distribution & Relationship Plots + +| Plot | Description | Best For | Adds | Why | +|------|--------------|----------|------|-----| +| **Betweenness Centrality Distribution** | Histogram of betweenness values. | Identifying code units that act as structural bridges. | Insight into flow of dependency control. | Detects potential bottlenecks or single points of failure. | +| **Clustering Coefficient Distribution** | Histogram of local clustering coefficients. | Identifying modularity and local cohesion. | Insight into how tightly code units cluster. | Reveals how cohesive or isolated different regions of the graph are. | +| **PageRank – ArticleRank Difference Distribution** | Distribution of `PageRank - ArticleRank`. | Identifying influential nodes beyond local connectivity. | Shows imbalance between influence and popularity. | Highlights components with disproportionate architectural impact. | +| **Clustering Coefficient vs PageRank** | Scatterplot comparing local clustering to global influence. | Identifying relationships between cohesion and centrality. | Visualizes trade-offs between modularity and reach. | Helps spot code units that are both locally and globally critical. 
| + +--- + +### 📔 Summary Categories + +| Category | Included Plots | Typical Usage | +|-----------|----------------|----------------| +| **Main Diagnostic** | Anomalies, Global SHAP, Feature Dependence | High-level anomaly review | +| **Local Explanation** | Local SHAP Force Plots | Case-by-case anomaly debugging | +| **Cluster Diagnostics** | Cluster Radius / Size / Probability | Assess cluster cohesion and outliers | +| **Cluster Noise Analysis** | Cluster Noise (3 types) | Identify special structural anomalies | +| **Feature Distributions** | Betweenness, Clustering, Rank Difference | Assess feature-based structure patterns | +| **Feature Relationships** | Clustering vs PageRank | Evaluate global vs local influence balance | + +--- + +### 💡 Reading Guidance + +- **Color Conventions:** + Red = anomalous, Green = typical, Light grey = noise, Pale colors = clusters. +- **Scales:** + SHAP values are normalized (mean absolute); graph metrics standardized by z-score. +- **How to Use:** + 1. Start with *Main Diagnostic* plots to identify anomalies and drivers. + 2. Use *Local SHAP* for detailed case analysis. + 3. Check *Cluster Diagnostics* and *Noise Plots* to verify grouping quality. + 4. Use *Feature Distributions* to contextualize metrics. + 5. Cross-reference *Feature Relationships* for architectural interpretation. + +--- + +### 📄 Structured Form (YAML Summary) + +You can include this in your appendix for machine-readable mapping: + +```yaml +plots: + main: + - name: Anomalies + purpose: Distribution of anomalies and clusters + - name: Global Feature Importance (SHAP) + purpose: Global feature ranking + - name: Feature Dependence + purpose: Feature–score relationship + local: + - name: Local SHAP Force Plots + purpose: Local explanations for top anomalies + cluster: + - name: Clusters Largest Average Radius + purpose: Identify dispersed clusters + - name: Clusters Largest Max Radius + purpose: Identify extreme outlier clusters + - name: Clusters Largest Size + purpose: Identify dominant cluster types + - name: Cluster Probabilities + purpose: Assess cluster definition strength + cluster_noise: + - name: Cluster Noise – Highly Central and Popular + purpose: Central anomalies without cluster fit + - name: Cluster Noise – Poorly Integrated Bridges + purpose: Weakly integrated bridges + - name: Cluster Noise – Role Inverted Bridges + purpose: Inverted bridge roles + feature_distributions: + - name: Betweenness Centrality Distribution + purpose: Bridge and bottleneck detection + - name: Clustering Coefficient Distribution + purpose: Cohesion and modularity measurement + - name: PageRank – ArticleRank Difference Distribution + purpose: Influence vs popularity analysis + feature_relationships: + - name: Clustering Coefficient vs PageRank + purpose: Local vs global influence comparison +``` + +## 4. 
Taxonomy of Anomaly Archetypes + +| Archetype | Feature Profile | Architectural Risk | +|-----------|-----------------|--------------------| +| **Hub** | High degree, low clustering coefficient | Central dependency; fragile hotspot | +| **Bottleneck** | High betweenness, low redundancy | Single point of failure; slows evolution | +| **Outlier** | High cluster distance, small cluster size | Misfit or irregular dependency pattern | +| **Authority** | High PageRank, low ArticleRank | Over-relied utility; low local stability | +| **Bridge** | Cross-cluster connection | Risky coupling; weak modular boundaries | + +**Structured form (for LLM parsing):** + +```yaml +archetypes: + - name: Hub + profile: High degree, low clustering coefficient + risk: Central dependency, fragile hotspot + - name: Bottleneck + profile: High betweenness, low redundancy + risk: Single point of failure + - name: Outlier + profile: High cluster distance, small cluster size + risk: Misfit component + - name: Authority + profile: High PageRank, low ArticleRank + risk: Over-relied utility + - name: Bridge + profile: Cross-cluster connector + risk: Risky coupling +``` + +--- + +## 5. Recommendations + +* **Refactor hubs:** Decompose large or over-connected utilities. +* **Mitigate bottlenecks:** Introduce redundancy or alternative communication paths. +* **Investigate outliers:** Determine if anomalies are justified exceptions. +* **Raise cohesion:** Increase local clustering by improving modular boundaries. +* **Stabilize authorities:** Encapsulate frequently used but fragile components. +* **Validate bridges:** Confirm cross-cluster connectors are intentional and safe. + +--- + +## 6. Appendix + +### 6.1 Methodology Overview + +1. Build dependency graph (types, packages, artifacts). +1. Compute graph metrics: degree, PageRank, betweenness, clustering coefficient, etc. +1. Generate embeddings via Fast Random Projection. +1. Reduce embeddings with PCA (retain 90% variance). +1. Train Isolation Forest for anomaly detection. +1. Explain results using SHAP (via Random Forest proxy). +1. Cluster anomalies via HDBSCAN, tuned with Leiden reference communities (AMI score). +1. Hyperparameter optimization for both Isolation Forest and Random Forest proxy with their F1 score + +### 6.2 Feature Set + +* Degree (in/out) +* PageRank +* ArticleRank +* Page-to-Article Rank Difference +* Betweenness Centrality +* Local Clustering Coefficient +* Cluster Outlier Score (1.0 - cluster probability) +* Cluster Radius (avg, max) +* Cluster Size +* Node Embedding (PCA 20–35 dims) diff --git a/domains/anomaly-detection/summary/report_deep_dive.template.md b/domains/anomaly-detection/summary/report_deep_dive.template.md new file mode 100644 index 000000000..164c6048c --- /dev/null +++ b/domains/anomaly-detection/summary/report_deep_dive.template.md @@ -0,0 +1,33 @@ + + +#### Anomaly Results + +##### Total anomalies + + + +##### Top global contributing features (via SHAP) + + + +#### Archetype Distribution + + + +#### Top anomalies with their local contributing features (via SHAP) + + + +#### Visualizations + +See [Plot Interpretation Guide](#3-plot-interpretation-guide) on how to read the plots in detail. 
+ + + + + + + + + + diff --git a/domains/anomaly-detection/summary/report_deep_dive_anomaly_plots.template.md b/domains/anomaly-detection/summary/report_deep_dive_anomaly_plots.template.md new file mode 100644 index 000000000..4c169d685 --- /dev/null +++ b/domains/anomaly-detection/summary/report_deep_dive_anomaly_plots.template.md @@ -0,0 +1,24 @@ +##### Anomalies + +![Anomalies](./{{deep_dive_directory}}/Anomalies.svg) + +##### Global feature importance SHAP summary plots + +![Anomaly feature importance explained (global)](./{{deep_dive_directory}}/Anomaly_feature_importance_explained.svg) + +##### Feature dependence plots for top important features + +![Anomaly feature dependence explained (global)](./{{deep_dive_directory}}/Anomaly_feature_dependence_explained.svg) + +--- + +##### Local SHAP Force Plots – Top 6 Anomalies + +![Top 1 anomaly - local feature importance](./{{deep_dive_directory}}/Anomaly_1_shap_explanation.svg) +![Top 2 anomaly - local feature importance](./{{deep_dive_directory}}/Anomaly_2_shap_explanation.svg) +![Top 3 anomaly - local feature importance](./{{deep_dive_directory}}/Anomaly_3_shap_explanation.svg) +![Top 4 anomaly - local feature importance](./{{deep_dive_directory}}/Anomaly_4_shap_explanation.svg) +![Top 5 anomaly - local feature importance](./{{deep_dive_directory}}/Anomaly_5_shap_explanation.svg) +![Top 6 anomaly - local feature importance](./{{deep_dive_directory}}/Anomaly_6_shap_explanation.svg) + +--- diff --git a/domains/anomaly-detection/summary/report_deep_dive_cluster_focus_plots.template.md b/domains/anomaly-detection/summary/report_deep_dive_cluster_focus_plots.template.md new file mode 100644 index 000000000..c8a7c26bf --- /dev/null +++ b/domains/anomaly-detection/summary/report_deep_dive_cluster_focus_plots.template.md @@ -0,0 +1,7 @@ +##### Cluster Diagnostics + +![Clusters largest average radius](./{{deep_dive_directory}}/Clusters_largest_average_radius.svg) +![Clusters largest max radius](./{{deep_dive_directory}}/Clusters_largest_max_radius.svg) +![Clusters largest size](./{{deep_dive_directory}}/Clusters_largest_size.svg) + +--- diff --git a/domains/anomaly-detection/summary/report_deep_dive_cluster_noise_plots.template.md b/domains/anomaly-detection/summary/report_deep_dive_cluster_noise_plots.template.md new file mode 100644 index 000000000..3b26dd63b --- /dev/null +++ b/domains/anomaly-detection/summary/report_deep_dive_cluster_noise_plots.template.md @@ -0,0 +1,13 @@ +##### Cluster Membership Strength + +![Cluster probabilities](./{{deep_dive_directory}}/Cluster_probabilities.svg) + +--- + +##### Cluster Noise and Bridge Analysis + +![Cluster Noise: Highly central and popular](./{{deep_dive_directory}}/ClusterNoise_highly_central_and_popular.svg) +![Cluster Noise: Poorly integrated bridges](./{{deep_dive_directory}}/ClusterNoise_poorly_integrated_bridges.svg) +![Cluster Noise: Role inverted bridges](./{{deep_dive_directory}}/ClusterNoise_role_inverted_bridges.svg) + +--- diff --git a/domains/anomaly-detection/summary/report_deep_dive_cluster_overall_plot.template.md b/domains/anomaly-detection/summary/report_deep_dive_cluster_overall_plot.template.md new file mode 100644 index 000000000..f5331fcd2 --- /dev/null +++ b/domains/anomaly-detection/summary/report_deep_dive_cluster_overall_plot.template.md @@ -0,0 +1,5 @@ +##### Cluster Diagnostics + +![Cluster Overall](./{{deep_dive_directory}}/Clusters_Overall.svg) + +--- diff --git a/domains/anomaly-detection/summary/report_deep_dive_feature_plots.template.md 
b/domains/anomaly-detection/summary/report_deep_dive_feature_plots.template.md
new file mode 100644
index 000000000..92c8d4c32
--- /dev/null
+++ b/domains/anomaly-detection/summary/report_deep_dive_feature_plots.template.md
@@ -0,0 +1,13 @@
+##### Feature Distributions
+
+![Betweenness Centrality Distribution](./{{deep_dive_directory}}/BetweennessCentrality_distribution.svg)
+![Clustering coefficient distribution](./{{deep_dive_directory}}/ClusteringCoefficient_distribution.svg)
+![PageRank minus ArticleRank distribution](./{{deep_dive_directory}}/PageRank_Minus_ArticleRank_Distribution.svg)
+
+---
+
+##### Feature Relationships
+
+![Clustering coefficient versus PageRank](./{{deep_dive_directory}}/ClusteringCoefficient_versus_PageRank.svg)
+
+---
diff --git a/domains/anomaly-detection/summary/report_no_anomaly_detection_data.template.md b/domains/anomaly-detection/summary/report_no_anomaly_detection_data.template.md
new file mode 100644
index 000000000..2ea56f4e3
--- /dev/null
+++ b/domains/anomaly-detection/summary/report_no_anomaly_detection_data.template.md
@@ -0,0 +1 @@
+⚠️ _No anomaly detection and SHAP data available for this level (model skipped or insufficient samples)._
\ No newline at end of file
diff --git a/scripts/cleanupAfterReportGeneration.sh b/scripts/cleanupAfterReportGeneration.sh
index f543e5f6b..186b3b3c7 100644
--- a/scripts/cleanupAfterReportGeneration.sh
+++ b/scripts/cleanupAfterReportGeneration.sh
@@ -29,6 +29,16 @@ find "${report_directory}" -type f -name "*.csv" | sort | while read -r report_file; do
     fi
 done
+# Find all Markdown (md) files in the report directory
+# and delete the ones that contain fewer than 3 lines.
+find "${report_directory}" -type f -name "*.md" | sort | while read -r report_file; do
+    number_of_lines=$(wc -l < "${report_file}" | awk '{print $1}')
+    if [[ "${number_of_lines}" -le 2 ]]; then
+        echo "cleanupReports: deleting empty (${number_of_lines} lines) report file ${report_file}"
+        rm -f "${report_file}"
+    fi
+done
+
 # Delete reports directory if its empty
 number_files_in_report_directory=$( find "${report_directory}" -type f | wc -l | awk '{print $1}' )
 if [[ "${number_files_in_report_directory}" -lt 1 ]]; then
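For orientation, here is a minimal sketch of how the pieces from this patch series are expected to chain together at run time. The relative paths and the analysis working directory are illustrative assumptions, not part of the patches:

```bash
#!/usr/bin/env bash
# Hypothetical end-to-end run from inside an analysis working directory,
# assuming the repository layout that the scripts above reference.

# Prepare the graph analysis (required before any report script).
source ./../../scripts/prepareAnalysis.sh

# Produce the anomaly detection results that the Markdown summary aggregates.
source ./../../domains/anomaly-detection/anomalyDetectionCsv.sh

# Run only the Markdown reports: this discovers every "*Markdown.sh" script,
# including "anomalyDetectionMarkdown.sh", which in turn sources
# "summary/anomalyDetectionSummary.sh" to assemble
# "reports/anomaly-detection/anomaly_detection_report.md".
source ./../../scripts/reports/compilations/MarkdownReports.sh
```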