diff --git a/.github/workflows/public-analyze-code-graph.yml b/.github/workflows/public-analyze-code-graph.yml index 3b481b83f..6946d7a07 100644 --- a/.github/workflows/public-analyze-code-graph.yml +++ b/.github/workflows/public-analyze-code-graph.yml @@ -100,6 +100,7 @@ jobs: repository: JohT/code-graph-analysis-pipeline ref: ${{ inputs.ref }} persist-credentials: false + fetch-tags: true - name: (Java Setup) Java Development Kit (JDK) ${{ matrix.java }} uses: actions/setup-java@dded0888837ed1f317902acf8a20df0ad188d165 # v5 diff --git a/domains/anomaly-detection/anomalyDetectionCsv.sh b/domains/anomaly-detection/anomalyDetectionCsv.sh index 73cb5b828..b4e276dcd 100755 --- a/domains/anomaly-detection/anomalyDetectionCsv.sh +++ b/domains/anomaly-detection/anomalyDetectionCsv.sh @@ -61,6 +61,9 @@ anomaly_detection_features() { # Determine the article rank if not already done execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-ArticleRank-Exists.cypher" \ "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-ArticleRank-Write.cypher" "${@}" + # Determine the normalized difference between Page Rank and Article Rank if not already done + execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageToArticleRank-Exists.cypher" \ + "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageToArticleRank-Write.cypher" "${@}" } # Run queries to find anomalies in the graph. 
diff --git a/domains/anomaly-detection/anomalyDetectionPython.sh b/domains/anomaly-detection/anomalyDetectionPython.sh index c4216a628..80c3c1109 100755 --- a/domains/anomaly-detection/anomalyDetectionPython.sh +++ b/domains/anomaly-detection/anomalyDetectionPython.sh @@ -106,6 +106,9 @@ anomaly_detection_features() { # Determine the article rank if not already done execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-ArticleRank-Exists.cypher" \ "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-ArticleRank-Write.cypher" "${@}" + # Determine the normalized difference between Page Rank and Article Rank if not already done + execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageToArticleRank-Exists.cypher" \ + "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageToArticleRank-Write.cypher" "${@}" } # Execute the Python scripts for anomaly detection. diff --git a/domains/anomaly-detection/anomalyDetectionVisualization.sh b/domains/anomaly-detection/anomalyDetectionVisualization.sh new file mode 100755 index 000000000..513af9962 --- /dev/null +++ b/domains/anomaly-detection/anomalyDetectionVisualization.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +# This script is dynamically triggered by "VisualizationReports.sh" when report "All" or "Visualization" is enabled. +# It is designed as an entry point and delegates the execution to the dedicated "anomalyDetectionGraphVisualization.sh" script that does the "heavy lifting". + +# Note that "scripts/prepareAnalysis.sh" is required to run prior to this script. 
+ +# Requires anomalyDetectionGraphVisualization.sh + +# Fail on any error ("-e" = exit on first error, "-o pipefail" exits on errors within piped commands) +set -o errexit -o pipefail + +# Overridable constants (defaults also defined in sub scripts) +REPORTS_DIRECTORY=${REPORTS_DIRECTORY:-"reports"} + +## Get the directory of this script ("domains/anomaly-detection") if not already set +# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution. +# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes. +# This way non-standard tools like readlink aren't needed. +ANOMALY_DETECTION_SCRIPT_DIR=${ANOMALY_DETECTION_SCRIPT_DIR:-$(CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)} +# echo "anomalyDetectionVisualization: ANOMALY_DETECTION_SCRIPT_DIR=${ANOMALY_DETECTION_SCRIPT_DIR}" + +# Get the "graphs" directory by taking the path of this script and appending "graphs". +ANOMALY_DETECTION_GRAPHS_DIR=${ANOMALY_DETECTION_GRAPHS_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/graphs"} # Contains everything (scripts, queries, templates) to create the graph visualizations for anomaly detection + +# Delegate the execution to the responsible script. 
+source "${ANOMALY_DETECTION_GRAPHS_DIR}/anomalyDetectionGraphVisualization.sh" \ No newline at end of file diff --git a/domains/anomaly-detection/features/AnomalyDetectionFeature-PageToArticleRank-Exists.cypher b/domains/anomaly-detection/features/AnomalyDetectionFeature-PageToArticleRank-Exists.cypher new file mode 100644 index 000000000..0524df410 --- /dev/null +++ b/domains/anomaly-detection/features/AnomalyDetectionFeature-PageToArticleRank-Exists.cypher @@ -0,0 +1,11 @@ +// Return the first node that has all of the properties "centralityPageRankToArticleRankDifference", "centralityPageRankNormalized" and "centralityArticleRankNormalized" if it exists + + MATCH (codeUnit) + WHERE $projection_node_label IN labels(codeUnit) + AND codeUnit.centralityPageRankToArticleRankDifference IS NOT NULL + AND codeUnit.centralityPageRankNormalized IS NOT NULL + AND codeUnit.centralityArticleRankNormalized IS NOT NULL + RETURN codeUnit.name AS shortCodeUnitName + ,elementId(codeUnit) AS nodeElementId + ,codeUnit.centralityPageRankToArticleRankDifference AS pageToArticleRankDifference + LIMIT 1 \ No newline at end of file diff --git a/domains/anomaly-detection/features/AnomalyDetectionFeature-PageToArticleRank-Write.cypher b/domains/anomaly-detection/features/AnomalyDetectionFeature-PageToArticleRank-Write.cypher new file mode 100644 index 000000000..9fc2b3153 --- /dev/null +++ b/domains/anomaly-detection/features/AnomalyDetectionFeature-PageToArticleRank-Write.cypher @@ -0,0 +1,21 @@ +// Calculates and writes the (amongst others) "centralityPageRankToArticleRankDifference" property. 
+ + MATCH (codeUnit) + WHERE $projection_node_label IN labels(codeUnit) + AND codeUnit.centralityPageRank IS NOT NULL + AND codeUnit.centralityArticleRank IS NOT NULL + WITH collect(codeUnit) AS codeUnits + ,min(codeUnit.centralityPageRank) AS minPageRank + ,max(codeUnit.centralityPageRank) AS maxPageRank + ,min(codeUnit.centralityArticleRank) AS minArticleRank + ,max(codeUnit.centralityArticleRank) AS maxArticleRank + UNWIND codeUnits AS codeUnit + WITH * + ,CASE WHEN maxPageRank > minPageRank THEN (codeUnit.centralityPageRank - minPageRank) / (maxPageRank - minPageRank) ELSE 0.0 END AS normalizedPageRank // min-max normalization; 0.0 when all Page Ranks are equal (avoids division by zero) + ,CASE WHEN maxArticleRank > minArticleRank THEN (codeUnit.centralityArticleRank - minArticleRank) / (maxArticleRank - minArticleRank) ELSE 0.0 END AS normalizedArticleRank // min-max normalization; 0.0 when all Article Ranks are equal (avoids division by zero) + WITH * + ,normalizedPageRank - normalizedArticleRank AS normalizedPageRankToArticleRankDifference + SET codeUnit.centralityPageRankToArticleRankDifference = normalizedPageRankToArticleRankDifference + ,codeUnit.centralityPageRankNormalized = normalizedPageRank + ,codeUnit.centralityArticleRankNormalized = normalizedArticleRank +RETURN count(*) AS nodePropertiesWritten \ No newline at end of file diff --git a/domains/anomaly-detection/graphs/TopAuthority.cypher b/domains/anomaly-detection/graphs/TopAuthority.cypher new file mode 100644 index 000000000..8582287d9 --- /dev/null +++ b/domains/anomaly-detection/graphs/TopAuthority.cypher @@ -0,0 +1,114 @@ +// Anomaly Detection Graphs: Find top nodes marked as "Authority" including their incoming and outgoing dependencies, sizes based on PageRank and thick outline for nodes with high Page Rank to Article Rank difference in Graphviz format. + +// Step 1: Query overall statistics, e.g. 
min/max weight for later normalization + MATCH (sourceForStatistics)-[dependencyForStatistics:DEPENDS_ON]->(targetForStatistics) + WHERE $projection_node_label IN labels(sourceForStatistics) + AND $projection_node_label IN labels(targetForStatistics) + WITH max(coalesce(dependencyForStatistics.weight25PercentInterfaces, dependencyForStatistics.weight)) AS maxWeight + ,percentileDisc(sourceForStatistics.centralityPageRankToArticleRankDifference, 0.80) AS pageToArticleRankThreshold + ,percentileDisc(targetForStatistics.centralityPageRankNormalized, 0.80) AS pageRankThreshold +// Step 2: Query selected central node + MATCH (central) + WHERE $projection_node_label IN labels(central) + AND central.anomalyAuthorityRank = toInteger($projection_node_rank) + WITH maxWeight + ,pageToArticleRankThreshold + ,pageRankThreshold + ,central + ,"Top Rank #" + $projection_node_rank + " " + $projection_language + " " + $projection_node_label + " Authority\\n" AS graphLabel + ,coalesce(central.fqn, central.globalFqn, central.fileName, central.signature, central.name) AS targetName + WITH *, "\\n\\ndark nodes: incoming dependencies (limited max. 40)\\n" AS graphLegend + WITH *, graphLegend + "bright nodes: outgoing dependencies (limited max. 
40)\\n" AS graphLegend + WITH *, graphLegend + "node value: Page Rank (normalized)\\n" AS graphLegend + WITH *, graphLegend + "large circle: > 80% percentile of Page Rank\\n" AS graphLegend + WITH *, graphLegend + "thick outline: > 80% percentile of Page Rank to Article Rank Difference\\n" AS graphLegend + WITH *, ["graph [label=\"" + graphLabel + targetName + graphLegend + "\\n\"];"] AS graphVizOutput + WITH *, "🏛️ authority #" + central.anomalyAuthorityRank + "\\n" + central.name AS centralNodeLabel + WITH *, graphVizOutput + ["central [label=\"" + centralNodeLabel + "\"];"] AS graphVizOutput +// Step 3: Query direct incoming dependencies to the central node + MATCH (source)-[dependency:DEPENDS_ON]->(central) + WHERE $projection_node_label IN labels(source) + AND source.outgoingDependencies > 0 + ORDER BY dependency.weight DESC, source.name ASC + LIMIT 40 + WITH *, coalesce(dependency.weight25PercentInterfaces, dependency.weight, 1) AS weight + WITH *, round((toFloat(weight) / toFloat(maxWeight) * 2.5) + 0.4, 1.0) AS penWidth // edge width scaled relative to the heaviest dependency + WITH *, "label=" + weight + "; weight=" + weight + "; penwidth=" + penWidth AS edgeAttributes + WITH *, CASE WHEN source.centralityPageRankToArticleRankDifference >= pageToArticleRankThreshold + THEN 5 ELSE 2 END AS scaledNodeBorder + WITH *, CASE WHEN source.centralityPageRankNormalized >= pageRankThreshold + THEN "shape = \"circle\"; height=2; " ELSE "" END AS nodeEmphasis + WITH *, round(source.centralityPageRankNormalized * 100.0, 2) + "%" AS labelValue + // Add the last part of the element id to the node name to make it unique. 
+ WITH *, source.name + "_" + split(elementId(source), ':')[-1] AS sourceId + WITH *, "penwidth = " + scaledNodeBorder + "; " AS directInBorder + // Split long names like inner classes identified by a dollar sign ($) + WITH *, replace(source.name, '$', '$\\n') AS sourceNameSplit + WITH *, "label = \"" + sourceNameSplit + "\\n(" + labelValue + ")\"; " AS directInLabel + WITH *, " [" + nodeEmphasis + directInLabel + directInBorder + "]; " AS directInNodeProperties + WITH *, "\"" + sourceId + "\" " + directInNodeProperties AS directInNode + WITH maxWeight + ,pageToArticleRankThreshold + ,pageRankThreshold + ,central + ,graphVizOutput + ,collect(source) AS incomingDependencyNodes + ,collect(directInNode + "\"" + sourceId + "\" -> central [" + edgeAttributes + "];") AS directInEdges + WITH *, graphVizOutput + directInEdges AS graphVizOutput +// Step 4: Query direct outgoing dependencies from the central node + MATCH (source)<-[dependency:DEPENDS_ON]-(central) + WHERE $projection_node_label IN labels(source) + AND source.incomingDependencies > 0 + ORDER BY dependency.weight DESC, source.name ASC + LIMIT 40 + WITH *, coalesce(dependency.weight25PercentInterfaces, dependency.weight, 1) AS weight + WITH *, round((toFloat(weight) / toFloat(maxWeight) * 2.5) + 0.4, 1.0) AS penWidth + WITH *, "label=" + weight + "; weight=" + weight + "; penwidth=" + penWidth AS edgeAttributes + // Use a lighter color for the target nodes of outgoing dependencies from the central node and their edges + WITH *, edgeAttributes + "; color = 5" AS edgeAttributes + WITH *, "color = 5; fillcolor = 1; " AS directOutColor + WITH *, CASE WHEN source.centralityPageRankToArticleRankDifference >= pageToArticleRankThreshold + THEN 5 ELSE 2 END AS scaledNodeBorder + WITH *, CASE WHEN source.centralityPageRankNormalized >= pageRankThreshold + THEN "shape = \"circle\"; height=2; " ELSE "" END AS nodeEmphasis + WITH *, round(source.centralityPageRankNormalized * 100.0, 2) + "%" AS labelValue + // Add the last 
part of the element id to the node name to make it unique. + WITH *, source.name + "_" + split(elementId(source), ':')[-1] AS sourceId + WITH *, "penwidth = " + scaledNodeBorder + "; " AS directOutBorder + // Split long names like inner classes identified by a dollar sign ($) + WITH *, replace(source.name, '$', '$\\n') AS sourceNameSplit + WITH *, "label = \"" + sourceNameSplit + "\\n(" + labelValue + ")\"; " AS directOutLabel + WITH *, " [" + nodeEmphasis + directOutLabel + directOutBorder + directOutColor + "]; " AS directOutNodeProperties + WITH *, "\"" + sourceId + "\" " + directOutNodeProperties AS directOutNode + WITH maxWeight + ,central + ,graphVizOutput + ,incomingDependencyNodes + ,collect(source) AS outgoingDependencyNodes + ,collect(directOutNode + "central -> \"" + sourceId + "\" [" + edgeAttributes + "];") AS directOutEdges + WITH *, graphVizOutput + directOutEdges AS graphVizOutput + WITH *, incomingDependencyNodes + outgoingDependencyNodes AS directDependentNodes +// Step 5: Query dependencies between direct dependencies outside the central node + UNWIND directDependentNodes AS directDependentNode + MATCH (directDependentNode)-[dependency:DEPENDS_ON]->(anotherDirectDependentNode) + WHERE anotherDirectDependentNode IN directDependentNodes + AND anotherDirectDependentNode <> directDependentNode + ORDER BY dependency.weight DESC, directDependentNode.name ASC + WITH graphVizOutput + ,directDependentNode + ,dependency + ,collect(anotherDirectDependentNode)[0] AS firstLinkedDependentNode + LIMIT 140 + WITH *, coalesce(dependency.weight25PercentInterfaces, dependency.weight, 1) AS weight + // Use a fixed small pen width for secondary dependencies for better visibility of the more important direct dependency + WITH *, "label=" + weight + "; weight=" + weight + "; penwidth=0.3" AS edgeAttributes + // Use an even lighter color for secondary dependency edges + WITH *, edgeAttributes + "; color = 3" AS edgeAttributes + // Add the last part of the element id to 
the node name to make it unique. + WITH *, directDependentNode.name + "_" + split(elementId(directDependentNode), ':')[-1] AS directDependentNodeId + WITH *, firstLinkedDependentNode.name + "_" + split(elementId(firstLinkedDependentNode), ':')[-1] AS firstLinkedDependentNodeId + WITH *, "\"" + directDependentNodeId + "\" -> \"" + firstLinkedDependentNodeId + "\"" AS directDependenciesEdge + WITH *, collect(directDependenciesEdge + " [" + edgeAttributes + "]") AS directDependenciesEdges + WITH *, graphVizOutput + directDependenciesEdges AS graphVizOutput +UNWIND graphVizOutput AS graphVizOutputLine +RETURN DISTINCT graphVizOutputLine \ No newline at end of file diff --git a/domains/anomaly-detection/graphs/TopBottleneck.cypher b/domains/anomaly-detection/graphs/TopBottleneck.cypher new file mode 100644 index 000000000..ff9e08e5b --- /dev/null +++ b/domains/anomaly-detection/graphs/TopBottleneck.cypher @@ -0,0 +1,107 @@ +// Anomaly Detection Graphs: Find top nodes marked as "Bottleneck" including their incoming and outgoing dependencies and output them in Graphviz format. + +// Step 1: Query overall statistics, e.g. 
min/max weight for later normalization + MATCH (sourceForStatistics)-[dependencyForStatistics:DEPENDS_ON]->(targetForStatistics) + WHERE $projection_node_label IN labels(sourceForStatistics) + AND $projection_node_label IN labels(targetForStatistics) + WITH max(coalesce(dependencyForStatistics.weight25PercentInterfaces, dependencyForStatistics.weight)) AS maxWeight + ,percentileDisc(sourceForStatistics.centralityBetweenness, 0.90) AS betweennessThreshold +// Step 2: Query selected central node + MATCH (central) + WHERE $projection_node_label IN labels(central) + AND central.anomalyBottleneckRank = toInteger($projection_node_rank) + WITH maxWeight + ,betweennessThreshold + ,central + ,"Top Rank #" + $projection_node_rank + " " + $projection_language + " " + $projection_node_label + " Bottleneck\\n" AS graphLabel + ,coalesce(central.fqn, central.globalFqn, central.fileName, central.signature, central.name) AS targetName + WITH *, "\\n\\ndark nodes: incoming dependencies (limited max. 40)\\n" AS graphLegend + WITH *, graphLegend + "bright nodes: outgoing dependencies (limited max. 
40)\\n" AS graphLegend + WITH *, graphLegend + "node value: Betweenness centrality\\n" AS graphLegend + WITH *, graphLegend + "thick outline: > 90% percentile of Betweenness centrality\\n" AS graphLegend + WITH *, ["graph [label=\"" + graphLabel + targetName + graphLegend + "\\n\"];"] AS graphVizOutput + WITH *, "🔒 bottleneck #" + central.anomalyBottleneckRank + "\\n" + central.name AS centralNodeLabel + WITH *, graphVizOutput + ["central [label=\"" + centralNodeLabel + "\"];"] AS graphVizOutput +// Step 3: Query direct incoming dependencies to the central node + MATCH (source)-[dependency:DEPENDS_ON]->(central) + WHERE $projection_node_label IN labels(source) + AND source.outgoingDependencies > 0 + ORDER BY dependency.weight DESC, source.name ASC + LIMIT 40 + WITH *, coalesce(dependency.weight25PercentInterfaces, dependency.weight, 1) AS weight + WITH *, round((toFloat(weight) / toFloat(maxWeight) * 2.5) + 0.4, 1.0) AS penWidth // edge width scaled relative to the heaviest dependency + WITH *, "label=" + weight + "; weight=" + weight + "; penwidth=" + penWidth AS edgeAttributes + WITH *, CASE WHEN source.centralityBetweenness >= betweennessThreshold + THEN 5 ELSE 2 END AS scaledNodeBorder + WITH *, round(source.centralityBetweenness, 2) AS labelValue + // Add the last part of the element id to the node name to make it unique. 
+ WITH *, source.name + "_" + split(elementId(source), ':')[-1] AS sourceId + WITH *, "penwidth = " + scaledNodeBorder + "; " AS directInBorder + // Split long names like inner classes identified by a dollar sign ($) + WITH *, replace(source.name, '$', '$\\n') AS sourceNameSplit + WITH *, "label = \"" + sourceNameSplit + "\\n(" + labelValue + ")\"; " AS directInLabel + WITH *, " [" + directInLabel + directInBorder + "]; " AS directInNodeProperties + WITH *, "\"" + sourceId + "\" " + directInNodeProperties AS directInNode + WITH maxWeight + ,betweennessThreshold + ,central + ,graphVizOutput + ,collect(source) AS incomingDependencyNodes + ,collect(directInNode + "\"" + sourceId + "\" -> central [" + edgeAttributes + "];") AS directInEdges + WITH *, graphVizOutput + directInEdges AS graphVizOutput +// Step 4: Query direct outgoing dependencies from the central node + MATCH (source)<-[dependency:DEPENDS_ON]-(central) + WHERE $projection_node_label IN labels(source) + AND source.incomingDependencies > 0 + ORDER BY dependency.weight DESC, source.name ASC + LIMIT 40 + WITH *, coalesce(dependency.weight25PercentInterfaces, dependency.weight, 1) AS weight + WITH *, round((toFloat(weight) / toFloat(maxWeight) * 2.5) + 0.4, 1.0) AS penWidth + WITH *, "label=" + weight + "; weight=" + weight + "; penwidth=" + penWidth AS edgeAttributes + // Use a lighter color for the target nodes of outgoing dependencies from the central node and their edges + WITH *, edgeAttributes + "; color = 5" AS edgeAttributes + WITH *, "color = 5; fillcolor = 1; " AS directOutColor + WITH *, CASE WHEN source.centralityBetweenness >= betweennessThreshold + THEN 5 ELSE 2 END AS scaledNodeBorder + WITH *, round(source.centralityBetweenness, 2) AS labelValue + // Add the last part of the element id to the node name to make it unique. 
+ WITH *, source.name + "_" + split(elementId(source), ':')[-1] AS sourceId + WITH *, "penwidth = " + scaledNodeBorder + "; " AS directOutBorder + // Split long names like inner classes identified by a dollar sign ($) + WITH *, replace(source.name, '$', '$\\n') AS sourceNameSplit + WITH *, "label = \"" + sourceNameSplit + "\\n(" + labelValue + ")\"; " AS directOutLabel + WITH *, " [" + directOutLabel + directOutBorder + directOutColor + "]; " AS directOutNodeProperties + WITH *, "\"" + sourceId + "\" " + directOutNodeProperties AS directOutNode + WITH maxWeight + ,betweennessThreshold + ,central + ,graphVizOutput + ,incomingDependencyNodes + ,collect(source) AS outgoingDependencyNodes + ,collect(directOutNode + "central -> \"" + sourceId + "\" [" + edgeAttributes + "];") AS directOutEdges + WITH *, graphVizOutput + directOutEdges AS graphVizOutput + WITH *, incomingDependencyNodes + outgoingDependencyNodes AS directDependentNodes +// Step 5: Query dependencies between direct dependencies outside the central node + UNWIND directDependentNodes AS directDependentNode + MATCH (directDependentNode)-[dependency:DEPENDS_ON]->(anotherDirectDependentNode) + WHERE anotherDirectDependentNode IN directDependentNodes + AND anotherDirectDependentNode <> directDependentNode + ORDER BY dependency.weight DESC, directDependentNode.name ASC + WITH graphVizOutput + ,directDependentNode + ,dependency + ,collect(anotherDirectDependentNode)[0] AS firstLinkedDependentNode + LIMIT 140 + WITH *, coalesce(dependency.weight25PercentInterfaces, dependency.weight, 1) AS weight + // Use a fixed small pen width for secondary dependencies for better visibility of the more important direct dependency + WITH *, "label=" + weight + "; weight=" + weight + "; penwidth=0.3" AS edgeAttributes + // Use an even lighter color for secondary dependency edges + WITH *, edgeAttributes + "; color = 3" AS edgeAttributes + // Add the last part of the element id to the node name to make it unique. 
+ WITH *, directDependentNode.name + "_" + split(elementId(directDependentNode), ':')[-1] AS directDependentNodeId + WITH *, firstLinkedDependentNode.name + "_" + split(elementId(firstLinkedDependentNode), ':')[-1] AS firstLinkedDependentNodeId + WITH *, "\"" + directDependentNodeId + "\" -> \"" + firstLinkedDependentNodeId + "\"" AS directDependenciesEdge + WITH *, collect(directDependenciesEdge + " [" + edgeAttributes + "]") AS directDependenciesEdges + WITH *, graphVizOutput + directDependenciesEdges AS graphVizOutput +UNWIND graphVizOutput AS graphVizOutputLine +RETURN DISTINCT graphVizOutputLine \ No newline at end of file diff --git a/domains/anomaly-detection/graphs/TopBridge.cypher b/domains/anomaly-detection/graphs/TopBridge.cypher new file mode 100644 index 000000000..4f2f8ea0e --- /dev/null +++ b/domains/anomaly-detection/graphs/TopBridge.cypher @@ -0,0 +1,94 @@ +// Anomaly Detection Graphs: Find top nodes marked as "Bridge" including their incoming and outgoing dependencies and output them in Graphviz format. + +// Step 1: Query overall statistics, e.g. min/max weight for later normalization + MATCH (sourceForStatistics)-[dependencyForStatistics:DEPENDS_ON]->(targetForStatistics) + WHERE $projection_node_label IN labels(sourceForStatistics) + AND $projection_node_label IN labels(targetForStatistics) + WITH max(coalesce(dependencyForStatistics.weight25PercentInterfaces, dependencyForStatistics.weight)) AS maxWeight +// Step 2: Query selected central node + MATCH (central) + WHERE $projection_node_label IN labels(central) + AND central.anomalyBridgeRank = toInteger($projection_node_rank) + WITH maxWeight + ,central + ,"Top Rank #" + $projection_node_rank + " " + $projection_language + " " + $projection_node_label + " Bridge\\n" AS graphLabel + ,coalesce(central.fqn, central.globalFqn, central.fileName, central.signature, central.name) AS targetName + WITH *, "\\n\\ndark nodes: incoming dependencies (limited max. 
40)\\n" AS graphLegend + WITH *, graphLegend + "bright nodes: outgoing dependencies (limited max. 40)\\n" AS graphLegend + WITH *, ["graph [label=\"" + graphLabel + targetName + graphLegend + "\\n\"];"] AS graphVizOutput + WITH *, "🌉 bridge #" + central.anomalyBridgeRank + "\\n" + central.name AS centralNodeLabel + WITH *, graphVizOutput + ["central [label=\"" + centralNodeLabel + "\"];"] AS graphVizOutput +// Step 3: Query direct incoming dependencies to the central node + MATCH (source)-[dependency:DEPENDS_ON]->(central) + WHERE $projection_node_label IN labels(source) + AND source.outgoingDependencies > 0 + ORDER BY dependency.weight DESC, source.name ASC + LIMIT 40 + WITH *, coalesce(dependency.weight25PercentInterfaces, dependency.weight, 1) AS weight + WITH *, round((toFloat(weight) / toFloat(maxWeight) * 2.5) + 0.4, 1.0) AS penWidth // edge width scaled relative to the heaviest dependency + WITH *, "label=" + weight + "; weight=" + weight + "; penwidth=" + penWidth AS edgeAttributes + // Add the last part of the element id to the node name to make it unique. 
+ WITH *, source.name + "_" + split(elementId(source), ':')[-1] AS sourceId + // Split long names like inner classes identified by a dollar sign ($) + WITH *, replace(source.name, '$', '$\\n') AS sourceNameSplit + WITH *, "label = \"" + sourceNameSplit + "\"; " AS directInLabel + WITH *, " [" + directInLabel + "]; " AS directInNodeProperties + WITH *, "\"" + sourceId + "\"" + directInNodeProperties AS directInNode + WITH maxWeight + ,central + ,graphVizOutput + ,collect(source) AS incomingDependencyNodes + ,collect(directInNode + "\"" + sourceId + "\" -> central [" + edgeAttributes + "];") AS directInEdges + WITH *, graphVizOutput + directInEdges AS graphVizOutput +// Step 4: Query direct outgoing dependencies from the central node + MATCH (source)<-[dependency:DEPENDS_ON]-(central) + WHERE $projection_node_label IN labels(source) + AND source.incomingDependencies > 0 + ORDER BY dependency.weight DESC, source.name ASC + LIMIT 40 + WITH *, coalesce(dependency.weight25PercentInterfaces, dependency.weight, 1) AS weight + WITH *, round((toFloat(weight) / toFloat(maxWeight) * 2.5) + 0.4, 1.0) AS penWidth + WITH *, "label=" + weight + "; weight=" + weight + "; penwidth=" + penWidth AS edgeAttributes + // Use a lighter color for the target nodes of outgoing dependencies from the central node and their edges + WITH *, edgeAttributes + "; color = 5" AS edgeAttributes + WITH *, "color = 5; fillcolor = 1; " AS directOutColor + WITH *, "\"" + source.name + "\" [color = 5; fillcolor = 1;]; " AS directOutNode + // Add the last part of the element id to the node name to make it unique. 
+ WITH *, source.name + "_" + split(elementId(source), ':')[-1] AS sourceId + // Split long names like inner classes identified by a dollar sign ($) + WITH *, replace(source.name, '$', '$\\n') AS sourceNameSplit + WITH *, "label = \"" + sourceNameSplit + "\"; " AS directOutLabel + WITH *, " [" + directOutLabel + directOutColor + "]; " AS directOutNodeProperties + WITH *, "\"" + sourceId + "\"" + directOutNodeProperties AS directOutNode + WITH maxWeight + ,central + ,graphVizOutput + ,incomingDependencyNodes + ,collect(source) AS outgoingDependencyNodes + ,collect(directOutNode + "central -> \"" + sourceId + "\" [" + edgeAttributes + "];") AS directOutEdges + WITH *, graphVizOutput + directOutEdges AS graphVizOutput + WITH *, incomingDependencyNodes + outgoingDependencyNodes AS directDependentNodes +// Step 5: Query dependencies between direct dependencies outside the central node + UNWIND directDependentNodes AS directDependentNode + MATCH (directDependentNode)-[dependency:DEPENDS_ON]->(anotherDirectDependentNode) + WHERE anotherDirectDependentNode IN directDependentNodes + AND anotherDirectDependentNode <> directDependentNode + ORDER BY dependency.weight DESC, directDependentNode.name ASC + WITH graphVizOutput + ,directDependentNode + ,dependency + ,collect(anotherDirectDependentNode)[0] AS firstLinkedDependentNode + LIMIT 140 + WITH *, coalesce(dependency.weight25PercentInterfaces, dependency.weight, 1) AS weight + // Use a fixed small pen width for secondary dependencies for better visibility of the more important direct dependency + WITH *, "label=" + weight + "; weight=" + weight + "; penwidth=0.3" AS edgeAttributes + // Use an even lighter color for secondary dependency edges + WITH *, edgeAttributes + "; color = 3" AS edgeAttributes + // Add the last part of the element id to the node name to make it unique. 
+ WITH *, directDependentNode.name + "_" + split(elementId(directDependentNode), ':')[-1] AS directDependentNodeId + WITH *, firstLinkedDependentNode.name + "_" + split(elementId(firstLinkedDependentNode), ':')[-1] AS firstLinkedDependentNodeId + WITH *, "\"" + directDependentNodeId + "\" -> \"" + firstLinkedDependentNodeId + "\"" AS directDependenciesEdge + WITH *, collect(directDependenciesEdge + " [" + edgeAttributes + "]") AS directDependenciesEdges + WITH *, graphVizOutput + directDependenciesEdges AS graphVizOutput +UNWIND graphVizOutput AS graphVizOutputLine +RETURN DISTINCT graphVizOutputLine \ No newline at end of file diff --git a/domains/anomaly-detection/graphs/TopCentral.template.gv b/domains/anomaly-detection/graphs/TopCentral.template.gv new file mode 100644 index 000000000..6fdd6fa71 --- /dev/null +++ b/domains/anomaly-detection/graphs/TopCentral.template.gv @@ -0,0 +1,22 @@ +// This is a GraphViz dot template file for the visualization of a anomaly archetype graphs with a selected central node. +// The main part of the template is marked by the comments "Begin-Template" and "End-Template". +// It also contains a simple example graph. 
+// +strict digraph top_sized_template { + //Begin-Template + graph [layout = "fdp"; start = "7"; splines = "spline"; pad = "0.8,0.1"; outputorder = "edgesfirst";]; + graph [fontname = "Helvetica,Arial,sans-serif"; labelloc = "t";]; + node [colorscheme = "bugn9"; color = 6; fillcolor = 3;]; + edge [colorscheme = "bugn9"; color = 7;]; + node [fontsize = 12; style = "filled"; margin = "0.03,0.03"; width="1.3"; height="0.9";]; + edge [fontsize = 4; arrowsize = "0.4";]; + + central [shape = "doublecircle"; height = "2.0"; margin = "0.000001,0.000001";]; + central [fontsize = 16;]; + central [color = 7; fillcolor = 5; penwidth = 3;]; + + //End-Template + "A" -> "central" [penwidth = 1.0; label = 1;]; + "A" -> "B" [penwidth = 3.0; label = 4;]; + "B" -> "central" [penwidth = 2.0; label = 2;]; +} \ No newline at end of file diff --git a/domains/anomaly-detection/graphs/TopHub.cypher b/domains/anomaly-detection/graphs/TopHub.cypher new file mode 100644 index 000000000..aa9f971eb --- /dev/null +++ b/domains/anomaly-detection/graphs/TopHub.cypher @@ -0,0 +1,72 @@ +// Anomaly Detection Graphs: Find top nodes marked as "Hub" including their incoming dependencies and output them in Graphviz format. + +// Step 1: Query overall statistics, e.g. 
min/max weight for later normalization + MATCH (sourceForStatistics)-[dependencyForStatistics:DEPENDS_ON]->(targetForStatistics) + WHERE $projection_node_label IN labels(sourceForStatistics) + AND $projection_node_label IN labels(targetForStatistics) + WITH max(coalesce(dependencyForStatistics.weight25PercentInterfaces, dependencyForStatistics.weight)) AS maxWeight + ,percentileDisc(sourceForStatistics.communityLocalClusteringCoefficient, 0.10) AS localClusteringCoefficientLowThreshold +// Step 2: Query selected central node + MATCH (central) + WHERE $projection_node_label IN labels(central) + AND central.anomalyHubRank = toInteger($projection_node_rank) + WITH maxWeight + ,localClusteringCoefficientLowThreshold + ,central + ,"Top Rank #" + $projection_node_rank + " " + $projection_language + " " + $projection_node_label + " Hub\\n" AS graphLabel + ,coalesce(central.fqn, central.globalFqn, central.fileName, central.signature, central.name) AS targetName + WITH *, "\\n\\nnodes: incoming dependencies (limited max. 
50)\\n" AS graphLegend + WITH *, graphLegend + "node value: local clustering coefficient\\n" AS graphLegend + WITH *, graphLegend + "thick outline: < 10% percentile of local clustering coefficient\\n" AS graphLegend + WITH *, ["graph [label=\"" + graphLabel + targetName + graphLegend + "\\n\"];"] AS graphVizOutput + WITH *, "🎡 hub #" + central.anomalyHubRank + "\\n" + central.name AS centralNodeLabel + WITH *, centralNodeLabel + "\\n(in-degree=" + central.incomingDependencies + ")" AS centralNodeLabel + WITH *, graphVizOutput + ["central [label=\"" + centralNodeLabel + "\"];"] AS graphVizOutput +// Step 3: Query direct incoming dependencies to the central node + MATCH (source)-[dependency:DEPENDS_ON]->(central) + WHERE $projection_node_label IN labels(source) + AND source.outgoingDependencies > 0 + ORDER BY dependency.weight DESC, source.name ASC + LIMIT 50 + WITH *, coalesce(dependency.weight25PercentInterfaces, dependency.weight, 1) AS weight + WITH *, round((toFloat(weight) / toFloat(maxWeight) * 2.5) + 0.4, 1.0) AS penWidth // edge width scaled relative to the heaviest dependency + WITH *, "label=" + weight + "; weight=" + weight + "; penwidth=" + penWidth AS edgeAttributes + WITH *, CASE WHEN source.communityLocalClusteringCoefficient <= localClusteringCoefficientLowThreshold + THEN 5 ELSE 2 END AS scaledNodeBorder + WITH *, round(source.communityLocalClusteringCoefficient, 2) AS labelValue + // Add the last part of the element id to make the node name unique, even if the name itself isn't. 
+ WITH *, source.name + "_" + split(elementId(source), ':')[-1] AS sourceId + // Split long names like inner classes identified by a dollar sign ($) + WITH *, "penwidth = " + scaledNodeBorder + "; " AS directInBorder + WITH *, replace(source.name, '$', '$\\n') AS sourceNameSplit + WITH *, "label = \"" + sourceNameSplit + "\\n(" + labelValue + ")\"; " AS directInLabel + WITH *, " [" + directInLabel + directInBorder + "]; " AS directInNodeProperties + WITH *, "\"" + sourceId + "\" " + directInNodeProperties AS directInNode + WITH maxWeight + ,central + ,graphVizOutput + ,collect(source) AS directDependentNodes + ,collect(directInNode + "\"" + sourceId + "\" -> central [" + edgeAttributes + "];") AS directInEdges + WITH *, graphVizOutput + directInEdges AS graphVizOutput +// Step 4: Query dependencies between direct dependencies outside the central node + UNWIND directDependentNodes AS directDependentNode + MATCH (directDependentNode)-[dependency:DEPENDS_ON]->(anotherDirectDependentNode) + WHERE anotherDirectDependentNode IN directDependentNodes + AND anotherDirectDependentNode <> directDependentNode + ORDER BY dependency.weight DESC, directDependentNode.name ASC + WITH graphVizOutput + ,directDependentNode + ,dependency + ,collect(anotherDirectDependentNode)[0] AS firstLinkedDependentNode + LIMIT 140 + WITH *, coalesce(dependency.weight25PercentInterfaces, dependency.weight, 1) AS weight +// Use a fixed small pen width for secondary dependencies for better visibility of the more important direct dependency + WITH *, "weight=" + weight + "; penwidth=0.2" AS edgeAttributes + // Add the last part of the element id to the node name to make it unique. 
+ WITH *, directDependentNode.name + "_" + split(elementId(directDependentNode), ':')[-1] AS directDependentNodeId + WITH *, firstLinkedDependentNode.name + "_" + split(elementId(firstLinkedDependentNode), ':')[-1] AS firstLinkedDependentNodeId + WITH *, "\"" + directDependentNodeId + "\" -> \"" + firstLinkedDependentNodeId + "\"" AS directDependenciesEdge + WITH *, collect(directDependenciesEdge + " [" + edgeAttributes + "]") AS directDependenciesEdges + WITH *, graphVizOutput + directDependenciesEdges AS graphVizOutput +UNWIND graphVizOutput AS graphVizOutputLine +RETURN DISTINCT graphVizOutputLine \ No newline at end of file diff --git a/domains/anomaly-detection/graphs/TopOutlier.cypher b/domains/anomaly-detection/graphs/TopOutlier.cypher new file mode 100644 index 000000000..9b72085d3 --- /dev/null +++ b/domains/anomaly-detection/graphs/TopOutlier.cypher @@ -0,0 +1,114 @@ +// Anomaly Detection Graphs: Find top nodes marked as "Outlier" including their incoming and outgoing dependencies and sizes based on cluster confidence and output them in Graphviz format. + +// Step 1: Query overall statistics, e.g. 
min/max weight for later normalization + MATCH (sourceForStatistics)-[dependencyForStatistics:DEPENDS_ON]->(targetForStatistics) + WHERE $projection_node_label IN labels(sourceForStatistics) + AND $projection_node_label IN labels(targetForStatistics) + WITH max(coalesce(dependencyForStatistics.weight25PercentInterfaces, dependencyForStatistics.weight)) AS maxWeight + ,percentileDisc(sourceForStatistics.clusteringHDBSCANNormalizedDistanceToMedoid, 0.80) AS clusterMedoidDistanceThreshold + ,percentileDisc(sourceForStatistics.clusteringHDBSCANProbability, 0.25) AS clusterProbabilityLowThreshold +// Step 2: Query selected central node + MATCH (central) + WHERE $projection_node_label IN labels(central) + AND central.anomalyOutlierRank = toInteger($projection_node_rank) + WITH maxWeight + ,clusterMedoidDistanceThreshold + ,clusterProbabilityLowThreshold + ,central + ,"Top Rank #" + $projection_node_rank + " " + $projection_language + " " + $projection_node_label + " Outlier\\n" AS graphLabel + ,coalesce(central.fqn, central.globalFqn, central.fileName, central.signature, central.name) AS targetName + WITH *, "\\n\\ndark nodes: incoming dependencies (limited max. 40)\\n" AS graphLegend + WITH *, graphLegend + "bright nodes: outgoing dependencies (limited max. 
40)\\n" AS graphLegend + WITH *, graphLegend + "node value: cluster probability\\n" AS graphLegend + WITH *, graphLegend + "large circle: < 25% percentile of cluster probability\\n" AS graphLegend + WITH *, graphLegend + "thick outline: > 80% percentile of distance to cluster center\\n" AS graphLegend + WITH *, ["graph [label=\"" + graphLabel + targetName + graphLegend + "\\n\"];"] AS graphVizOutput + WITH *, "๐Ÿงฉ outlier #" + central.anomalyOutlierRank + "\\n" + central.name AS centralNodeLabel + WITH *, graphVizOutput + ["central [label=\"" + centralNodeLabel + "\"];"] AS graphVizOutput +// Step 3: Query direct incoming dependencies to the central node + MATCH (source)-[dependency:DEPENDS_ON]->(central) + WHERE $projection_node_label IN labels(source) + AND source.outgoingDependencies > 0 + ORDER BY dependency.weight DESC, source.name ASC + LIMIT 40 + WITH *, coalesce(dependency.weight25PercentInterfaces, dependency.weight, 1) AS weight + WITH *, round((toFloat(weight) / toFloat(maxWeight) * 2.5) + 0.4, 1.0) AS penWidth + WITH *, "label=" + weight + "; weight=" + weight + "; penwidth=" + penWidth AS edgeAttributes + WITH *, CASE WHEN source.clusteringHDBSCANNormalizedDistanceToMedoid >= clusterMedoidDistanceThreshold + THEN 5 ELSE 2 END AS scaledNodeBorder + WITH *, CASE WHEN source.clusteringHDBSCANProbability <= clusterProbabilityLowThreshold + THEN "shape = \"circle\"; height=1.8; " ELSE "" END AS nodeEmphasis + WITH *, round(source.clusteringHDBSCANProbability * 100.0, 2) + "%" AS labelValue + // Add the last part of the element id to the node name to make it unique. 
+ WITH *, source.name + "_" + split(elementId(source), ':')[-1] AS sourceId + WITH *, "penwidth = " + scaledNodeBorder + "; " AS directInBorder + // Split long names like inner classes identified by a dollar sign ($) + WITH *, replace(source.name, '$', '$\\n') AS sourceNameSplit + WITH *, "label = \"" + sourceNameSplit + "\\n(" + labelValue + ")\"; " AS directInLabel + WITH *, " [" + nodeEmphasis + directInLabel + directInBorder + "]; " AS directInNodeProperties + WITH *, "\"" + sourceId + "\" " + directInNodeProperties AS directInNode + WITH maxWeight + ,clusterMedoidDistanceThreshold + ,clusterProbabilityLowThreshold + ,central + ,graphVizOutput + ,collect(source) AS incomingDependencyNodes + ,collect(directInNode + "\"" + sourceId + "\" -> central [" + edgeAttributes + "];") AS directInEdges + WITH *, graphVizOutput + directInEdges AS graphVizOutput +// Step 4: Query direct outgoing dependencies from the central node + MATCH (source)<-[dependency:DEPENDS_ON]-(central) + WHERE $projection_node_label IN labels(source) + AND source.incomingDependencies > 0 + ORDER BY dependency.weight DESC, source.name ASC + LIMIT 40 + WITH *, coalesce(dependency.weight25PercentInterfaces, dependency.weight, 1) AS weight + WITH *, round((toFloat(weight) / toFloat(maxWeight) * 2.5) + 0.4, 1.0) AS penWidth + WITH *, "label=" + weight + "; weight=" + weight + "; penwidth=" + penWidth AS edgeAttributes + // Use a lighter color for the target nodes of outgoing dependencies from the central node and their edges + WITH *, edgeAttributes + "; color = 5" AS edgeAttributes + WITH *, "color = 5; fillcolor = 1; " AS directOutColor + WITH *, CASE WHEN source.clusteringHDBSCANNormalizedDistanceToMedoid >= clusterMedoidDistanceThreshold + THEN 5 ELSE 2 END AS scaledNodeBorder + WITH *, CASE WHEN source.clusteringHDBSCANProbability <= clusterProbabilityLowThreshold + THEN "shape = \"circle\"; height=1.8; " ELSE "" END AS nodeEmphasis + WITH *, round(source.clusteringHDBSCANProbability * 100.0, 2) 
+ "%" AS labelValue + // Add the last part of the element id to the node name to make it unique. + WITH *, source.name + "_" + split(elementId(source), ':')[-1] AS sourceId + WITH *, "penwidth = " + scaledNodeBorder + "; " AS directOutBorder + // Split long names like inner classes identified by a dollar sign ($) + WITH *, replace(source.name, '$', '$\\n') AS sourceNameSplit + WITH *, "label = \"" + sourceNameSplit + "\\n(" + labelValue + ")\"; " AS directOutLabel + WITH *, " [" + nodeEmphasis + directOutLabel + directOutBorder + directOutColor + "]; " AS directOutNodeProperties + WITH *, "\"" + sourceId + "\" " + directOutNodeProperties AS directOutNode + WITH maxWeight + ,central + ,graphVizOutput + ,incomingDependencyNodes + ,collect(source) AS outgoingDependencyNodes + ,collect(directOutNode + "central -> \"" + sourceId + "\" [" + edgeAttributes + "];") AS directOutEdges + WITH *, graphVizOutput + directOutEdges AS graphVizOutput + WITH *, incomingDependencyNodes + outgoingDependencyNodes AS directDependentNodes +// Step 5: Query dependencies between direct dependencies outside the central node + UNWIND directDependentNodes AS directDependentNode + MATCH (directDependentNode)-[dependency:DEPENDS_ON]->(anotherDirectDependentNode) + WHERE anotherDirectDependentNode IN directDependentNodes + AND anotherDirectDependentNode <> directDependentNode + ORDER BY dependency.weight DESC, directDependentNode.name ASC + WITH graphVizOutput + ,directDependentNode + ,dependency + ,collect(anotherDirectDependentNode)[0] AS firstLinkedDependentNode + LIMIT 140 + WITH *, coalesce(dependency.weight25PercentInterfaces, dependency.weight, 1) AS weight + // Use a fixed small pen width for secondary dependencies for better visibility of the more important direct dependency + WITH *, "label=" + weight + "; weight=" + weight + "; penwidth=0.3" AS edgeAttributes + // Use an even lighter color for secondary dependency edges + WITH *, edgeAttributes + "; color = 3" AS edgeAttributes + // 
Add the last part of the element id to the node name to make it unique. + WITH *, directDependentNode.name + "_" + split(elementId(directDependentNode), ':')[-1] AS directDependentNodeId + WITH *, firstLinkedDependentNode.name + "_" + split(elementId(firstLinkedDependentNode), ':')[-1] AS firstLinkedDependentNodeId + WITH *, "\"" + directDependentNodeId + "\" -> \"" + firstLinkedDependentNodeId + "\"" AS directDependenciesEdge + WITH *, collect(directDependenciesEdge + " [" + edgeAttributes + "]") AS directDependenciesEdges + WITH *, graphVizOutput + directDependenciesEdges AS graphVizOutput +UNWIND graphVizOutput AS graphVizOutputLine +RETURN DISTINCT graphVizOutputLine \ No newline at end of file diff --git a/domains/anomaly-detection/graphs/anomalyDetectionGraphVisualization.sh b/domains/anomaly-detection/graphs/anomalyDetectionGraphVisualization.sh new file mode 100755 index 000000000..b7f62aea4 --- /dev/null +++ b/domains/anomaly-detection/graphs/anomalyDetectionGraphVisualization.sh @@ -0,0 +1,224 @@ +#!/usr/bin/env bash + +# Executes selected anomaly detection Cypher queries for GraphViz visualization. +# Visualizes top ranked anomaly archetypes. +# Requires an already running Neo4j graph database with already scanned and analyzed artifacts. +# The reports (csv, dot and svg files) will be written into the sub directory reports/anomaly-detection/{language}_{codeUnit}. + +# Requires executeQueryFunctions.sh, visualizeQueryResults.sh, cleanupAfterReportGeneration.sh + +# Fail on any error ("-e" = exit on first error, "-o pipefail" exist on errors within piped commands) +set -o errexit -o pipefail + +# Overrideable Constants (defaults also defined in sub scripts) +REPORTS_DIRECTORY=${REPORTS_DIRECTORY:-"reports"} + +## Get this "scripts/reports" directory if not already set +# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution. 
# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
# This way non-standard tools like readlink aren't needed.
ANOMALY_DETECTION_GRAPHS_DIR=${ANOMALY_DETECTION_GRAPHS_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )}
#echo "anomalyDetectionGraphVisualization: ANOMALY_DETECTION_GRAPHS_DIR=${ANOMALY_DETECTION_GRAPHS_DIR}"

# Get the "scripts" directory by taking the directory of this script and going three directories up
# (see "../../../scripts" below), unless SCRIPTS_DIR is already set by the caller.
SCRIPTS_DIR=${SCRIPTS_DIR:-"${ANOMALY_DETECTION_GRAPHS_DIR}/../../../scripts"} # Repository directory containing the shell scripts

# echo "anomalyDetectionGraphVisualization: SCRIPTS_DIR=${SCRIPTS_DIR}"
# Get the "cypher" query directory for gathering features (sibling directory "features" of this script's directory).
ANOMALY_DETECTION_FEATURE_CYPHER_DIR=${ANOMALY_DETECTION_FEATURE_CYPHER_DIR:-"${ANOMALY_DETECTION_GRAPHS_DIR}/../features"}

# Get the "scripts/visualization" directory.
VISUALIZATION_SCRIPTS_DIR=${VISUALIZATION_SCRIPTS_DIR:-"${SCRIPTS_DIR}/visualization"} # Repository directory containing the shell scripts for visualization
# echo "anomalyDetectionGraphVisualization: VISUALIZATION_SCRIPTS_DIR=${VISUALIZATION_SCRIPTS_DIR}"

MARKDOWN_INCLUDES_DIRECTORY=${MARKDOWN_INCLUDES_DIRECTORY:-"includes"}
ANOMALY_DETECTION_TOP_N_GRAPHS=${ANOMALY_DETECTION_TOP_N_GRAPHS:-5} # Number of top ranked graphs to visualize per query for anomaly detection.

# Define functions to execute cypher queries from within a given file
source "${SCRIPTS_DIR}/executeQueryFunctions.sh"

# Query or recalculate features.
#
# Required Parameters:
# - projection_name=...
#   Name prefix for the in-memory projection name. Example: "package-anomaly-detection"
# - projection_node_label=...
#   Label of the nodes that will be used for the projection. Example: "Package"
# - projection_weight_property=...
#   Name of the node property that contains the dependency weight.
#   Example: "weight"
set_required_features() {
  # Logs which node label is processed and then makes sure that the normalized
  # difference between Page Rank and Article Rank is available.
  # All given "key=value" parameters are passed through unchanged to the queries.
  local node_label
  node_label=$(extractQueryParameter "projection_node_label" "${@}")

  printf '%s\n' "anomalyDetectionGraphVisualization: $(date +'%Y-%m-%dT%H:%M:%S%z') Collecting features for ${node_label} nodes..."

  # TODO Create missing projection (with all that comes with it) for the sake of self containment or assume Page/Article Rank?
  # Determine the page rank if not already done
  #execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageRank-Exists.cypher" \
  #                                     "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageRank-Write.cypher" "${@}"
  # Determine the article rank if not already done
  #execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-ArticleRank-Exists.cypher" \
  #                                     "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-ArticleRank-Write.cypher" "${@}"

  # Determine the normalized difference between Page Rank and Article Rank if not already done.
  # NOTE(review): presumably the "-Write" query only runs when the "-Exists" query returns no results; confirm in executeQueryFunctions.sh.
  execute_cypher_queries_until_results \
    "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageToArticleRank-Exists.cypher" \
    "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-PageToArticleRank-Write.cypher" \
    "${@}"
}

# Creates or updates the markdown file (include for main summary) that contains the references to all graph visualizations.
#
# Required Parameters:
# - projection_language=...
#   Name of the associated programming language. Examples: "Java", "Typescript"
# - projection_node_label=...
#   Label of the nodes that will be used for the projection. Example: "Package"
# - report_name=...
#   Name of the query and then also the resulting visualization file.
# - index=...
#   Index of visualization plot.
update_markdown_references() {
  # Appends the image reference for one graph visualization to the markdown
  # reference file of the given language/node label combination.
  # Reads the globals FULL_REPORT_DIRECTORY, GRAPH_VISUALIZATIONS_DIRECTORY_NAME
  # and MARKDOWN_REFERENCE_FILE_NAME.
  local language node_label report_name index
  language=$(extractQueryParameter "projection_language" "${@}")
  node_label=$(extractQueryParameter "projection_node_label" "${@}")
  report_name=$(extractQueryParameter "report_name" "${@}")
  index=$(extractQueryParameter "index" "${@}")

  local report_directory_name="${language}_${node_label}"
  local reference_file="${FULL_REPORT_DIRECTORY}/${report_directory_name}/${GRAPH_VISUALIZATIONS_DIRECTORY_NAME}/${MARKDOWN_REFERENCE_FILE_NAME}"

  if [[ -f "${reference_file}" ]]; then
    # The file already has content: separate the next entry with a horizontal rule.
    printf '%s\n\n' "---" >> "${reference_file}"
  else
    # First entry: start the file with the section title.
    echo "anomalyDetectionGraphVisualization: $(date +'%Y-%m-%dT%H:%M:%S%z') Creating ${reference_file}..."
    printf '%s\n\n' "#### Graph Visualizations" >> "${reference_file}"
  fi

  # The first visualization of a report gets its own sub heading.
  if [[ "${index}" == "1" ]]; then
    printf '%s\n\n' "##### ${report_name} Graph Visualizations" >> "${reference_file}"
  fi

  # Image link relative to the report directory that contains the "<language>_<nodeLabel>" directories.
  printf '%s\n\n' "![${report_name} ${index}](./${report_directory_name}/${GRAPH_VISUALIZATIONS_DIRECTORY_NAME}/${report_name}${index}.svg)" >> "${reference_file}"
}

# Runs a parametrized query, converts their results in GraphViz format and creates a Graph visualization.
# Outputs at most ANOMALY_DETECTION_TOP_N_GRAPHS (default: 5) indexed files (for report_name="TopHub" then TopHub1, TopHub2,...)
# with a focused visualization of one selected node and its surroundings.
#
# Required Parameters:
# - report_name=...
#   Name of the query and then also the resulting visualization file.
# - template_name=...
#   Name of the GraphViz template gv file.
# - projection_language=...
#   Name of the associated programming language.
#   Examples: "Java", "Typescript"
# - projection_node_label=...
#   Label of the nodes that will be used for the projection. Example: "Package"
#
# For every rank from 1 to ANOMALY_DETECTION_TOP_N_GRAPHS the query "<report_name>.cypher" is executed
# with the additional parameter "projection_node_rank=<rank>". The loop stops at the first rank without a result.
# Globals read: FULL_REPORT_DIRECTORY, GRAPH_VISUALIZATIONS_DIRECTORY_NAME, ANOMALY_DETECTION_GRAPHS_DIR,
#               ANOMALY_DETECTION_TOP_N_GRAPHS, SCRIPTS_DIR, VISUALIZATION_SCRIPTS_DIR
create_graph_visualization() {
  local nodeLabel
  nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )

  local language
  language=$( extractQueryParameter "projection_language" "${@}" )

  local report_name
  report_name=$( extractQueryParameter "report_name" "${@}" )

  local template_name
  template_name=$( extractQueryParameter "template_name" "${@}" )

  echo "anomalyDetectionGraphVisualization: $(date +'%Y-%m-%dT%H:%M:%S%z') Creating ${language} ${nodeLabel} ${report_name} visualizations with template ${template_name}..."

  local detail_report_directory_name="${language}_${nodeLabel}"
  local detail_report_directory="${FULL_REPORT_DIRECTORY}/${detail_report_directory_name}/${GRAPH_VISUALIZATIONS_DIRECTORY_NAME}"
  mkdir -p "${detail_report_directory}"

  # Remove the indexed results of previous runs (e.g. TopHub1.svg, graphviz/TopHub1.gv).
  # Otherwise, stale files with a higher index would survive when the loop below stops earlier than last time.
  # The glob has to be outside of the double quotes to be expanded. Without a match, "rm -f" ignores the literal pattern.
  rm -rf -- "${detail_report_directory}/${report_name}"[0-9]* \
            "${detail_report_directory}/graphviz/${report_name}"[0-9]*

  # Keep the loop variables local so that they don't leak into the global scope of the script.
  local index
  local resultFileName
  local queryResultFile
  for ((index = 1; index <= ANOMALY_DETECTION_TOP_N_GRAPHS; index++)); do
    # Query Graph data
    resultFileName="${detail_report_directory}/${report_name}${index}"
    queryResultFile="${resultFileName}.csv"
    # Best effort: a failing query must not abort the whole script. It leads to an empty result file that is handled below.
    execute_cypher "${ANOMALY_DETECTION_GRAPHS_DIR}/${report_name}.cypher" "${@}" "projection_node_rank=${index}" > "${queryResultFile}" || true

    # Remove empty files
    # Note: Afterwards, detail_report_directory might be deleted as well.
    #       In that case the image generation is finished and the loop needs to be terminated.
    source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${detail_report_directory}"
    # Stop generation as soon as the first query result is empty or the directory is deleted.
    if [[ ! -f "${queryResultFile}" ]]; then
      break
    fi

    # Generate svg image using GraphViz
    source "${VISUALIZATION_SCRIPTS_DIR}/visualizeQueryResults.sh" "${queryResultFile}" --template "${ANOMALY_DETECTION_GRAPHS_DIR}/${template_name}.template.gv"

    # Clean up after graph visualization image generation:
    rm -rf "${queryResultFile}" # Remove query result
    # Collect graphviz files in a "graphviz" sub directory
    mkdir -p "${detail_report_directory}/graphviz"
    mv -f "${resultFileName}.gv" "${detail_report_directory}/graphviz"

    update_markdown_references "${@}" "index=${index}"
  done
}

# Run queries, outputs their results in GraphViz format and create Graph visualizations.
# Creates the visualizations for all archetypes (Hub, Bottleneck, Authority, Bridge, Outlier) using the "TopCentral" template.
#
# Required Parameters:
# - projection_language=...
#   Name of the associated programming language. Examples: "Java", "Typescript"
# - projection_node_label=...
#   Label of the nodes that will be used for the projection. Example: "Package"
anomaly_detection_graph_visualization() {
  set_required_features "${@}"

  local archetype_report_name
  for archetype_report_name in TopHub TopBottleneck TopAuthority TopBridge TopOutlier; do
    create_graph_visualization "report_name=${archetype_report_name}" "template_name=TopCentral" "${@}"
  done
}


# Create report directory
REPORT_NAME="anomaly-detection"
FULL_REPORT_DIRECTORY="${REPORTS_DIRECTORY}/${REPORT_NAME}"
mkdir -p "${FULL_REPORT_DIRECTORY}"

GRAPH_VISUALIZATIONS_DIRECTORY_NAME="GraphVisualizations"

MARKDOWN_REFERENCE_FILE_NAME="GraphVisualizationsReferenceForSummary.md"
# Delete all markdown reference files so that they can be written by appending lines.
# Wildcards/Globs need to be outside of double quotes.
+rm -rfv "${FULL_REPORT_DIRECTORY}"/*_*/${GRAPH_VISUALIZATIONS_DIRECTORY_NAME}/${MARKDOWN_REFERENCE_FILE_NAME} + +# Query Parameter key pairs for projection and algorithm side +QUERY_NODE="projection_node_label" +QUERY_LANGUAGE="projection_language" + +# -- Detail Reports for each code type ------------------------------- + +anomaly_detection_graph_visualization "${QUERY_NODE}=Artifact" "${QUERY_LANGUAGE}=Java" +anomaly_detection_graph_visualization "${QUERY_NODE}=Package" "${QUERY_LANGUAGE}=Java" +anomaly_detection_graph_visualization "${QUERY_NODE}=Type" "${QUERY_LANGUAGE}=Java" +anomaly_detection_graph_visualization "${QUERY_NODE}=Module" "${QUERY_LANGUAGE}=Typescript" + +# --------------------------------------------------------------- + +echo "anomalyDetectionGraphVisualization: $(date +'%Y-%m-%dT%H:%M:%S%z') Successfully finished." \ No newline at end of file diff --git a/domains/anomaly-detection/labels/AnomalyDetectionArchetypeAuthority.cypher b/domains/anomaly-detection/labels/AnomalyDetectionArchetypeAuthority.cypher index efd080462..8897a2771 100644 --- a/domains/anomaly-detection/labels/AnomalyDetectionArchetypeAuthority.cypher +++ b/domains/anomaly-detection/labels/AnomalyDetectionArchetypeAuthority.cypher @@ -11,24 +11,10 @@ ,min(codeUnit.centralityArticleRank) AS minArticleRank ,max(codeUnit.centralityArticleRank) AS maxArticleRank ,percentileDisc(codeUnit.centralityPageRank, 0.90) AS pageRankThreshold + ,percentileDisc(codeUnit.centralityPageRankToArticleRankDifference, 0.90) AS pageToArticleRankDifferenceThreshold UNWIND codeUnits AS codeUnit WITH * - WHERE codeUnit.centralityPageRank >= pageRankThreshold - WITH * - ,(codeUnit.centralityPageRank - minPageRank) / (maxPageRank - minPageRank) AS normalizedPageRank - ,(codeUnit.centralityArticleRank - minArticleRank) / (maxArticleRank - minArticleRank) AS normalizedArticleRank - WITH * - ,normalizedPageRank - normalizedArticleRank AS normalizedPageRankToArticleRankDifference - WITH collect(codeUnit) AS 
codeUnits - ,minPageRank, maxPageRank, minArticleRank, maxArticleRank - ,percentileDisc(normalizedPageRankToArticleRankDifference, 0.90) AS pageToArticleRankDifferenceThreshold - UNWIND codeUnits AS codeUnit - WITH * - ,(codeUnit.centralityPageRank - minPageRank) / (maxPageRank - minPageRank) AS normalizedPageRank - ,(codeUnit.centralityArticleRank - minArticleRank) / (maxArticleRank - minArticleRank) AS normalizedArticleRank - WITH * - ,normalizedPageRank - normalizedArticleRank AS normalizedPageRankToArticleRankDifference - WHERE normalizedPageRankToArticleRankDifference >= pageToArticleRankDifferenceThreshold + WHERE codeUnit.centralityPageRankToArticleRankDifference >= pageToArticleRankDifferenceThreshold OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit) WITH *, artifact.name AS artifactName OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit) @@ -36,14 +22,11 @@ OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS WITH *, coalesce(artifactName, projectName) AS projectName ORDER BY codeUnit.centralityPageRank DESC, codeUnit.centralityArticleRank ASC LIMIT 10 - WITH collect([codeUnit, projectName, normalizedPageRank, normalizedArticleRank, normalizedPageRankToArticleRankDifference]) AS results - UNWIND range(0, size(results) - 1) AS codeUnitIndex - WITH codeUnitIndex + 1 AS codeUnitIndex - ,results[codeUnitIndex][0] AS codeUnit - ,results[codeUnitIndex][1] AS projectName - ,results[codeUnitIndex][2] AS normalizedPageRank - ,results[codeUnitIndex][3] AS normalizedArticleRank - ,results[codeUnitIndex][4] AS normalizedPageRankToArticleRankDifference + WITH collect([codeUnit, projectName]) AS results + UNWIND range(0, size(results) - 1) AS codeUnitIndex + WITH codeUnitIndex + 1 AS codeUnitIndex + ,results[codeUnitIndex][0] AS codeUnit + ,results[codeUnitIndex][1] AS projectName SET codeUnit:Mark4TopAnomalyAuthority ,codeUnit.anomalyAuthorityRank = codeUnitIndex RETURN DISTINCT @@ 
-52,7 +35,5 @@ OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS ,coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName ,codeUnit.centralityPageRank AS pageRank ,codeUnit.centralityArticleRank AS articleRank - ,codeUnit.anomalyAuthorityRank AS rank - ,normalizedPageRank - ,normalizedArticleRank - ,normalizedPageRankToArticleRankDifference \ No newline at end of file + ,codeUnit.centralityPageRankToArticleRankDifference AS normalizedPageRankToArticleRankDifference + ,codeUnit.anomalyAuthorityRank AS rank \ No newline at end of file diff --git a/domains/anomaly-detection/summary/anomalyDetectionSummary.sh b/domains/anomaly-detection/summary/anomalyDetectionSummary.sh index 852b400dd..a7659df03 100755 --- a/domains/anomaly-detection/summary/anomalyDetectionSummary.sh +++ b/domains/anomaly-detection/summary/anomalyDetectionSummary.sh @@ -15,7 +15,7 @@ set -o errexit -o pipefail # Overrideable Constants (defaults also defined in sub scripts) REPORTS_DIRECTORY=${REPORTS_DIRECTORY:-"reports"} -MARKDOWN_INCLUDES_DIRECTORY=${MARKDOWN_INCLUDES_DIRECTORY:-"includes"} +MARKDOWN_INCLUDES_DIRECTORY=${MARKDOWN_INCLUDES_DIRECTORY:-"includes"} # Subdirectory that contains Markdown files to be included by the Markdown template for the report. ## Get this "domains/anomaly-detection/summary" directory if not already set # Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution. @@ -26,7 +26,6 @@ ANOMALY_DETECTION_SUMMARY_DIR=${ANOMALY_DETECTION_SUMMARY_DIR:-$(CDPATH=. cd -- # Get the "scripts" directory by taking the path of this script and going one directory up. 
SCRIPTS_DIR=${SCRIPTS_DIR:-"${ANOMALY_DETECTION_SUMMARY_DIR}/../../../scripts"} # Repository directory containing the shell scripts -MARKDOWN_INCLUDES_DIRECTORY="includes" MARKDOWN_SCRIPTS_DIR=${MARKDOWN_SCRIPTS_DIR:-"${SCRIPTS_DIR}/markdown"} #echo "anomalyDetectionSummary: MARKDOWN_SCRIPTS_DIR=${MARKDOWN_SCRIPTS_DIR}" >&2 @@ -76,7 +75,11 @@ anomaly_detection_deep_dive_report() { echo "### 2.${report_number} ${language} ${nodeLabel}" > "${detail_report_include_directory}/DeepDiveSectionTitle.md" echo "" > "${detail_report_include_directory}/empty.md" cp -f "${ANOMALY_DETECTION_SUMMARY_DIR}/report_no_anomaly_detection_data.template.md" "${detail_report_include_directory}/report_no_anomaly_detection_data.template.md" + cp -f "${ANOMALY_DETECTION_SUMMARY_DIR}/report_no_anomaly_detection_graphs.template.md" "${detail_report_include_directory}/report_no_anomaly_detection_graphs.template.md" + + # Copy cp -f "${detail_report_directory}/Top_anomaly_features.md" "${detail_report_include_directory}" || true + cp -f "${detail_report_directory}/GraphVisualizations/GraphVisualizationsReferenceForSummary.md" "${detail_report_include_directory}/GraphVisualizationsReference.md" || true # Assemble Markdown-Includes containing plots depending on their availability (fallback empty.md) if [ -f "${detail_report_directory}/Anomaly_feature_importance_explained.svg" ] ; then @@ -104,6 +107,11 @@ anomaly_detection_deep_dive_report() { cat "${detail_report_directory}/report_deep_dive.template.md" | "${MARKDOWN_SCRIPTS_DIR}/embedMarkdownIncludes.sh" "${detail_report_include_directory}" > "${detail_report_directory}/report_deep_dive_with_vars.md" sed "s/{{deep_dive_directory}}/${detail_report_directory_name}/g" "${detail_report_directory}/report_deep_dive_with_vars.md" > "${detail_report_directory}/report_deep_dive_${report_number}.md" + # Add a page break at the end of a deep dive section + { + echo "--" + } >> "${detail_report_directory}/report_deep_dive_${report_number}.md" + rm -rf 
"${detail_report_directory}/report_deep_dive_with_vars.md" rm -rf "${detail_report_directory}/report_deep_dive.template.md" rm -rf "${detail_report_include_directory}" @@ -128,7 +136,10 @@ anomaly_detection_front_matter_metadata_head() { current_date="$(date +'%Y-%m-%d')" local latest_tag - latest_tag="$(git ls-remote --tags origin | grep -v '\^{}' | tail -n1 | awk '{print $2}' | sed 's|refs/tags/||')" + # The latest tag can always be determined by reading the remote repository. However, this doesn't support working offline. + # Therefore, git describe is used which - on the other hand - requires tags to be fetched which requires GitHub Action checkout parameter fetch-tags. + #latest_tag="$(git ls-remote --tags origin | grep -v '\^{}' | tail -n1 | awk '{print $2}' | sed 's|refs/tags/||')" + latest_tag="$(git for-each-ref --sort=-creatordate --count=1 --format '%(refname:short)' refs/tags)" local analysis_directory analysis_directory="${PWD##*/}" diff --git a/domains/anomaly-detection/summary/report.template.md b/domains/anomaly-detection/summary/report.template.md index 4aacac7bf..0f846235c 100644 --- a/domains/anomaly-detection/summary/report.template.md +++ b/domains/anomaly-detection/summary/report.template.md @@ -104,6 +104,24 @@ Each abstraction level includes anomaly statistics, SHAP feature importance, arc --- +### ๐Ÿ“• Graph Visualizations (Archetype-Level Network Views) + +| Plot | Description | Best For | Adds | Why | +|------|--------------|----------|------|-----| +| **Top Hub Graph Visualization** | Displays the most connected node (e.g., **#1 Hub**) at the center, surrounded by its direct dependencies. Incoming nodes show who is dependent on the hub. | Understanding highly connected code units or components that serve as central integrators. | Highlights nodes that act as major dependency aggregators. | Helps detect over-centralized modules or potential architectural bottlenecks. 
| +| **Top Bottleneck Graph Visualization** | Shows the node with the highest betweenness centrality (e.g., **#1 Bottleneck**) and its local neighborhood. | Identifying code units that control information or dependency flow. | Emphasizes nodes that mediate critical paths between modules. | Reveals single points of failure or routing constraints in dependency flow. | +| **Top Authority Graph Visualization** | Centers the most authoritative node (e.g., **#1 Authority**) with incoming and outgoing links from dependent nodes with high PageRank and emphasized PageRank to ArticleRank difference. | Detecting key knowledge or functionality providers. | Highlights components with high centrality. | Indicates structural or semantic “sources of truth” in the system. | +| **Top Bridge Graph Visualization** | Displays a node acting as a structural bridge between clusters (e.g., **#1 Bridge**) and its cross-cluster connections based on node embeddings encoding the Graph structure. | Understanding cross-cutting dependencies between modules. | Reveals links connecting distinct architectural domains. | Useful for spotting boundary leaks or undesired coupling between subsystems. | +| **Top Outlier Graph Visualization** | Centers an unusual or isolated node (e.g., **#1 Outlier**) that can hardly be assigned to a cluster and visualizes its sparse or unexpected dependency patterns. | Identifying structurally or behaviorally anomalous nodes. | Highlights nodes with rare or unexpected connection patterns. | Helps pinpoint code units that deviate from established dependency norms. | + +> **Note:** +> - In all Graph Visualizations, the **central node** represents the selected *Top Archetype* (e.g., *Top 1 Hub*). +> - **Darker nodes** indicate *incoming dependencies*, while **brighter nodes** indicate *outgoing dependencies*. +> - **Emphasized nodes** (thicker borders or larger size) mark particularly influential or anomalous dependencies, depending on the archetype.
+> - These visualizations are most effective for interpreting *local dependency topology* and *role significance* of key components. + +--- + ### 📔 Summary Categories | Category | Included Plots | Typical Usage | @@ -114,6 +132,7 @@ Each abstraction level includes anomaly statistics, SHAP feature importance, arc | **Cluster Noise Analysis** | Cluster Noise (3 types) | Identify special structural anomalies | | **Feature Distributions** | Betweenness, Clustering, Rank Difference | Assess feature-based structure patterns | | **Feature Relationships** | Clustering vs PageRank | Evaluate global vs local influence balance | +| **Archetype Graphs** | Top Hub / Bottleneck / Authority / Bridge / Outlier | Visualizing key dependency roles and structural importance | --- diff --git a/domains/anomaly-detection/summary/report_deep_dive.template.md b/domains/anomaly-detection/summary/report_deep_dive.template.md index 164c6048c..452804cae 100644 --- a/domains/anomaly-detection/summary/report_deep_dive.template.md +++ b/domains/anomaly-detection/summary/report_deep_dive.template.md @@ -12,7 +12,7 @@ #### Archetype Distribution - + #### Top anomalies with their local contributing features (via SHAP) @@ -31,3 +31,5 @@ See [Plot Interpretation Guide](#3-plot-interpretation-guide) on how to read the + + diff --git a/domains/anomaly-detection/summary/report_no_anomaly_detection_graphs.template.md b/domains/anomaly-detection/summary/report_no_anomaly_detection_graphs.template.md new file mode 100644 index 000000000..96a7029ea --- /dev/null +++ b/domains/anomaly-detection/summary/report_no_anomaly_detection_graphs.template.md @@ -0,0 +1 @@ +⚠️ _No anomaly detection graph visualizations for this level._ \ No newline at end of file diff --git a/scripts/reports/compilations/VisualizationReports.sh b/scripts/reports/compilations/VisualizationReports.sh index 1b5bb2f0c..4996a457e 100755 --- a/scripts/reports/compilations/VisualizationReports.sh +++ 
b/scripts/reports/compilations/VisualizationReports.sh @@ -21,22 +21,33 @@ LOG_GROUP_END=${LOG_GROUP_END:-"::endgroup::"} # Prefix to end a log group. Defa # This way non-standard tools like readlink aren't needed. REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR:-$(dirname -- "${REPORT_COMPILATIONS_SCRIPT_DIR}")} - -echo "${LOG_GROUP_START}Initialize Visualization Reports"; -echo "VisualizationReports: REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR}" -echo "VisualizationReports: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}" -echo "${LOG_GROUP_END}"; - -# Run all visualization scripts -for visualization_script_file in "${REPORTS_SCRIPT_DIR}"/*Visualization.sh; do - visualization_script_filename=$(basename -- "${visualization_script_file}") - visualization_script_filename="${visualization_script_filename%.*}" # Remove file extension - - echo "${LOG_GROUP_START}Create Visualization Report ${visualization_script_filename}"; - echo "VisualizationReports: $(date +'%Y-%m-%dT%H:%M:%S%z') Starting ${visualization_script_filename}..."; - - source "${visualization_script_file}" - - echo "VisualizationReports: $(date +'%Y-%m-%dT%H:%M:%S%z') Finished ${visualization_script_filename}"; - echo "${LOG_GROUP_END}"; +DOMAINS_DIRECTORY=${DOMAINS_DIRECTORY:-"${REPORTS_SCRIPT_DIR}/../../domains"} + +# For detailed debug output uncomment the following lines: +#echo "${LOG_GROUP_START}Initialize Visualization Reports"; +#echo "VisualizationReports: REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT_DIR}" +#echo "VisualizationReports: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}" +#echo "VisualizationReports: DOMAINS_DIRECTORY=${DOMAINS_DIRECTORY}" +#echo "${LOG_GROUP_END}"; + +# Run all visualization scripts (filename ending with Visualization.sh) in the REPORTS_SCRIPT_DIR and DOMAINS_DIRECTORY directories. 
+for directory in "${REPORTS_SCRIPT_DIR}" "${DOMAINS_DIRECTORY}"; do + if [ ! -d "${directory}" ]; then + echo "VisualizationReports: Error: Directory ${directory} does not exist. Please check your REPORTS_SCRIPT_DIR and DOMAINS_DIRECTORY settings." + exit 1 + fi + + # Run all visualization scripts found (recursively) in the selected directory, in sorted order. + find "${directory}" -type f -name "*Visualization.sh" | sort | while read -r visualization_script_file; do + visualization_script_filename=$(basename -- "${visualization_script_file}") + visualization_script_filename="${visualization_script_filename%.*}" # Remove file extension + + echo "${LOG_GROUP_START}Create Visualization Report ${visualization_script_filename}"; + echo "VisualizationReports: $(date +'%Y-%m-%dT%H:%M:%S%z') Starting ${visualization_script_filename}..."; + + source "${visualization_script_file}" + + echo "VisualizationReports: $(date +'%Y-%m-%dT%H:%M:%S%z') Finished ${visualization_script_filename}"; + echo "${LOG_GROUP_END}"; + done done diff --git a/scripts/visualization/convertQueryResultCsvToGraphVizDotFile.sh b/scripts/visualization/convertQueryResultCsvToGraphVizDotFile.sh index 8955465f8..7f2add190 100755 --- a/scripts/visualization/convertQueryResultCsvToGraphVizDotFile.sh +++ b/scripts/visualization/convertQueryResultCsvToGraphVizDotFile.sh @@ -105,8 +105,13 @@ outputFilename="${inputFilePath}/${graphName}.gv" echo "strict digraph ${graphName} {" # Extract the template content from the template file and remove the begin and end markers sed -n '/\/\/Begin-Template/,/\/\/End-Template/{//!p;}' "${templateFile}" - # Remove the first (header) line of the CSV file, remove the enclosing double quotes and replace the escaped double quotes by double quotes + # Remove the first (header) line of the CSV file, + # print the first column prefixed with a tab, + # remove the leading double quote, + # remove the enclosing double quotes and + # replace the escaped double quotes by double quotes awk -F ',' 'NR>1 {print "\t" $1}' 
"${inputFilename}" \ + | sed 's/^\t\"/\t/' \ | sed 's/^\t\"\"\"/\t"/' \ | sed 's/^\t\"\\\"\"/\t"/' \ | sed 's/\\\"\"/"/g' \ diff --git a/scripts/visualization/visualizeQueryResults.sh b/scripts/visualization/visualizeQueryResults.sh index bde45f820..c0a2ee424 100755 --- a/scripts/visualization/visualizeQueryResults.sh +++ b/scripts/visualization/visualizeQueryResults.sh @@ -17,7 +17,7 @@ echo "visualizeQueryResults: VISUALIZATION_SCRIPTS_DIR=${VISUALIZATION_SCRIPTS_D # Read the first unnamed input argument containing the version of the project inputCsvFileName="" case "${1}" in - "--"*) ;; # Skipping named command line options to forward them later to the "analyze" command + "--"*) ;; # Skipping named command line options to forward them later to the "convertQueryResultCsvToGraphVizDotFile" command *) inputCsvFileName="${1}" shift || true @@ -63,5 +63,7 @@ fi # Run GraphViz command line interface (CLI) wrapped utilizing WASM (WebAssembly) # to convert the DOT file to SVG operating system independently. +# Use "npm install" first to create local "node_modules" and be able to run it after that in offline mode. echo "visualizeQueryResults: Info: Using npx to run GraphViz CLI (Web Assembly Wrapper) for SVG generation." +npm install @hpcc-js/wasm-graphviz-cli@1.2.6 --silent --no-progress --loglevel=error > /dev/null npx --yes @hpcc-js/wasm-graphviz-cli@1.2.6 -T svg "${inputCsvFilePath}/${graphName}.gv" > "${inputCsvFilePath}/${graphName}.svg" \ No newline at end of file