Create a report subdirectory per language and code unit type for anomaly detection

JohT · JohT · commit d075eafc75b2 · 2025-10-05T11:42:09.000+02:00
diff --git a/domains/anomaly-detection/anomalyDetectionCsv.sh b/domains/anomaly-detection/anomalyDetectionCsv.sh
@@ -73,18 +73,22 @@ anomaly_detection_queries() {
     
     local language
     language=$( extractQueryParameter "projection_language" "${@}" )
-    
+
+    # Inside the absolute (full) report directory for anomaly detection, create one subdirectory per language and node label (e.g. Java_Package, Java_Type, ...).
+    local detail_report_directory="${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}"
+    mkdir -p "${detail_report_directory}"
+
     echo "anomalyDetectionCsv: $(date +'%Y-%m-%dT%H:%M:%S%z') Executing Queries for ${nodeLabel} nodes..."
-    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPotentialImbalancedRoles.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_PotentialImbalancedRoles.csv"
-    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPotentialOverEngineerOrIsolated.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_PotentialOverEngineerOrIsolated.csv"
+    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPotentialImbalancedRoles.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_PotentialImbalancedRoles.csv"
+    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPotentialOverEngineerOrIsolated.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_PotentialOverEngineerOrIsolated.csv"
     
-    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionHiddenBridgeNodes.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_HiddenBridgeNodes.csv"
-    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPopularBottlenecks.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_PopularBottlenecks.csv"
-    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionSilentCoordinators.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_SilentCoordinators.csv"
-    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionOverReferencesUtilities.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_OverReferencesUtilities.csv"
-    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionFragileStructuralBridges.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_FragileStructuralBridges.csv"
-    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionDependencyHungryOrchestrators.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_DependencyHungryOrchestrators.csv"
-    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionUnexpectedCentralNodes.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_UnexpectedCentralNodes.csv"
+    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionHiddenBridgeNodes.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_HiddenBridgeNodes.csv"
+    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPopularBottlenecks.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_PopularBottlenecks.csv"
+    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionSilentCoordinators.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_SilentCoordinators.csv"
+    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionOverReferencesUtilities.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_OverReferencesUtilities.csv"
+    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionFragileStructuralBridges.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_FragileStructuralBridges.csv"
+    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionDependencyHungryOrchestrators.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_DependencyHungryOrchestrators.csv"
+    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionUnexpectedCentralNodes.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_UnexpectedCentralNodes.csv"
 }
 
 # Label code units with top anomalies by archetype.
@@ -99,11 +103,15 @@ anomaly_detection_labels() {
     local language
     language=$( extractQueryParameter "projection_language" "${@}" )
     
+    # Inside the absolute (full) report directory for anomaly detection, create one subdirectory per language and node label (e.g. Java_Package, Java_Type, ...).
+    local detail_report_directory="${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}"
+    mkdir -p "${detail_report_directory}"
+
     echo "anomalyDetectionCsv: $(date +'%Y-%m-%dT%H:%M:%S%z') Labelling ${language} ${nodeLabel} anomalies..."
     execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeRemoveLabels.cypher" "${@}"
-    execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeAuthority.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyArchetypeTopAuthority.csv"
-    execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBottleneck.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyArchetypeTopBottleneck.csv"
-    execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeHub.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyArchetypeTopHub.csv"
+    execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeAuthority.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTopAuthority.csv"
+    execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBottleneck.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTopBottleneck.csv"
+    execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeHub.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTopHub.csv"
     # The following two label types require Python scripts to run first and are skipped here intentionally:
     # execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBridge.cypher" "${@}"
     # execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeOutlier.cypher" "${@}"
diff --git a/domains/anomaly-detection/anomalyDetectionFeaturePlots.py b/domains/anomaly-detection/anomalyDetectionFeaturePlots.py
@@ -98,7 +98,7 @@ def __is_code_language_available(self) -> bool:
     def __get_projection_language(self) -> str:
         return self.query_parameters_["projection_language"] if self.__is_code_language_available() else ""
 
-    def get_plot_prefix(self) -> str:
+    def get_title_prefix(self) -> str:
         if self.__is_code_language_available():
             return self.__get_projection_language() + " " + self.__get_projection_node_label()
         return self.__get_projection_node_label()
@@ -815,7 +815,7 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
 # ------------------------------------------------------------------------------------------------------------
 
 parameters = parse_input_parameters()
-plot_prefix = parameters.get_plot_prefix()
+title_prefix = parameters.get_title_prefix()
 report_directory = parameters.get_report_directory()
 
 driver = get_graph_database_driver()
@@ -828,21 +828,21 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
     data['pageRank'],
     data['articleRank'],
     data['shortCodeUnitName'],
-    title=f"{plot_prefix} distribution of PageRank - ArticleRank differences",
-    plot_file_path=get_file_path(f"{plot_prefix}_PageRank_Minus_ArticleRank_Distribution", parameters)
+    title=f"{title_prefix} distribution of PageRank - ArticleRank differences",
+    plot_file_path=get_file_path("PageRank_Minus_ArticleRank_Distribution", parameters)
 )
 
 plot_feature_distribution(
     feature_values=data['clusteringCoefficient'],
     feature_name='Clustering Coefficient',
-    title=f"{plot_prefix} clustering coefficient distribution",
+    title=f"{title_prefix} clustering coefficient distribution",
     plot_file_path=get_file_path("ClusteringCoefficient_distribution", parameters)
 )
 
 plot_feature_distribution(
     feature_values=data['betweenness'],
     feature_name='Betweenness',
-    title=f"{plot_prefix} betweenness centrality distribution",
+    title=f"{title_prefix} betweenness centrality distribution",
     plot_file_path=get_file_path("BetweennessCentrality_distribution", parameters)
 )
 
@@ -851,15 +851,15 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
     data['pageRank'],
     data['shortCodeUnitName'],
     data['clusterNoise'],
-    title=f"{plot_prefix} clustering coefficient versus PageRank",
-    plot_file_path=get_file_path(f"{plot_prefix}_ClusteringCoefficient_versus_PageRank", parameters)
+    title=f"{title_prefix} clustering coefficient versus PageRank",
+    plot_file_path=get_file_path("ClusteringCoefficient_versus_PageRank", parameters)
 )
 
 if (overall_cluster_count < 20):
     print(f"anomalyDetectionFeaturePlots: Less than 20 clusters: {overall_cluster_count}. Only one plot containing all clusters will be created.")
     plot_clusters(
         clustering_visualization_dataframe=data,
-        title=f"{plot_prefix} all clusters overall",
+        title=f"{title_prefix} all clusters overall",
         plot_file_path=get_file_path("Clusters_Overall", parameters)
     )
 else:
@@ -869,57 +869,57 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
     )
     plot_clusters(
         clustering_visualization_dataframe=clusters_by_largest_size,
-        title=f"{plot_prefix} clusters with the largest size",
-        plot_file_path=get_file_path(f"{plot_prefix}_Clusters_largest_size", parameters)
+        title=f"{title_prefix} clusters with the largest size",
+        plot_file_path=get_file_path("Clusters_largest_size", parameters)
     )
 
     clusters_by_largest_max_radius = get_clusters_by_criteria(
         data, by='clusterRadiusMax', ascending=False, cluster_count=20
     )
     plot_clusters(
         clustering_visualization_dataframe=clusters_by_largest_max_radius,
-        title=f"{plot_prefix} clusters with the largest max radius",
-        plot_file_path=get_file_path(f"{plot_prefix}_Clusters_largest_max_radius", parameters)
+        title=f"{title_prefix} clusters with the largest max radius",
+        plot_file_path=get_file_path("Clusters_largest_max_radius", parameters)
     )
 
     clusters_by_largest_average_radius = get_clusters_by_criteria(
         data, by='clusterRadiusAverage', ascending=False, cluster_count=20
     )
     plot_clusters(
         clustering_visualization_dataframe=clusters_by_largest_average_radius,
-        title=f"{plot_prefix} clusters with the largest average radius",
-        plot_file_path=get_file_path(f"{plot_prefix}_Clusters_largest_average_radius", parameters)
+        title=f"{title_prefix} clusters with the largest average radius",
+        plot_file_path=get_file_path("Clusters_largest_average_radius", parameters)
     )
 
 plot_clusters_probabilities(
     clustering_visualization_dataframe=data,
-    title=f"{plot_prefix} clustering probabilities (red=high uncertainty)",
-    plot_file_path=get_file_path(f"{plot_prefix}_Cluster_probabilities", parameters)
+    title=f"{title_prefix} clustering probabilities (red=high uncertainty)",
+    plot_file_path=get_file_path("Cluster_probabilities", parameters)
 )
 
 plot_cluster_noise(
     clustering_visualization_dataframe=data,
-    title=f"{plot_prefix} clustering noise points that are surprisingly central (red) or popular (size)",
+    title=f"{title_prefix} clustering noise points that are surprisingly central (red) or popular (size)",
     size_column_name='degree',
     color_column_name='pageRank',
-    plot_file_path=get_file_path(f"{plot_prefix}_ClusterNoise_highly_central_and_popular", parameters)
+    plot_file_path=get_file_path("ClusterNoise_highly_central_and_popular", parameters)
 )
 
 plot_cluster_noise(
     clustering_visualization_dataframe=data,
-    title=f"{plot_prefix} clustering noise points that bridge flow (red) and are poorly integrated (size)",
+    title=f"{title_prefix} clustering noise points that bridge flow (red) and are poorly integrated (size)",
     size_column_name='inverseClusteringCoefficient',
     color_column_name='betweenness',
-    plot_file_path=get_file_path(f"{plot_prefix}_ClusterNoise_poorly_integrated_bridges", parameters),
+    plot_file_path=get_file_path("ClusterNoise_poorly_integrated_bridges", parameters),
     downscale_normal_sizes=0.4
 )
 
 plot_cluster_noise(
     clustering_visualization_dataframe=data,
-    title=f"{plot_prefix} clustering noise points with role inversion (size) possibly violating layering or dependency direction (red)",
+    title=f"{title_prefix} clustering noise points with role inversion (size) possibly violating layering or dependency direction (red)",
     size_column_name='pageToArticleRankDifference',
     color_column_name='betweenness',
-    plot_file_path=get_file_path(f"{plot_prefix}_ClusterNoise_role_inverted_bridges", parameters)
+    plot_file_path=get_file_path("ClusterNoise_role_inverted_bridges", parameters)
 )
 
 driver.close()
diff --git a/domains/anomaly-detection/anomalyDetectionPython.sh b/domains/anomaly-detection/anomalyDetectionPython.sh
@@ -125,18 +125,22 @@ anomaly_detection_using_python() {
     
     echo "anomalyDetectionPipeline: $(date +'%Y-%m-%dT%H:%M:%S%z') Executing Python scripts for ${language} ${nodeLabel} nodes..."
 
+    # Inside the absolute (full) report directory for anomaly detection, create one subdirectory per language and node label (e.g. Java_Package, Java_Type, ...).
+    local detail_report_directory="${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}"
+    mkdir -p "${detail_report_directory}"
+
     # Get tuned Leiden communities as a reference to tune clustering
     time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedLeidenCommunityDetection.py" "${@}" ${verboseMode}
     # Tuned Fast Random Projection and tuned HDBSCAN clustering 
     time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedNodeEmbeddingClustering.py" "${@}" ${verboseMode}
     # Reduce the dimensionality of the node embeddings down to 2D for visualization using UMAP
     time "${ANOMALY_DETECTION_SCRIPT_DIR}/umap2dNodeEmbeddings.py" "${@}" ${verboseMode}
     # Plot the results with clustering and UMAP embeddings to reveal anomalies in rare feature combinations
-    time "${ANOMALY_DETECTION_SCRIPT_DIR}/anomalyDetectionFeaturePlots.py" "${@}" "--report_directory" "${FULL_REPORT_DIRECTORY}" ${verboseMode}
+    time "${ANOMALY_DETECTION_SCRIPT_DIR}/anomalyDetectionFeaturePlots.py" "${@}" "--report_directory" "${detail_report_directory}" ${verboseMode}
     # Run an unsupervised anomaly detection algorithm including tuning and explainability
-    time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedAnomalyDetectionExplained.py" "${@}" "--report_directory" "${FULL_REPORT_DIRECTORY}" ${verboseMode}
+    time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedAnomalyDetectionExplained.py" "${@}" "--report_directory" "${detail_report_directory}" ${verboseMode}
     # Query Results: Output all collected features into a CSV file.
-    execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeatures.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}AnomalyDetection_Features.csv"
+    execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeatures.cypher" "${@}" > "${detail_report_directory}/Anomaly_Features.csv"
 }
 
 # Label code units with top anomalies by archetype.
diff --git a/domains/anomaly-detection/tunedAnomalyDetectionExplained.py b/domains/anomaly-detection/tunedAnomalyDetectionExplained.py