Commit ed99649

Merge pull request #431 from JohT/feature/anomaly-detection-archetypes
Anomaly Detection Archetypes and Markdown Summary
2 parents ce6ea4e + c545b38 commit ed99649

45 files changed, +1933 -158 lines

.github/workflows/public-analyze-code-graph.yml

Lines changed: 1 addition & 1 deletion

@@ -92,7 +92,7 @@ jobs:
         run: echo "Please specify either the input parameter 'artifacts-upload-name' or 'sources-upload-name'."; exit 1

       - name: Assemble ENVIRONMENT_INFO
-        run: echo "ENVIRONMENT_INFO=-java-${{ matrix.java }}-python-${{ matrix.python }}-miniforge-${{ matrix.miniforge }}" >> $GITHUB_ENV
+        run: echo "ENVIRONMENT_INFO=java-${{ matrix.java }}-python-${{ matrix.python }}-miniforge-${{ matrix.miniforge }}" >> $GITHUB_ENV

       - name: (Code Analysis Setup) Checkout code-graph-analysis-pipeline
         uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5

domains/anomaly-detection/anomalyDetectionCsv.sh

Lines changed: 51 additions & 11 deletions

@@ -3,6 +3,7 @@
 # Pipeline that coordinates anomaly detection using the Graph Data Science Library of Neo4j.
 # It requires an already running Neo4j graph database with already scanned and analyzed artifacts.
 # The results will be written into the sub directory reports/anomaly-detection.
+# Dynamically triggered by "CsvReports.sh".

 # Note that "scripts/prepareAnalysis.sh" is required to run prior to this script.

@@ -25,8 +26,9 @@ SCRIPTS_DIR=${SCRIPTS_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/../../scripts"} # Re
 # Get the "cypher" query directory for gathering features.
 ANOMALY_DETECTION_FEATURE_CYPHER_DIR=${ANOMALY_DETECTION_FEATURE_CYPHER_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/features"}
 ANOMALY_DETECTION_QUERY_CYPHER_DIR=${ANOMALY_DETECTION_QUERY_CYPHER_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/queries"}
+ANOMALY_DETECTION_LABEL_CYPHER_DIR=${ANOMALY_DETECTION_LABEL_CYPHER_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/labels"}

-# Define functions to execute a cypher query from within a given file (first and only argument) like "execute_cypher"
+# Define functions to execute a cypher query from within a given file (first and only argument) like "execute_cypher" and "execute_cypher_summarized"
 source "${SCRIPTS_DIR}/executeQueryFunctions.sh"

 # Define functions to create and delete Graph Projections like "createUndirectedDependencyProjection"
@@ -60,29 +62,64 @@ anomaly_detection_features() {
     execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-ArticleRank-Exists.cypher" \
         "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-ArticleRank-Write.cypher" "${@}"
 }
+
 # Run queries to find anomalies in the graph.
 #
 # Required Parameters:
 # - projection_node_label=...
 #   Label of the nodes that will be used for the projection. Example: "Package"
+# - projection_language=...
+#   Name of the associated programming language. Default: "Java". Example: "Typescript"
 anomaly_detection_queries() {
     local nodeLabel
     nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )

     local language
     language=$( extractQueryParameter "projection_language" "${@}" )
-
+
+    # Within the absolute (full) report directory for anomaly detection, create a sub directory for every detailed type (Java_Package, Java_Type,...)
+    local detail_report_directory="${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}"
+    mkdir -p "${detail_report_directory}"
+
     echo "anomalyDetectionCsv: $(date +'%Y-%m-%dT%H:%M:%S%z') Executing Queries for ${nodeLabel} nodes..."
-    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPotentialImbalancedRoles.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_PotentialImbalancedRoles.csv"
-    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPotentialOverEngineerOrIsolated.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_PotentialOverEngineerOrIsolated.csv"
+    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPotentialImbalancedRoles.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_PotentialImbalancedRoles.csv"
+    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPotentialOverEngineerOrIsolated.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_PotentialOverEngineerOrIsolated.csv"
+
+    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionHiddenBridgeNodes.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_HiddenBridgeNodes.csv"
+    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPopularBottlenecks.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_PopularBottlenecks.csv"
+    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionSilentCoordinators.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_SilentCoordinators.csv"
+    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionOverReferencesUtilities.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_OverReferencesUtilities.csv"
+    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionFragileStructuralBridges.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_FragileStructuralBridges.csv"
+    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionDependencyHungryOrchestrators.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_DependencyHungryOrchestrators.csv"
+    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionUnexpectedCentralNodes.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_UnexpectedCentralNodes.csv"
+}
+
+# Label code units with top anomalies by archetype.
+#
+# Required Parameters:
+# - projection_node_label=...
+#   Label of the nodes that will be used for the projection. Example: "Package"
+# - projection_language=...
+#   Name of the associated programming language. Examples: "Java", "Typescript"
+anomaly_detection_labels() {
+    local nodeLabel
+    nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )
+
+    local language
+    language=$( extractQueryParameter "projection_language" "${@}" )

-    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionHiddenBridgeNodes.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_HiddenBridgeNodes.csv"
-    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPopularBottlenecks.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_PopularBottlenecks.csv"
-    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionSilentCoordinators.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_SilentCoordinators.csv"
-    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionOverReferencesUtilities.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_OverReferencesUtilities.csv"
-    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionFragileStructuralBridges.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_FragileStructuralBridges.csv"
-    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionDependencyHungryOrchestrators.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_DependencyHungryOrchestrators.csv"
-    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionUnexpectedCentralNodes.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_UnexpectedCentralNodes.csv"
+    # Within the absolute (full) report directory for anomaly detection, create a sub directory for every detailed type (Java_Package, Java_Type,...)
+    local detail_report_directory="${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}"
+    mkdir -p "${detail_report_directory}"
+
+    echo "anomalyDetectionCsv: $(date +'%Y-%m-%dT%H:%M:%S%z') Labelling ${language} ${nodeLabel} anomalies..."
+    execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeRemoveLabels.cypher" "${@}"
+    execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeAuthority.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTopAuthority.csv"
+    execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBottleneck.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTopBottleneck.csv"
+    execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeHub.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTopHub.csv"
+    # The following two label types require Python scripts to run first and are skipped here intentionally:
+    # execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBridge.cypher" "${@}"
+    # execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeOutlier.cypher" "${@}"
 }

 # Run the anomaly detection pipeline.
@@ -94,9 +131,12 @@ anomaly_detection_queries() {
 #   Label of the nodes that will be used for the projection. Example: "Package"
 # - projection_weight_property=...
 #   Name of the node property that contains the dependency weight. Example: "weight"
+# - projection_language=...
+#   Name of the associated programming language. Examples: "Java", "Typescript"
 anomaly_detection_csv_reports() {
     time anomaly_detection_features "${@}"
     time anomaly_detection_queries "${@}"
+    time anomaly_detection_labels "${@}"
 }

 # Create report directory
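For orientation (the following sketch is not part of the commit): anomaly_detection_labels writes one CSV per archetype, e.g. AnomalyArchetypeTopAuthority.csv, into a per-type sub directory such as reports/anomaly-detection/Java_Package. A minimal Python sketch for reading those reports back with pandas, assuming that directory layout and leaving the column names open since the Cypher queries themselves are not shown in this diff:

# Minimal sketch (not part of the commit): read the per-archetype CSV reports
# written by anomaly_detection_labels after anomalyDetectionCsv.sh has run.
# The directory layout (<language>_<nodeLabel> below reports/anomaly-detection)
# follows the shell script above; the column names depend on the Cypher queries
# and are therefore not assumed here.
from pathlib import Path

import pandas as pd

report_directory = Path("reports/anomaly-detection/Java_Package")  # example detail directory

for csv_file in sorted(report_directory.glob("AnomalyArchetypeTop*.csv")):
    archetype = csv_file.stem.removeprefix("AnomalyArchetypeTop")  # e.g. "Authority", "Bottleneck", "Hub"
    frame = pd.read_csv(csv_file)
    print(f"{archetype}: {len(frame)} labelled code units")
    print(frame.head())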

domains/anomaly-detection/anomalyDetectionFeaturePlots.py

Lines changed: 50 additions & 39 deletions

@@ -98,7 +98,7 @@ def __is_code_language_available(self) -> bool:
     def __get_projection_language(self) -> str:
         return self.query_parameters_["projection_language"] if self.__is_code_language_available() else ""

-    def get_plot_prefix(self) -> str:
+    def get_title_prefix(self) -> str:
         if self.__is_code_language_available():
             return self.__get_projection_language() + " " + self.__get_projection_node_label()
         return self.__get_projection_node_label()
@@ -378,37 +378,40 @@ def annotate_outliers(outliers: pd.DataFrame) -> None:
     plot.savefig(plot_file_path)


-def plot_clustering_coefficient_distribution(clustering_coefficients: pd.Series, title: str, plot_file_path: str) -> None:
+def plot_feature_distribution(feature_values: pd.Series, feature_name: str, title: str, plot_file_path: str) -> None:
     """
-    Plots the distribution of clustering coefficients.
-
+    Plots the distribution of feature's values.
+
     Parameters
     ----------
-    clustering_coefficients : pd.Series
-        Series containing clustering coefficient values.
+    feature_values : pd.Series
+        Series containing feature values.
+    text_prefix: str
+        Text at the beginning of the title
     """
-    if clustering_coefficients.empty:
+    if feature_values.empty:
         print("No data available to plot.")
         return

     plot.figure(figsize=(10, 6))
     plot.figure(figsize=(10, 6))
-    plot.hist(clustering_coefficients, bins=40, color='blue', alpha=0.7, edgecolor='black')
+    plot.hist(feature_values, bins=40, color='blue', alpha=0.7, edgecolor='black')
     plot.title(title, pad=20)
-    plot.xlabel('Clustering Coefficient')
+    plot.xlabel(feature_name)
     plot.ylabel('Frequency')
-    plot.xlim(left=clustering_coefficients.min(), right=clustering_coefficients.max())
+    plot.xlim(left=feature_values.min(), right=feature_values.max())
     # plot.yscale('log') # Use logarithmic scale for better visibility of differences
     plot.grid(True)
-    plot.tight_layout()

-    mean = clustering_coefficients.mean()
-    standard_deviation = clustering_coefficients.std()
+    mean = feature_values.mean()
+    standard_deviation = feature_values.std()

     # Vertical line for the mean
     plot_standard_deviation_lines('red', mean, standard_deviation, standard_deviation_factor=0)
     # Vertical line for 1 x standard deviations + mean (=z-score of 1)
-    plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=1)
+    plot_standard_deviation_lines('orange', mean, standard_deviation, standard_deviation_factor=1)
+    # Vertical line for 2 x standard deviations + mean (=z-score of 2)
+    plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=2)

     plot.tight_layout()
     plot.savefig(plot_file_path)
@@ -812,7 +815,7 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
 # ------------------------------------------------------------------------------------------------------------

 parameters = parse_input_parameters()
-plot_prefix = parameters.get_plot_prefix()
+title_prefix = parameters.get_title_prefix()
 report_directory = parameters.get_report_directory()

 driver = get_graph_database_driver()
@@ -825,31 +828,39 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
     data['pageRank'],
     data['articleRank'],
     data['shortCodeUnitName'],
-    title=f"{plot_prefix} distribution of PageRank - ArticleRank differences",
-    plot_file_path=get_file_path(f"{plot_prefix}_PageRank_Minus_ArticleRank_Distribution", parameters)
+    title=f"{title_prefix} distribution of PageRank - ArticleRank differences",
+    plot_file_path=get_file_path("PageRank_Minus_ArticleRank_Distribution", parameters)
 )

-plot_clustering_coefficient_distribution(
-    data['clusteringCoefficient'],
-    title=f"{plot_prefix} distribution of clustering coefficients",
-    plot_file_path=get_file_path(f"{plot_prefix}_ClusteringCoefficient_distribution", parameters)
+plot_feature_distribution(
+    feature_values=data['clusteringCoefficient'],
+    feature_name='Clustering Coefficient',
+    title=f"{title_prefix} clustering coefficient distribution",
+    plot_file_path=get_file_path("ClusteringCoefficient_distribution", parameters)
+)
+
+plot_feature_distribution(
+    feature_values=data['betweenness'],
+    feature_name='Betweenness',
+    title=f"{title_prefix} betweenness centrality distribution",
+    plot_file_path=get_file_path("BetweennessCentrality_distribution", parameters)
 )

 plot_clustering_coefficient_vs_page_rank(
     data['clusteringCoefficient'],
     data['pageRank'],
     data['shortCodeUnitName'],
     data['clusterNoise'],
-    title=f"{plot_prefix} clustering coefficient versus PageRank",
-    plot_file_path=get_file_path(f"{plot_prefix}_ClusteringCoefficient_versus_PageRank", parameters)
+    title=f"{title_prefix} clustering coefficient versus PageRank",
+    plot_file_path=get_file_path("ClusteringCoefficient_versus_PageRank", parameters)
 )

 if (overall_cluster_count < 20):
     print(f"anomalyDetectionFeaturePlots: Less than 20 clusters: {overall_cluster_count}. Only one plot containing all clusters will be created.")
     plot_clusters(
         clustering_visualization_dataframe=data,
-        title=f"{plot_prefix} all clusters overall (less than 20)",
-        plot_file_path=get_file_path(f"{plot_prefix}_Clusters_Overall", parameters)
+        title=f"{title_prefix} all clusters overall",
+        plot_file_path=get_file_path("Clusters_Overall", parameters)
     )
 else:
     print(f"anomalyDetectionFeaturePlots: More than 20 clusters: {overall_cluster_count}. Different plots focussing on different features like cluster size will be created.")
@@ -858,57 +869,57 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
     )
     plot_clusters(
         clustering_visualization_dataframe=clusters_by_largest_size,
-        title=f"{plot_prefix} clusters with the largest size",
-        plot_file_path=get_file_path(f"{plot_prefix}_Clusters_largest_size", parameters)
+        title=f"{title_prefix} clusters with the largest size",
+        plot_file_path=get_file_path("Clusters_largest_size", parameters)
     )

     clusters_by_largest_max_radius = get_clusters_by_criteria(
         data, by='clusterRadiusMax', ascending=False, cluster_count=20
     )
     plot_clusters(
         clustering_visualization_dataframe=clusters_by_largest_max_radius,
-        title=f"{plot_prefix} clusters with the largest max radius",
-        plot_file_path=get_file_path(f"{plot_prefix}_Clusters_largest_max_radius", parameters)
+        title=f"{title_prefix} clusters with the largest max radius",
+        plot_file_path=get_file_path("Clusters_largest_max_radius", parameters)
     )

     clusters_by_largest_average_radius = get_clusters_by_criteria(
         data, by='clusterRadiusAverage', ascending=False, cluster_count=20
     )
     plot_clusters(
         clustering_visualization_dataframe=clusters_by_largest_average_radius,
-        title=f"{plot_prefix} clusters with the largest average radius",
-        plot_file_path=get_file_path(f"{plot_prefix}_Clusters_largest_average_radius", parameters)
+        title=f"{title_prefix} clusters with the largest average radius",
+        plot_file_path=get_file_path("Clusters_largest_average_radius", parameters)
     )

 plot_clusters_probabilities(
     clustering_visualization_dataframe=data,
-    title=f"{plot_prefix} clustering probabilities (red=high uncertainty)",
-    plot_file_path=get_file_path(f"{plot_prefix}_Cluster_probabilities", parameters)
+    title=f"{title_prefix} clustering probabilities (red=high uncertainty)",
+    plot_file_path=get_file_path("Cluster_probabilities", parameters)
 )

 plot_cluster_noise(
     clustering_visualization_dataframe=data,
-    title=f"{plot_prefix} clustering noise points that are surprisingly central (red) or popular (size)",
+    title=f"{title_prefix} clustering noise points that are surprisingly central (red) or popular (size)",
     size_column_name='degree',
     color_column_name='pageRank',
-    plot_file_path=get_file_path(f"{plot_prefix}_ClusterNoise_highly_central_and_popular", parameters)
+    plot_file_path=get_file_path("ClusterNoise_highly_central_and_popular", parameters)
 )

 plot_cluster_noise(
     clustering_visualization_dataframe=data,
-    title=f"{plot_prefix} clustering noise points that bridge flow (red) and are poorly integrated (size)",
+    title=f"{title_prefix} clustering noise points that bridge flow (red) and are poorly integrated (size)",
     size_column_name='inverseClusteringCoefficient',
     color_column_name='betweenness',
-    plot_file_path=get_file_path(f"{plot_prefix}_ClusterNoise_poorly_integrated_bridges", parameters),
+    plot_file_path=get_file_path("ClusterNoise_poorly_integrated_bridges", parameters),
     downscale_normal_sizes=0.4
 )

 plot_cluster_noise(
     clustering_visualization_dataframe=data,
-    title=f"{plot_prefix} clustering noise points with role inversion (size) possibly violating layering or dependency direction (red)",
+    title=f"{title_prefix} clustering noise points with role inversion (size) possibly violating layering or dependency direction (red)",
     size_column_name='pageToArticleRankDifference',
     color_column_name='betweenness',
-    plot_file_path=get_file_path(f"{plot_prefix}_ClusterNoise_role_inverted_bridges", parameters)
+    plot_file_path=get_file_path("ClusterNoise_role_inverted_bridges", parameters)
 )

 driver.close()
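To try the generalized histogram outside the pipeline, here is a self-contained sketch of plot_feature_distribution as introduced above, including the new 1x (orange) and 2x (green) standard-deviation markers. The helper plot_standard_deviation_lines is not part of this diff, so it is re-implemented below under the assumption that it draws a vertical line at mean + factor * standard deviation; the example data is likewise only illustrative.

import matplotlib.pyplot as plot
import numpy as np
import pandas as pd


def plot_standard_deviation_lines(color: str, mean: float, standard_deviation: float, standard_deviation_factor: int) -> None:
    # Assumption: the original helper draws a vertical dashed line at mean + factor * standard deviation.
    plot.axvline(mean + standard_deviation_factor * standard_deviation, color=color, linestyle='dashed', linewidth=1)


def plot_feature_distribution(feature_values: pd.Series, feature_name: str, title: str, plot_file_path: str) -> None:
    # Generalized histogram for any numeric feature (clustering coefficient, betweenness, ...).
    if feature_values.empty:
        print("No data available to plot.")
        return

    plot.figure(figsize=(10, 6))
    plot.hist(feature_values, bins=40, color='blue', alpha=0.7, edgecolor='black')
    plot.title(title, pad=20)
    plot.xlabel(feature_name)
    plot.ylabel('Frequency')
    plot.xlim(left=feature_values.min(), right=feature_values.max())
    plot.grid(True)

    mean = feature_values.mean()
    standard_deviation = feature_values.std()
    plot_standard_deviation_lines('red', mean, standard_deviation, standard_deviation_factor=0)     # mean
    plot_standard_deviation_lines('orange', mean, standard_deviation, standard_deviation_factor=1)  # z-score of 1
    plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=2)   # z-score of 2

    plot.tight_layout()
    plot.savefig(plot_file_path)


# Example usage with synthetic data standing in for a graph metric such as betweenness centrality.
example_values = pd.Series(np.random.default_rng(42).exponential(scale=1.0, size=500))
plot_feature_distribution(example_values, 'Betweenness', 'Example betweenness centrality distribution', 'BetweennessCentrality_distribution.svg')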
