Commit ed99649

Merge pull request #431 from JohT/feature/anomaly-detection-archetypes
Anomaly Detection Archetypes and Markdown Summary
2 parents ce6ea4e + c545b38 commit ed99649

45 files changed, +1933 -158 lines

.github/workflows/public-analyze-code-graph.yml

Lines changed: 1 addition & 1 deletion

@@ -92,7 +92,7 @@ jobs:
         run: echo "Please specify either the input parameter 'artifacts-upload-name' or 'sources-upload-name'."; exit 1

       - name: Assemble ENVIRONMENT_INFO
-        run: echo "ENVIRONMENT_INFO=-java-${{ matrix.java }}-python-${{ matrix.python }}-miniforge-${{ matrix.miniforge }}" >> $GITHUB_ENV
+        run: echo "ENVIRONMENT_INFO=java-${{ matrix.java }}-python-${{ matrix.python }}-miniforge-${{ matrix.miniforge }}" >> $GITHUB_ENV

       - name: (Code Analysis Setup) Checkout code-graph-analysis-pipeline
         uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5

domains/anomaly-detection/anomalyDetectionCsv.sh

Lines changed: 51 additions & 11 deletions

@@ -3,6 +3,7 @@
 # Pipeline that coordinates anomaly detection using the Graph Data Science Library of Neo4j.
 # It requires an already running Neo4j graph database with already scanned and analyzed artifacts.
 # The results will be written into the sub directory reports/anomaly-detection.
+# Dynamically triggered by "CsvReports.sh".

 # Note that "scripts/prepareAnalysis.sh" is required to run prior to this script.

@@ -25,8 +26,9 @@ SCRIPTS_DIR=${SCRIPTS_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/../../scripts"} # Re
 # Get the "cypher" query directory for gathering features.
 ANOMALY_DETECTION_FEATURE_CYPHER_DIR=${ANOMALY_DETECTION_FEATURE_CYPHER_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/features"}
 ANOMALY_DETECTION_QUERY_CYPHER_DIR=${ANOMALY_DETECTION_QUERY_CYPHER_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/queries"}
+ANOMALY_DETECTION_LABEL_CYPHER_DIR=${ANOMALY_DETECTION_LABEL_CYPHER_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/labels"}

-# Define functions to execute a cypher query from within a given file (first and only argument) like "execute_cypher"
+# Define functions to execute a cypher query from within a given file (first and only argument) like "execute_cypher" and "execute_cypher_summarized"
 source "${SCRIPTS_DIR}/executeQueryFunctions.sh"

 # Define functions to create and delete Graph Projections like "createUndirectedDependencyProjection"
@@ -60,29 +62,64 @@ anomaly_detection_features() {
     execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-ArticleRank-Exists.cypher" \
         "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-ArticleRank-Write.cypher" "${@}"
 }
+
 # Run queries to find anomalies in the graph.
 #
 # Required Parameters:
 # - projection_node_label=...
 #   Label of the nodes that will be used for the projection. Example: "Package"
+# - projection_language=...
+#   Name of the associated programming language. Default: "Java". Example: "Typescript"
 anomaly_detection_queries() {
     local nodeLabel
     nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )

     local language
     language=$( extractQueryParameter "projection_language" "${@}" )
-
+
+    # Within the absolute (full) report directory for anomaly detection, create a sub directory for every detailed type (Java_Package, Java_Type,...)
+    local detail_report_directory="${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}"
+    mkdir -p "${detail_report_directory}"
+
     echo "anomalyDetectionCsv: $(date +'%Y-%m-%dT%H:%M:%S%z') Executing Queries for ${nodeLabel} nodes..."
-    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPotentialImbalancedRoles.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_PotentialImbalancedRoles.csv"
-    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPotentialOverEngineerOrIsolated.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_PotentialOverEngineerOrIsolated.csv"
+    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPotentialImbalancedRoles.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_PotentialImbalancedRoles.csv"
+    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPotentialOverEngineerOrIsolated.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_PotentialOverEngineerOrIsolated.csv"
+
+    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionHiddenBridgeNodes.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_HiddenBridgeNodes.csv"
+    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPopularBottlenecks.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_PopularBottlenecks.csv"
+    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionSilentCoordinators.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_SilentCoordinators.csv"
+    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionOverReferencesUtilities.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_OverReferencesUtilities.csv"
+    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionFragileStructuralBridges.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_FragileStructuralBridges.csv"
+    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionDependencyHungryOrchestrators.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_DependencyHungryOrchestrators.csv"
+    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionUnexpectedCentralNodes.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_UnexpectedCentralNodes.csv"
+}
+
+# Label code units with top anomalies by archetype.
+#
+# Required Parameters:
+# - projection_node_label=...
+#   Label of the nodes that will be used for the projection. Example: "Package"
+# - projection_language=...
+#   Name of the associated programming language. Examples: "Java", "Typescript"
+anomaly_detection_labels() {
+    local nodeLabel
+    nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )
+
+    local language
+    language=$( extractQueryParameter "projection_language" "${@}" )

-    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionHiddenBridgeNodes.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_HiddenBridgeNodes.csv"
-    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPopularBottlenecks.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_PopularBottlenecks.csv"
-    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionSilentCoordinators.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_SilentCoordinators.csv"
-    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionOverReferencesUtilities.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_OverReferencesUtilities.csv"
-    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionFragileStructuralBridges.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_FragileStructuralBridges.csv"
-    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionDependencyHungryOrchestrators.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_DependencyHungryOrchestrators.csv"
-    execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionUnexpectedCentralNodes.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_UnexpectedCentralNodes.csv"
+    # Within the absolute (full) report directory for anomaly detection, create a sub directory for every detailed type (Java_Package, Java_Type,...)
+    local detail_report_directory="${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}"
+    mkdir -p "${detail_report_directory}"
+
+    echo "anomalyDetectionCsv: $(date +'%Y-%m-%dT%H:%M:%S%z') Labelling ${language} ${nodeLabel} anomalies..."
+    execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeRemoveLabels.cypher" "${@}"
+    execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeAuthority.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTopAuthority.csv"
+    execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBottleneck.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTopBottleneck.csv"
+    execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeHub.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTopHub.csv"
+    # The following two label types require Python scripts to run first and are skipped here intentionally:
+    # execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBridge.cypher" "${@}"
+    # execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeOutlier.cypher" "${@}"
 }

 # Run the anomaly detection pipeline.
@@ -94,9 +131,12 @@ anomaly_detection_queries() {
 #   Label of the nodes that will be used for the projection. Example: "Package"
 # - projection_weight_property=...
 #   Name of the node property that contains the dependency weight. Example: "weight"
+# - projection_language=...
+#   Name of the associated programming language. Examples: "Java", "Typescript"
 anomaly_detection_csv_reports() {
     time anomaly_detection_features "${@}"
     time anomaly_detection_queries "${@}"
+    time anomaly_detection_labels "${@}"
 }

 # Create report directory
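For orientation (the following sketch is not part of the commit): anomaly_detection_labels writes one CSV per archetype, e.g. AnomalyArchetypeTopAuthority.csv, into a per-type sub directory such as reports/anomaly-detection/Java_Package. A minimal Python sketch for reading those reports back with pandas, assuming that directory layout and leaving the column names open since the Cypher queries themselves are not shown in this diff:

# Minimal sketch (not part of the commit): read the per-archetype CSV reports
# written by anomaly_detection_labels after anomalyDetectionCsv.sh has run.
# The directory layout (<language>_<nodeLabel> below reports/anomaly-detection)
# follows the shell script above; the column names depend on the Cypher queries
# and are therefore not assumed here.
from pathlib import Path

import pandas as pd

report_directory = Path("reports/anomaly-detection/Java_Package")  # example detail directory

for csv_file in sorted(report_directory.glob("AnomalyArchetypeTop*.csv")):
    archetype = csv_file.stem.removeprefix("AnomalyArchetypeTop")  # e.g. "Authority", "Bottleneck", "Hub"
    frame = pd.read_csv(csv_file)
    print(f"{archetype}: {len(frame)} labelled code units")
    print(frame.head())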

domains/anomaly-detection/anomalyDetectionFeaturePlots.py

Lines changed: 50 additions & 39 deletions

@@ -98,7 +98,7 @@ def __is_code_language_available(self) -> bool:
     def __get_projection_language(self) -> str:
         return self.query_parameters_["projection_language"] if self.__is_code_language_available() else ""

-    def get_plot_prefix(self) -> str:
+    def get_title_prefix(self) -> str:
         if self.__is_code_language_available():
             return self.__get_projection_language() + " " + self.__get_projection_node_label()
         return self.__get_projection_node_label()
@@ -378,37 +378,40 @@ def annotate_outliers(outliers: pd.DataFrame) -> None:
     plot.savefig(plot_file_path)


-def plot_clustering_coefficient_distribution(clustering_coefficients: pd.Series, title: str, plot_file_path: str) -> None:
+def plot_feature_distribution(feature_values: pd.Series, feature_name: str, title: str, plot_file_path: str) -> None:
     """
-    Plots the distribution of clustering coefficients.
-
+    Plots the distribution of feature's values.
+
     Parameters
     ----------
-    clustering_coefficients : pd.Series
-        Series containing clustering coefficient values.
+    feature_values : pd.Series
+        Series containing feature values.
+    text_prefix: str
+        Text at the beginning of the title
     """
-    if clustering_coefficients.empty:
+    if feature_values.empty:
         print("No data available to plot.")
         return

     plot.figure(figsize=(10, 6))
     plot.figure(figsize=(10, 6))
-    plot.hist(clustering_coefficients, bins=40, color='blue', alpha=0.7, edgecolor='black')
+    plot.hist(feature_values, bins=40, color='blue', alpha=0.7, edgecolor='black')
     plot.title(title, pad=20)
-    plot.xlabel('Clustering Coefficient')
+    plot.xlabel(feature_name)
     plot.ylabel('Frequency')
-    plot.xlim(left=clustering_coefficients.min(), right=clustering_coefficients.max())
+    plot.xlim(left=feature_values.min(), right=feature_values.max())
     # plot.yscale('log') # Use logarithmic scale for better visibility of differences
     plot.grid(True)
-    plot.tight_layout()

-    mean = clustering_coefficients.mean()
-    standard_deviation = clustering_coefficients.std()
+    mean = feature_values.mean()
+    standard_deviation = feature_values.std()

     # Vertical line for the mean
     plot_standard_deviation_lines('red', mean, standard_deviation, standard_deviation_factor=0)
     # Vertical line for 1 x standard deviations + mean (=z-score of 1)
-    plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=1)
+    plot_standard_deviation_lines('orange', mean, standard_deviation, standard_deviation_factor=1)
+    # Vertical line for 2 x standard deviations + mean (=z-score of 2)
+    plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=2)

     plot.tight_layout()
     plot.savefig(plot_file_path)
@@ -812,7 +815,7 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
 # ------------------------------------------------------------------------------------------------------------

 parameters = parse_input_parameters()
-plot_prefix = parameters.get_plot_prefix()
+title_prefix = parameters.get_title_prefix()
 report_directory = parameters.get_report_directory()

 driver = get_graph_database_driver()
@@ -825,31 +828,39 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
     data['pageRank'],
     data['articleRank'],
     data['shortCodeUnitName'],
-    title=f"{plot_prefix} distribution of PageRank - ArticleRank differences",
-    plot_file_path=get_file_path(f"{plot_prefix}_PageRank_Minus_ArticleRank_Distribution", parameters)
+    title=f"{title_prefix} distribution of PageRank - ArticleRank differences",
+    plot_file_path=get_file_path("PageRank_Minus_ArticleRank_Distribution", parameters)
 )

-plot_clustering_coefficient_distribution(
-    data['clusteringCoefficient'],
-    title=f"{plot_prefix} distribution of clustering coefficients",
-    plot_file_path=get_file_path(f"{plot_prefix}_ClusteringCoefficient_distribution", parameters)
+plot_feature_distribution(
+    feature_values=data['clusteringCoefficient'],
+    feature_name='Clustering Coefficient',
+    title=f"{title_prefix} clustering coefficient distribution",
+    plot_file_path=get_file_path("ClusteringCoefficient_distribution", parameters)
+)
+
+plot_feature_distribution(
+    feature_values=data['betweenness'],
+    feature_name='Betweenness',
+    title=f"{title_prefix} betweenness centrality distribution",
+    plot_file_path=get_file_path("BetweennessCentrality_distribution", parameters)
 )

 plot_clustering_coefficient_vs_page_rank(
     data['clusteringCoefficient'],
     data['pageRank'],
     data['shortCodeUnitName'],
     data['clusterNoise'],
-    title=f"{plot_prefix} clustering coefficient versus PageRank",
-    plot_file_path=get_file_path(f"{plot_prefix}_ClusteringCoefficient_versus_PageRank", parameters)
+    title=f"{title_prefix} clustering coefficient versus PageRank",
+    plot_file_path=get_file_path("ClusteringCoefficient_versus_PageRank", parameters)
 )

 if (overall_cluster_count < 20):
     print(f"anomalyDetectionFeaturePlots: Less than 20 clusters: {overall_cluster_count}. Only one plot containing all clusters will be created.")
     plot_clusters(
         clustering_visualization_dataframe=data,
-        title=f"{plot_prefix} all clusters overall (less than 20)",
-        plot_file_path=get_file_path(f"{plot_prefix}_Clusters_Overall", parameters)
+        title=f"{title_prefix} all clusters overall",
+        plot_file_path=get_file_path("Clusters_Overall", parameters)
     )
 else:
     print(f"anomalyDetectionFeaturePlots: More than 20 clusters: {overall_cluster_count}. Different plots focussing on different features like cluster size will be created.")
@@ -858,57 +869,57 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
     )
     plot_clusters(
         clustering_visualization_dataframe=clusters_by_largest_size,
-        title=f"{plot_prefix} clusters with the largest size",
-        plot_file_path=get_file_path(f"{plot_prefix}_Clusters_largest_size", parameters)
+        title=f"{title_prefix} clusters with the largest size",
+        plot_file_path=get_file_path("Clusters_largest_size", parameters)
     )

     clusters_by_largest_max_radius = get_clusters_by_criteria(
         data, by='clusterRadiusMax', ascending=False, cluster_count=20
     )
     plot_clusters(
         clustering_visualization_dataframe=clusters_by_largest_max_radius,
-        title=f"{plot_prefix} clusters with the largest max radius",
-        plot_file_path=get_file_path(f"{plot_prefix}_Clusters_largest_max_radius", parameters)
+        title=f"{title_prefix} clusters with the largest max radius",
+        plot_file_path=get_file_path("Clusters_largest_max_radius", parameters)
     )

     clusters_by_largest_average_radius = get_clusters_by_criteria(
         data, by='clusterRadiusAverage', ascending=False, cluster_count=20
     )
     plot_clusters(
         clustering_visualization_dataframe=clusters_by_largest_average_radius,
-        title=f"{plot_prefix} clusters with the largest average radius",
-        plot_file_path=get_file_path(f"{plot_prefix}_Clusters_largest_average_radius", parameters)
+        title=f"{title_prefix} clusters with the largest average radius",
+        plot_file_path=get_file_path("Clusters_largest_average_radius", parameters)
     )

 plot_clusters_probabilities(
     clustering_visualization_dataframe=data,
-    title=f"{plot_prefix} clustering probabilities (red=high uncertainty)",
-    plot_file_path=get_file_path(f"{plot_prefix}_Cluster_probabilities", parameters)
+    title=f"{title_prefix} clustering probabilities (red=high uncertainty)",
+    plot_file_path=get_file_path("Cluster_probabilities", parameters)
 )

 plot_cluster_noise(
     clustering_visualization_dataframe=data,
-    title=f"{plot_prefix} clustering noise points that are surprisingly central (red) or popular (size)",
+    title=f"{title_prefix} clustering noise points that are surprisingly central (red) or popular (size)",
     size_column_name='degree',
     color_column_name='pageRank',
-    plot_file_path=get_file_path(f"{plot_prefix}_ClusterNoise_highly_central_and_popular", parameters)
+    plot_file_path=get_file_path("ClusterNoise_highly_central_and_popular", parameters)
 )

 plot_cluster_noise(
     clustering_visualization_dataframe=data,
-    title=f"{plot_prefix} clustering noise points that bridge flow (red) and are poorly integrated (size)",
+    title=f"{title_prefix} clustering noise points that bridge flow (red) and are poorly integrated (size)",
     size_column_name='inverseClusteringCoefficient',
     color_column_name='betweenness',
-    plot_file_path=get_file_path(f"{plot_prefix}_ClusterNoise_poorly_integrated_bridges", parameters),
+    plot_file_path=get_file_path("ClusterNoise_poorly_integrated_bridges", parameters),
     downscale_normal_sizes=0.4
 )

 plot_cluster_noise(
     clustering_visualization_dataframe=data,
-    title=f"{plot_prefix} clustering noise points with role inversion (size) possibly violating layering or dependency direction (red)",
+    title=f"{title_prefix} clustering noise points with role inversion (size) possibly violating layering or dependency direction (red)",
     size_column_name='pageToArticleRankDifference',
     color_column_name='betweenness',
-    plot_file_path=get_file_path(f"{plot_prefix}_ClusterNoise_role_inverted_bridges", parameters)
+    plot_file_path=get_file_path("ClusterNoise_role_inverted_bridges", parameters)
 )

 driver.close()
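To try the generalized histogram outside the pipeline, here is a self-contained sketch of plot_feature_distribution as introduced above, including the new 1x (orange) and 2x (green) standard-deviation markers. The helper plot_standard_deviation_lines is not part of this diff, so it is re-implemented below under the assumption that it draws a vertical line at mean + factor * standard deviation; the example data is likewise only illustrative.

import matplotlib.pyplot as plot
import numpy as np
import pandas as pd


def plot_standard_deviation_lines(color: str, mean: float, standard_deviation: float, standard_deviation_factor: int) -> None:
    # Assumption: the original helper draws a vertical dashed line at mean + factor * standard deviation.
    plot.axvline(mean + standard_deviation_factor * standard_deviation, color=color, linestyle='dashed', linewidth=1)


def plot_feature_distribution(feature_values: pd.Series, feature_name: str, title: str, plot_file_path: str) -> None:
    # Generalized histogram for any numeric feature (clustering coefficient, betweenness, ...).
    if feature_values.empty:
        print("No data available to plot.")
        return

    plot.figure(figsize=(10, 6))
    plot.hist(feature_values, bins=40, color='blue', alpha=0.7, edgecolor='black')
    plot.title(title, pad=20)
    plot.xlabel(feature_name)
    plot.ylabel('Frequency')
    plot.xlim(left=feature_values.min(), right=feature_values.max())
    plot.grid(True)

    mean = feature_values.mean()
    standard_deviation = feature_values.std()
    plot_standard_deviation_lines('red', mean, standard_deviation, standard_deviation_factor=0)     # mean
    plot_standard_deviation_lines('orange', mean, standard_deviation, standard_deviation_factor=1)  # z-score of 1
    plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=2)   # z-score of 2

    plot.tight_layout()
    plot.savefig(plot_file_path)


# Example usage with synthetic data standing in for a graph metric such as betweenness centrality.
example_values = pd.Series(np.random.default_rng(42).exponential(scale=1.0, size=500))
plot_feature_distribution(example_values, 'Betweenness', 'Example betweenness centrality distribution', 'BetweennessCentrality_distribution.svg')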
