Skip to content

Commit d075eaf

Browse files
committed
Create a subdirectory for each code unit type (language and node label) analyzed by anomaly detection
1 parent 88710fa commit d075eaf

File tree

4 files changed

+67
-57
lines changed

4 files changed

+67
-57
lines changed

domains/anomaly-detection/anomalyDetectionCsv.sh

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -73,18 +73,22 @@ anomaly_detection_queries() {
7373

7474
local language
7575
language=$( extractQueryParameter "projection_language" "${@}" )
76-
76+
77+
# Within the absolute (full) report directory for anomaly detection, create a subdirectory for every detailed type (Java_Package, Java_Type, ...)
78+
local detail_report_directory="${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}"
79+
mkdir -p "${detail_report_directory}"
80+
7781
echo "anomalyDetectionCsv: $(date +'%Y-%m-%dT%H:%M:%S%z') Executing Queries for ${nodeLabel} nodes..."
78-
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPotentialImbalancedRoles.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_PotentialImbalancedRoles.csv"
79-
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPotentialOverEngineerOrIsolated.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_PotentialOverEngineerOrIsolated.csv"
82+
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPotentialImbalancedRoles.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_PotentialImbalancedRoles.csv"
83+
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPotentialOverEngineerOrIsolated.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_PotentialOverEngineerOrIsolated.csv"
8084

81-
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionHiddenBridgeNodes.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_HiddenBridgeNodes.csv"
82-
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPopularBottlenecks.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_PopularBottlenecks.csv"
83-
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionSilentCoordinators.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_SilentCoordinators.csv"
84-
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionOverReferencesUtilities.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_OverReferencesUtilities.csv"
85-
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionFragileStructuralBridges.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_FragileStructuralBridges.csv"
86-
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionDependencyHungryOrchestrators.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_DependencyHungryOrchestrators.csv"
87-
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionUnexpectedCentralNodes.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_UnexpectedCentralNodes.csv"
85+
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionHiddenBridgeNodes.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_HiddenBridgeNodes.csv"
86+
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPopularBottlenecks.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_PopularBottlenecks.csv"
87+
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionSilentCoordinators.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_SilentCoordinators.csv"
88+
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionOverReferencesUtilities.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_OverReferencesUtilities.csv"
89+
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionFragileStructuralBridges.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_FragileStructuralBridges.csv"
90+
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionDependencyHungryOrchestrators.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_DependencyHungryOrchestrators.csv"
91+
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionUnexpectedCentralNodes.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_UnexpectedCentralNodes.csv"
8892
}
8993

9094
# Label code units with top anomalies by archetype.
@@ -99,11 +103,15 @@ anomaly_detection_labels() {
99103
local language
100104
language=$( extractQueryParameter "projection_language" "${@}" )
101105

106+
# Within the absolute (full) report directory for anomaly detection, create a subdirectory for every detailed type (Java_Package, Java_Type, ...)
107+
local detail_report_directory="${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}"
108+
mkdir -p "${detail_report_directory}"
109+
102110
echo "anomalyDetectionCsv: $(date +'%Y-%m-%dT%H:%M:%S%z') Labelling ${language} ${nodeLabel} anomalies..."
103111
execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeRemoveLabels.cypher" "${@}"
104-
execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeAuthority.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyArchetypeTopAuthority.csv"
105-
execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBottleneck.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyArchetypeTopBottleneck.csv"
106-
execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeHub.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyArchetypeTopHub.csv"
112+
execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeAuthority.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTopAuthority.csv"
113+
execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBottleneck.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTopBottleneck.csv"
114+
execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeHub.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTopHub.csv"
107115
# The following two label types require Python scripts to run first and are skipped here intentionally:
108116
# execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBridge.cypher" "${@}"
109117
# execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeOutlier.cypher" "${@}"

domains/anomaly-detection/anomalyDetectionFeaturePlots.py

Lines changed: 23 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ def __is_code_language_available(self) -> bool:
9898
def __get_projection_language(self) -> str:
9999
return self.query_parameters_["projection_language"] if self.__is_code_language_available() else ""
100100

101-
def get_plot_prefix(self) -> str:
101+
def get_title_prefix(self) -> str:
102102
if self.__is_code_language_available():
103103
return self.__get_projection_language() + " " + self.__get_projection_node_label()
104104
return self.__get_projection_node_label()
@@ -815,7 +815,7 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
815815
# ------------------------------------------------------------------------------------------------------------
816816

817817
parameters = parse_input_parameters()
818-
plot_prefix = parameters.get_plot_prefix()
818+
title_prefix = parameters.get_title_prefix()
819819
report_directory = parameters.get_report_directory()
820820

821821
driver = get_graph_database_driver()
@@ -828,21 +828,21 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
828828
data['pageRank'],
829829
data['articleRank'],
830830
data['shortCodeUnitName'],
831-
title=f"{plot_prefix} distribution of PageRank - ArticleRank differences",
832-
plot_file_path=get_file_path(f"{plot_prefix}_PageRank_Minus_ArticleRank_Distribution", parameters)
831+
title=f"{title_prefix} distribution of PageRank - ArticleRank differences",
832+
plot_file_path=get_file_path("PageRank_Minus_ArticleRank_Distribution", parameters)
833833
)
834834

835835
plot_feature_distribution(
836836
feature_values=data['clusteringCoefficient'],
837837
feature_name='Clustering Coefficient',
838-
title=f"{plot_prefix} clustering coefficient distribution",
838+
title=f"{title_prefix} clustering coefficient distribution",
839839
plot_file_path=get_file_path("ClusteringCoefficient_distribution", parameters)
840840
)
841841

842842
plot_feature_distribution(
843843
feature_values=data['betweenness'],
844844
feature_name='Betweenness',
845-
title=f"{plot_prefix} betweenness centrality distribution",
845+
title=f"{title_prefix} betweenness centrality distribution",
846846
plot_file_path=get_file_path("BetweennessCentrality_distribution", parameters)
847847
)
848848

@@ -851,15 +851,15 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
851851
data['pageRank'],
852852
data['shortCodeUnitName'],
853853
data['clusterNoise'],
854-
title=f"{plot_prefix} clustering coefficient versus PageRank",
855-
plot_file_path=get_file_path(f"{plot_prefix}_ClusteringCoefficient_versus_PageRank", parameters)
854+
title=f"{title_prefix} clustering coefficient versus PageRank",
855+
plot_file_path=get_file_path("ClusteringCoefficient_versus_PageRank", parameters)
856856
)
857857

858858
if (overall_cluster_count < 20):
859859
print(f"anomalyDetectionFeaturePlots: Less than 20 clusters: {overall_cluster_count}. Only one plot containing all clusters will be created.")
860860
plot_clusters(
861861
clustering_visualization_dataframe=data,
862-
title=f"{plot_prefix} all clusters overall",
862+
title=f"{title_prefix} all clusters overall",
863863
plot_file_path=get_file_path("Clusters_Overall", parameters)
864864
)
865865
else:
@@ -869,57 +869,57 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
869869
)
870870
plot_clusters(
871871
clustering_visualization_dataframe=clusters_by_largest_size,
872-
title=f"{plot_prefix} clusters with the largest size",
873-
plot_file_path=get_file_path(f"{plot_prefix}_Clusters_largest_size", parameters)
872+
title=f"{title_prefix} clusters with the largest size",
873+
plot_file_path=get_file_path("Clusters_largest_size", parameters)
874874
)
875875

876876
clusters_by_largest_max_radius = get_clusters_by_criteria(
877877
data, by='clusterRadiusMax', ascending=False, cluster_count=20
878878
)
879879
plot_clusters(
880880
clustering_visualization_dataframe=clusters_by_largest_max_radius,
881-
title=f"{plot_prefix} clusters with the largest max radius",
882-
plot_file_path=get_file_path(f"{plot_prefix}_Clusters_largest_max_radius", parameters)
881+
title=f"{title_prefix} clusters with the largest max radius",
882+
plot_file_path=get_file_path("Clusters_largest_max_radius", parameters)
883883
)
884884

885885
clusters_by_largest_average_radius = get_clusters_by_criteria(
886886
data, by='clusterRadiusAverage', ascending=False, cluster_count=20
887887
)
888888
plot_clusters(
889889
clustering_visualization_dataframe=clusters_by_largest_average_radius,
890-
title=f"{plot_prefix} clusters with the largest average radius",
891-
plot_file_path=get_file_path(f"{plot_prefix}_Clusters_largest_average_radius", parameters)
890+
title=f"{title_prefix} clusters with the largest average radius",
891+
plot_file_path=get_file_path("Clusters_largest_average_radius", parameters)
892892
)
893893

894894
plot_clusters_probabilities(
895895
clustering_visualization_dataframe=data,
896-
title=f"{plot_prefix} clustering probabilities (red=high uncertainty)",
897-
plot_file_path=get_file_path(f"{plot_prefix}_Cluster_probabilities", parameters)
896+
title=f"{title_prefix} clustering probabilities (red=high uncertainty)",
897+
plot_file_path=get_file_path("Cluster_probabilities", parameters)
898898
)
899899

900900
plot_cluster_noise(
901901
clustering_visualization_dataframe=data,
902-
title=f"{plot_prefix} clustering noise points that are surprisingly central (red) or popular (size)",
902+
title=f"{title_prefix} clustering noise points that are surprisingly central (red) or popular (size)",
903903
size_column_name='degree',
904904
color_column_name='pageRank',
905-
plot_file_path=get_file_path(f"{plot_prefix}_ClusterNoise_highly_central_and_popular", parameters)
905+
plot_file_path=get_file_path("ClusterNoise_highly_central_and_popular", parameters)
906906
)
907907

908908
plot_cluster_noise(
909909
clustering_visualization_dataframe=data,
910-
title=f"{plot_prefix} clustering noise points that bridge flow (red) and are poorly integrated (size)",
910+
title=f"{title_prefix} clustering noise points that bridge flow (red) and are poorly integrated (size)",
911911
size_column_name='inverseClusteringCoefficient',
912912
color_column_name='betweenness',
913-
plot_file_path=get_file_path(f"{plot_prefix}_ClusterNoise_poorly_integrated_bridges", parameters),
913+
plot_file_path=get_file_path("ClusterNoise_poorly_integrated_bridges", parameters),
914914
downscale_normal_sizes=0.4
915915
)
916916

917917
plot_cluster_noise(
918918
clustering_visualization_dataframe=data,
919-
title=f"{plot_prefix} clustering noise points with role inversion (size) possibly violating layering or dependency direction (red)",
919+
title=f"{title_prefix} clustering noise points with role inversion (size) possibly violating layering or dependency direction (red)",
920920
size_column_name='pageToArticleRankDifference',
921921
color_column_name='betweenness',
922-
plot_file_path=get_file_path(f"{plot_prefix}_ClusterNoise_role_inverted_bridges", parameters)
922+
plot_file_path=get_file_path("ClusterNoise_role_inverted_bridges", parameters)
923923
)
924924

925925
driver.close()

domains/anomaly-detection/anomalyDetectionPython.sh

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -125,18 +125,22 @@ anomaly_detection_using_python() {
125125

126126
echo "anomalyDetectionPipeline: $(date +'%Y-%m-%dT%H:%M:%S%z') Executing Python scripts for ${language} ${nodeLabel} nodes..."
127127

128+
# Within the absolute (full) report directory for anomaly detection, create a subdirectory for every detailed type (Java_Package, Java_Type, ...)
129+
local detail_report_directory="${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}"
130+
mkdir -p "${detail_report_directory}"
131+
128132
# Get tuned Leiden communities as a reference to tune clustering
129133
time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedLeidenCommunityDetection.py" "${@}" ${verboseMode}
130134
# Tuned Fast Random Projection and tuned HDBSCAN clustering
131135
time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedNodeEmbeddingClustering.py" "${@}" ${verboseMode}
132136
# Reduce the dimensionality of the node embeddings down to 2D for visualization using UMAP
133137
time "${ANOMALY_DETECTION_SCRIPT_DIR}/umap2dNodeEmbeddings.py" "${@}" ${verboseMode}
134138
# Plot the results with clustering and UMAP embeddings to reveal anomalies in rare feature combinations
135-
time "${ANOMALY_DETECTION_SCRIPT_DIR}/anomalyDetectionFeaturePlots.py" "${@}" "--report_directory" "${FULL_REPORT_DIRECTORY}" ${verboseMode}
139+
time "${ANOMALY_DETECTION_SCRIPT_DIR}/anomalyDetectionFeaturePlots.py" "${@}" "--report_directory" "${detail_report_directory}" ${verboseMode}
136140
# Run an unsupervised anomaly detection algorithm including tuning and explainability
137-
time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedAnomalyDetectionExplained.py" "${@}" "--report_directory" "${FULL_REPORT_DIRECTORY}" ${verboseMode}
141+
time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedAnomalyDetectionExplained.py" "${@}" "--report_directory" "${detail_report_directory}" ${verboseMode}
138142
# Query Results: Output all collected features into a CSV file.
139-
execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeatures.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}AnomalyDetection_Features.csv"
143+
execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeatures.cypher" "${@}" > "${detail_report_directory}/Anomaly_Features.csv"
140144
}
141145

142146
# Label code units with top anomalies by archetype.

0 commit comments

Comments
 (0)