Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/public-analyze-code-graph.yml
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ jobs:
run: echo "Please specify either the input parameter 'artifacts-upload-name' or 'sources-upload-name'."; exit 1

- name: Assemble ENVIRONMENT_INFO
run: echo "ENVIRONMENT_INFO=-java-${{ matrix.java }}-python-${{ matrix.python }}-miniforge-${{ matrix.miniforge }}" >> $GITHUB_ENV
run: echo "ENVIRONMENT_INFO=java-${{ matrix.java }}-python-${{ matrix.python }}-miniforge-${{ matrix.miniforge }}" >> $GITHUB_ENV

- name: (Code Analysis Setup) Checkout code-graph-analysis-pipeline
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5
Expand Down
62 changes: 51 additions & 11 deletions domains/anomaly-detection/anomalyDetectionCsv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# Pipeline that coordinates anomaly detection using the Graph Data Science Library of Neo4j.
# It requires an already running Neo4j graph database with already scanned and analyzed artifacts.
# The results will be written into the sub directory reports/anomaly-detection.
# Dynamically triggered by "CsvReports.sh".

# Note that "scripts/prepareAnalysis.sh" is required to run prior to this script.

Expand All @@ -25,8 +26,9 @@ SCRIPTS_DIR=${SCRIPTS_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/../../scripts"} # Re
# Get the "cypher" query directory for gathering features.
ANOMALY_DETECTION_FEATURE_CYPHER_DIR=${ANOMALY_DETECTION_FEATURE_CYPHER_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/features"}
ANOMALY_DETECTION_QUERY_CYPHER_DIR=${ANOMALY_DETECTION_QUERY_CYPHER_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/queries"}
ANOMALY_DETECTION_LABEL_CYPHER_DIR=${ANOMALY_DETECTION_LABEL_CYPHER_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/labels"}

# Define functions to execute a cypher query from within a given file (first and only argument) like "execute_cypher"
# Define functions to execute a cypher query from within a given file (first and only argument) like "execute_cypher" and "execute_cypher_summarized"
source "${SCRIPTS_DIR}/executeQueryFunctions.sh"

# Define functions to create and delete Graph Projections like "createUndirectedDependencyProjection"
Expand Down Expand Up @@ -60,29 +62,64 @@ anomaly_detection_features() {
execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-ArticleRank-Exists.cypher" \
"${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-ArticleRank-Write.cypher" "${@}"
}

# Run queries to find anomalies in the graph and write one CSV report per query.
#
# All reports are written into the sub directory "<language>_<nodeLabel>" (e.g. "Java_Package")
# of FULL_REPORT_DIRECTORY, which is created if it does not exist yet.
# Every query is executed exactly once, in the order listed below.
#
# Globals (read): FULL_REPORT_DIRECTORY, ANOMALY_DETECTION_QUERY_CYPHER_DIR
#
# Required Parameters:
# - projection_node_label=...
#   Label of the nodes that will be used for the projection. Example: "Package"
# - projection_language=...
#   Name of the associated programming language. Examples: "Java", "Typescript"
anomaly_detection_queries() {
    local nodeLabel
    nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )

    local language
    language=$( extractQueryParameter "projection_language" "${@}" )

    # Within the absolute (full) report directory for anomaly detection, create a sub directory for every detailed type (Java_Package, Java_Type,...)
    local detail_report_directory="${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}"
    mkdir -p "${detail_report_directory}"

    echo "anomalyDetectionCsv: $(date +'%Y-%m-%dT%H:%M:%S%z') Executing Queries for ${nodeLabel} nodes..."

    # Query file "AnomalyDetection<Name>.cypher" produces report "AnomalyDetection_<Name>.csv".
    # All given parameters are passed through unchanged to every query.
    local query_name
    for query_name in \
        "PotentialImbalancedRoles" \
        "PotentialOverEngineerOrIsolated" \
        "HiddenBridgeNodes" \
        "PopularBottlenecks" \
        "SilentCoordinators" \
        "OverReferencesUtilities" \
        "FragileStructuralBridges" \
        "DependencyHungryOrchestrators" \
        "UnexpectedCentralNodes"
    do
        execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetection${query_name}.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_${query_name}.csv"
    done
}

# Label code units with top anomalies by archetype.
#
# First runs the query that removes existing archetype labels, then runs one labelling query
# per archetype and writes its result as CSV into the sub directory "<language>_<nodeLabel>"
# (e.g. "Java_Package") of FULL_REPORT_DIRECTORY, which is created if it does not exist yet.
# Only label queries are executed here; anomaly queries belong to "anomaly_detection_queries".
#
# Globals (read): FULL_REPORT_DIRECTORY, ANOMALY_DETECTION_LABEL_CYPHER_DIR
#
# Required Parameters:
# - projection_node_label=...
#   Label of the nodes that will be used for the projection. Example: "Package"
# - projection_language=...
#   Name of the associated programming language. Examples: "Java", "Typescript"
anomaly_detection_labels() {
    local nodeLabel
    nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )

    local language
    language=$( extractQueryParameter "projection_language" "${@}" )

    # Within the absolute (full) report directory for anomaly detection, create a sub directory for every detailed type (Java_Package, Java_Type,...)
    local detail_report_directory="${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}"
    mkdir -p "${detail_report_directory}"

    echo "anomalyDetectionCsv: $(date +'%Y-%m-%dT%H:%M:%S%z') Labelling ${language} ${nodeLabel} anomalies..."

    # Remove labels first. Its output is intentionally not redirected into a report file.
    execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeRemoveLabels.cypher" "${@}"

    # Query file "AnomalyDetectionArchetype<Name>.cypher" produces report "AnomalyArchetypeTop<Name>.csv".
    local archetype
    for archetype in "Authority" "Bottleneck" "Hub"; do
        execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetype${archetype}.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTop${archetype}.csv"
    done

    # The following two label types require Python scripts to run first and are skipped here intentionally:
    # execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBridge.cypher" "${@}"
    # execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeOutlier.cypher" "${@}"
}

# Run the anomaly detection pipeline.
Expand All @@ -94,9 +131,12 @@ anomaly_detection_queries() {
# Label of the nodes that will be used for the projection. Example: "Package"
# - projection_weight_property=...
# Name of the node property that contains the dependency weight. Example: "weight"
# - projection_language=...
# Name of the associated programming language. Examples: "Java", "Typescript"
anomaly_detection_csv_reports() {
    # The stages run strictly in this order. Each one is timed on its own
    # and receives all arguments of this function unchanged.
    local pipeline_stage
    for pipeline_stage in anomaly_detection_features anomaly_detection_queries anomaly_detection_labels; do
        time "${pipeline_stage}" "${@}"
    done
}

# Create report directory
Expand Down
89 changes: 50 additions & 39 deletions domains/anomaly-detection/anomalyDetectionFeaturePlots.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def __is_code_language_available(self) -> bool:
def __get_projection_language(self) -> str:
    """Return the "projection_language" query parameter, or "" if no code language is available."""
    return self.query_parameters_["projection_language"] if self.__is_code_language_available() else ""

def get_title_prefix(self) -> str:
    """
    Return the text that plot titles start with.

    Returns
    -------
    str
        "<language> <node label>" (e.g. "Java Package") if a code language is available,
        otherwise only the node label.
    """
    if self.__is_code_language_available():
        return self.__get_projection_language() + " " + self.__get_projection_node_label()
    return self.__get_projection_node_label()

# Backward-compatible alias: the method was formerly named "get_plot_prefix"
# and callers using the old name keep working.
get_plot_prefix = get_title_prefix
Expand Down Expand Up @@ -378,37 +378,40 @@ def annotate_outliers(outliers: pd.DataFrame) -> None:
plot.savefig(plot_file_path)


def plot_clustering_coefficient_distribution(clustering_coefficients: pd.Series, title: str, plot_file_path: str) -> None:
def plot_feature_distribution(feature_values: pd.Series, feature_name: str, title: str, plot_file_path: str) -> None:
"""
Plots the distribution of clustering coefficients.

Plots the distribution of feature's values.
Parameters
----------
clustering_coefficients : pd.Series
Series containing clustering coefficient values.
feature_values : pd.Series
Series containing feature values.
text_prefix: str
Text at the beginning of the title
"""
if clustering_coefficients.empty:
if feature_values.empty:
print("No data available to plot.")
return

plot.figure(figsize=(10, 6))
plot.figure(figsize=(10, 6))
plot.hist(clustering_coefficients, bins=40, color='blue', alpha=0.7, edgecolor='black')
plot.hist(feature_values, bins=40, color='blue', alpha=0.7, edgecolor='black')
plot.title(title, pad=20)
plot.xlabel('Clustering Coefficient')
plot.xlabel(feature_name)
plot.ylabel('Frequency')
plot.xlim(left=clustering_coefficients.min(), right=clustering_coefficients.max())
plot.xlim(left=feature_values.min(), right=feature_values.max())
# plot.yscale('log') # Use logarithmic scale for better visibility of differences
plot.grid(True)
plot.tight_layout()

mean = clustering_coefficients.mean()
standard_deviation = clustering_coefficients.std()
mean = feature_values.mean()
standard_deviation = feature_values.std()

# Vertical line for the mean
plot_standard_deviation_lines('red', mean, standard_deviation, standard_deviation_factor=0)
# Vertical line for 1 x standard deviations + mean (=z-score of 1)
plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=1)
plot_standard_deviation_lines('orange', mean, standard_deviation, standard_deviation_factor=1)
# Vertical line for 2 x standard deviations + mean (=z-score of 2)
plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=2)

plot.tight_layout()
plot.savefig(plot_file_path)
Expand Down Expand Up @@ -812,7 +815,7 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
# ------------------------------------------------------------------------------------------------------------

parameters = parse_input_parameters()
plot_prefix = parameters.get_plot_prefix()
title_prefix = parameters.get_title_prefix()
report_directory = parameters.get_report_directory()

driver = get_graph_database_driver()
Expand All @@ -825,31 +828,39 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
data['pageRank'],
data['articleRank'],
data['shortCodeUnitName'],
title=f"{plot_prefix} distribution of PageRank - ArticleRank differences",
plot_file_path=get_file_path(f"{plot_prefix}_PageRank_Minus_ArticleRank_Distribution", parameters)
title=f"{title_prefix} distribution of PageRank - ArticleRank differences",
plot_file_path=get_file_path("PageRank_Minus_ArticleRank_Distribution", parameters)
)

plot_clustering_coefficient_distribution(
data['clusteringCoefficient'],
title=f"{plot_prefix} distribution of clustering coefficients",
plot_file_path=get_file_path(f"{plot_prefix}_ClusteringCoefficient_distribution", parameters)
plot_feature_distribution(
feature_values=data['clusteringCoefficient'],
feature_name='Clustering Coefficient',
title=f"{title_prefix} clustering coefficient distribution",
plot_file_path=get_file_path("ClusteringCoefficient_distribution", parameters)
)

plot_feature_distribution(
feature_values=data['betweenness'],
feature_name='Betweenness',
title=f"{title_prefix} betweenness centrality distribution",
plot_file_path=get_file_path("BetweennessCentrality_distribution", parameters)
)

plot_clustering_coefficient_vs_page_rank(
data['clusteringCoefficient'],
data['pageRank'],
data['shortCodeUnitName'],
data['clusterNoise'],
title=f"{plot_prefix} clustering coefficient versus PageRank",
plot_file_path=get_file_path(f"{plot_prefix}_ClusteringCoefficient_versus_PageRank", parameters)
title=f"{title_prefix} clustering coefficient versus PageRank",
plot_file_path=get_file_path("ClusteringCoefficient_versus_PageRank", parameters)
)

if (overall_cluster_count < 20):
print(f"anomalyDetectionFeaturePlots: Less than 20 clusters: {overall_cluster_count}. Only one plot containing all clusters will be created.")
plot_clusters(
clustering_visualization_dataframe=data,
title=f"{plot_prefix} all clusters overall (less than 20)",
plot_file_path=get_file_path(f"{plot_prefix}_Clusters_Overall", parameters)
title=f"{title_prefix} all clusters overall",
plot_file_path=get_file_path("Clusters_Overall", parameters)
)
else:
print(f"anomalyDetectionFeaturePlots: More than 20 clusters: {overall_cluster_count}. Different plots focussing on different features like cluster size will be created.")
Expand All @@ -858,57 +869,57 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
)
plot_clusters(
clustering_visualization_dataframe=clusters_by_largest_size,
title=f"{plot_prefix} clusters with the largest size",
plot_file_path=get_file_path(f"{plot_prefix}_Clusters_largest_size", parameters)
title=f"{title_prefix} clusters with the largest size",
plot_file_path=get_file_path("Clusters_largest_size", parameters)
)

clusters_by_largest_max_radius = get_clusters_by_criteria(
data, by='clusterRadiusMax', ascending=False, cluster_count=20
)
plot_clusters(
clustering_visualization_dataframe=clusters_by_largest_max_radius,
title=f"{plot_prefix} clusters with the largest max radius",
plot_file_path=get_file_path(f"{plot_prefix}_Clusters_largest_max_radius", parameters)
title=f"{title_prefix} clusters with the largest max radius",
plot_file_path=get_file_path("Clusters_largest_max_radius", parameters)
)

clusters_by_largest_average_radius = get_clusters_by_criteria(
data, by='clusterRadiusAverage', ascending=False, cluster_count=20
)
plot_clusters(
clustering_visualization_dataframe=clusters_by_largest_average_radius,
title=f"{plot_prefix} clusters with the largest average radius",
plot_file_path=get_file_path(f"{plot_prefix}_Clusters_largest_average_radius", parameters)
title=f"{title_prefix} clusters with the largest average radius",
plot_file_path=get_file_path("Clusters_largest_average_radius", parameters)
)

plot_clusters_probabilities(
clustering_visualization_dataframe=data,
title=f"{plot_prefix} clustering probabilities (red=high uncertainty)",
plot_file_path=get_file_path(f"{plot_prefix}_Cluster_probabilities", parameters)
title=f"{title_prefix} clustering probabilities (red=high uncertainty)",
plot_file_path=get_file_path("Cluster_probabilities", parameters)
)

plot_cluster_noise(
clustering_visualization_dataframe=data,
title=f"{plot_prefix} clustering noise points that are surprisingly central (red) or popular (size)",
title=f"{title_prefix} clustering noise points that are surprisingly central (red) or popular (size)",
size_column_name='degree',
color_column_name='pageRank',
plot_file_path=get_file_path(f"{plot_prefix}_ClusterNoise_highly_central_and_popular", parameters)
plot_file_path=get_file_path("ClusterNoise_highly_central_and_popular", parameters)
)

plot_cluster_noise(
clustering_visualization_dataframe=data,
title=f"{plot_prefix} clustering noise points that bridge flow (red) and are poorly integrated (size)",
title=f"{title_prefix} clustering noise points that bridge flow (red) and are poorly integrated (size)",
size_column_name='inverseClusteringCoefficient',
color_column_name='betweenness',
plot_file_path=get_file_path(f"{plot_prefix}_ClusterNoise_poorly_integrated_bridges", parameters),
plot_file_path=get_file_path("ClusterNoise_poorly_integrated_bridges", parameters),
downscale_normal_sizes=0.4
)

plot_cluster_noise(
clustering_visualization_dataframe=data,
title=f"{plot_prefix} clustering noise points with role inversion (size) possibly violating layering or dependency direction (red)",
title=f"{title_prefix} clustering noise points with role inversion (size) possibly violating layering or dependency direction (red)",
size_column_name='pageToArticleRankDifference',
color_column_name='betweenness',
plot_file_path=get_file_path(f"{plot_prefix}_ClusterNoise_role_inverted_bridges", parameters)
plot_file_path=get_file_path("ClusterNoise_role_inverted_bridges", parameters)
)

driver.close()
Loading
Loading