Skip to content

Commit d5065d7

Browse files
committed
Add sum of node embeddings contributing to the anomaly score (SHAP value)
1 parent 0fc854e commit d5065d7

File tree

2 files changed

+180
-1
lines changed

2 files changed

+180
-1
lines changed

domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1644,6 +1644,55 @@
16441644
" return anomaly_detected_features\n"
16451645
]
16461646
},
1647+
{
1648+
"cell_type": "code",
1649+
"execution_count": null,
1650+
"id": "486948e3",
1651+
"metadata": {},
1652+
"outputs": [],
1653+
"source": [
1654+
"def add_node_embedding_shap_sum(\n",
1655+
" shap_anomaly_values: np.ndarray,\n",
1656+
" feature_names: list[str],\n",
1657+
" anomaly_detected_features: pd.DataFrame,\n",
1658+
" anomaly_label_column: str = \"anomalyLabel\",\n",
1659+
" output_column_name: str = \"anomalyNodeEmbeddingSHAPSum\"\n",
1660+
") -> pd.DataFrame:\n",
1661+
" \"\"\"\n",
1662+
" Adds a column with the sum of SHAP values for all features that start with 'nodeEmbedding'.\n",
1663+
" The sum is signed, so that negative values contributing to an anomaly are reduced by positive numbers indicating \"normal\" tendencies.\n",
1664+
"\n",
1665+
" Parameters:\n",
1666+
" - shap_anomaly_values: SHAP values array with shape (n_samples, n_features).\n",
1667+
" - feature_names: List of names corresponding to the features.\n",
1668+
" - anomaly_detected_features: Original DataFrame containing anomaly labels.\n",
1669+
" - anomaly_label_column: Name of the column indicating anomalies (1 = anomaly).\n",
1670+
" - output_column_name: Name of the new column to store the SHAP sum.\n",
1671+
"\n",
1672+
" Returns:\n",
1673+
" - DataFrame with an additional column containing the summed SHAP values for nodeEmbedding features.\n",
1674+
" \"\"\"\n",
1675+
" # Convert SHAP values into a DataFrame for easier manipulation\n",
1676+
" shap_values_dataframe = pd.DataFrame(shap_anomaly_values, columns=feature_names)\n",
1677+
"\n",
1678+
" # Identify all features whose names start with \"nodeEmbedding\"\n",
1679+
" node_embedding_features = [name for name in feature_names if name.startswith(\"nodeEmbedding\")]\n",
1680+
"\n",
1681+
" # Default initialize new column\n",
1682+
" anomaly_detected_features[output_column_name] = 0.0\n",
1683+
"\n",
1684+
" # Get indices of rows marked as anomalies\n",
1685+
" anomaly_indices = anomaly_detected_features[anomaly_detected_features[anomaly_label_column] == 1].index\n",
1686+
"\n",
1687+
" # Compute raw signed sum of SHAP values for each anomaly row\n",
1688+
" for row_index in anomaly_indices:\n",
1689+
" row_shap_values = shap_values_dataframe.loc[row_index, node_embedding_features]\n",
1690+
" shap_sum = row_shap_values.sum() # signed sum\n",
1691+
" anomaly_detected_features.at[row_index, output_column_name] = shap_sum\n",
1692+
"\n",
1693+
" return anomaly_detected_features"
1694+
]
1695+
},
16471696
{
16481697
"cell_type": "code",
16491698
"execution_count": null,
@@ -1656,6 +1705,11 @@
16561705
" feature_names=java_package_anomaly_detection_feature_names,\n",
16571706
" anomaly_detected_features=java_package_anomaly_detection_features\n",
16581707
")\n",
1708+
"add_node_embedding_shap_sum(\n",
1709+
" shap_anomaly_values=java_package_anomalies_explanation_results.shap_anomaly_values,\n",
1710+
" feature_names=java_package_anomaly_detection_feature_names,\n",
1711+
" anomaly_detected_features=java_package_anomaly_detection_features \n",
1712+
")\n",
16591713
"display(java_package_anomaly_detection_features[java_package_anomaly_detection_features[\"anomalyLabel\"] == 1].sort_values(by='anomalyScore', ascending=False).head(10))"
16601714
]
16611715
},

domains/anomaly-detection/tunedAnomalyDetectionExplained.py

Lines changed: 126 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,7 @@ def query_data(input_parameters: Parameters = Parameters.example()) -> pd.DataFr
260260
,codeUnit.centralityArticleRank AS articleRank
261261
,codeUnit.centralityPageRank - codeUnit.centralityArticleRank AS pageToArticleRankDifference
262262
,codeUnit.centralityBetweenness AS betweenness
263-
,codeUnit.communityLocalClusteringCoefficient AS locallusteringCoefficient
263+
,codeUnit.communityLocalClusteringCoefficient AS localClusteringCoefficient
264264
,1.0 - codeUnit.clusteringHDBSCANProbability AS clusterApproximateOutlierScore
265265
,codeUnit.clusteringHDBSCANNoise AS clusterNoise
266266
,codeUnit.clusteringHDBSCANRadiusAverage AS clusterRadiusAverage
@@ -754,6 +754,73 @@ def plot_shap_explained_beeswarm(
754754
plot.close()
755755

756756

757+
def plot_shap_explained_local_feature_importance(
    index_to_explain,
    anomalies_explanation_results: AnomaliesExplanationResults,
    prepared_features: np.ndarray,
    feature_names: list[str],
    title: str,
    plot_file_path: str,
    rounding_precision: int = 4,
    contribution_threshold: float = 0.06,
):
    """
    Plots the local SHAP feature importance (force plot) of one data point and saves it to a file.

    The force plot breaks down how each feature pushes the anomaly score of the selected
    data point away from the expected (base) value, which makes it well suited for
    inspecting a single node that has been classified as an anomaly.

    Parameters:
    - index_to_explain: Index used to select the row in both the SHAP values and "prepared_features".
    - anomalies_explanation_results: Provides "shap_anomaly_values" and "shap_expected_anomaly_value".
    - prepared_features: Feature matrix; the row at "index_to_explain" is displayed next to the feature names.
    - feature_names: Names of the features, passed to the force plot.
    - title: Title of the plot.
    - plot_file_path: Path of the file the plot is saved to.
    - rounding_precision: Number of decimals the displayed values are rounded to (default 4).
    - contribution_threshold: Passed through to "shap.force_plot" (default 0.06, the previously hard-coded value).
      NOTE(review): presumably the minimum relative contribution for a feature to be labelled - confirm in the SHAP documentation.
    """
    shap_anomaly_values = anomalies_explanation_results.shap_anomaly_values
    expected_anomaly_value = anomalies_explanation_results.shap_expected_anomaly_value

    # Round everything that is displayed to keep the labels in the plot short
    shap_values_rounded = np.round(shap_anomaly_values[index_to_explain], rounding_precision)
    prepared_features_rounded = prepared_features[index_to_explain].round(rounding_precision)
    base_value_rounded = np.round(expected_anomaly_value, rounding_precision)

    shap.force_plot(
        base_value_rounded,
        shap_values_rounded,
        prepared_features_rounded,
        feature_names=feature_names,
        matplotlib=True,
        show=False,
        contribution_threshold=contribution_threshold
    )

    # Always close the figure, even if styling or saving fails,
    # so that a failed plot doesn't leak into the next one.
    try:
        current_figure = plot.gcf()

        # Resize fonts manually (best effort, affects all text)
        for text in current_figure.findobj(match=plot.Text):
            text.set_fontsize(10)  # Set smaller font

        plot.title(title, fontsize=16, loc='left', y=0.05)
        plot.savefig(plot_file_path)
    finally:
        plot.close()
798+
799+
800+
def plot_all_shap_explained_local_feature_importance(
    data: pd.DataFrame,
    explanation_results: AnomaliesExplanationResults,
    prepared_features: np.ndarray,
    feature_names: list[str],
    parameters: Parameters,
    title_prefix: str = "",
    code_unit_name_column: str = "codeUnitName"
) -> None:
    """
    Plots the local SHAP feature importance for every row of the given data, one file per row.

    The plots are numbered consecutively starting with 1 in the iteration order of "data".

    Parameters:
    - data: Rows to explain. The index label of each row is used to select the matching
      row in the SHAP values and in "prepared_features".
      NOTE(review): this assumes that the index labels equal the positional row numbers
      of those arrays (default RangeIndex of the original features) - confirm against the caller.
    - explanation_results: SHAP explanation results passed on to the single plot function.
    - prepared_features: Feature matrix passed on to the single plot function.
    - feature_names: Names of the features.
    - parameters: Passed to "get_file_path" to resolve the path of each plot file.
    - title_prefix: Prefix for the plot titles and the plot file names.
    - code_unit_name_column: Column containing the name shown in the plot title.
    """
    for anomaly_number, (row_index, row) in enumerate(data.iterrows(), start=1):
        plot_shap_explained_local_feature_importance(
            index_to_explain=typing.cast(int, row_index),
            anomalies_explanation_results=explanation_results,
            prepared_features=prepared_features,
            feature_names=feature_names,
            title=f"{title_prefix} \"{row[code_unit_name_column]}\" anomaly #{anomaly_number} explained",
            plot_file_path=get_file_path(f"{title_prefix}_Anomaly_{anomaly_number}_shap_explanation", parameters),
        )
822+
823+
757824
def plot_shap_explained_top_10_feature_dependence(
758825
shap_anomaly_values: np.ndarray,
759826
prepared_features: np.ndarray,
@@ -838,6 +905,48 @@ def add_top_shap_features_to_anomalies(
838905
return anomaly_detected_features
839906

840907

908+
def add_node_embedding_shap_sum(
    shap_anomaly_values: np.ndarray,
    feature_names: list[str],
    anomaly_detected_features: pd.DataFrame,
    anomaly_label_column: str = "anomalyLabel",
    output_column_name: str = "anomalyNodeEmbeddingSHAPSum"
) -> pd.DataFrame:
    """
    Adds a column with the sum of SHAP values for all features that start with 'nodeEmbedding'.
    The sum is signed, so that negative values contributing to an anomaly are reduced by positive numbers indicating "normal" tendencies.

    Rows of "shap_anomaly_values" are matched to rows of "anomaly_detected_features" by position,
    so the result doesn't depend on the index labels of the DataFrame (filtered, sorted or
    duplicated index labels are all fine). Rows that aren't labelled as anomaly get 0.0.

    Parameters:
    - shap_anomaly_values: SHAP values array with shape (n_samples, n_features).
    - feature_names: List of names corresponding to the features.
    - anomaly_detected_features: Original DataFrame containing anomaly labels. It is modified in place.
    - anomaly_label_column: Name of the column indicating anomalies (1 = anomaly).
    - output_column_name: Name of the new column to store the SHAP sum.

    Returns:
    - The same DataFrame with an additional column containing the summed SHAP values for nodeEmbedding features.

    Raises:
    - ValueError: If the shape of the SHAP values doesn't match the number of rows and feature names.
    """
    shap_values = np.asarray(shap_anomaly_values, dtype=float)
    expected_shape = (len(anomaly_detected_features), len(feature_names))
    if shap_values.shape != expected_shape:
        # Fail loudly instead of silently assigning SHAP sums to the wrong rows
        raise ValueError(
            f"SHAP values with shape {shap_values.shape} don't match the expected shape "
            f"{expected_shape} (rows of the features, number of feature names)"
        )

    # Select all feature columns whose names start with "nodeEmbedding"
    is_node_embedding_feature = np.array(
        [name.startswith("nodeEmbedding") for name in feature_names], dtype=bool
    )

    # Raw signed sum per row. NaN values are skipped, like pandas does by default when summing.
    node_embedding_shap_sums = np.nansum(shap_values[:, is_node_embedding_feature], axis=1)

    # Only rows marked as anomalies get their sum, all others are default initialized with 0.0
    is_anomaly = (anomaly_detected_features[anomaly_label_column] == 1).to_numpy()
    anomaly_detected_features[output_column_name] = np.where(is_anomaly, node_embedding_shap_sums, 0.0)

    return anomaly_detected_features
949+
841950
# ------------------------------------------------------------------------------------------------------------
842951
# MAIN
843952
# ------------------------------------------------------------------------------------------------------------
@@ -913,6 +1022,15 @@ def add_top_shap_features_to_anomalies(
9131022
plot_file_path=get_file_path(f"{plot_prefix}_Anomaly_feature_importance_explained", parameters)
9141023
)
9151024

1025+
plot_all_shap_explained_local_feature_importance(
1026+
data=get_top_10_anomalies(features),
1027+
explanation_results=explanation_results,
1028+
prepared_features=features_prepared,
1029+
feature_names=feature_names,
1030+
parameters=parameters,
1031+
title_prefix=plot_prefix
1032+
)
1033+
9161034
plot_shap_explained_top_10_feature_dependence(
9171035
shap_anomaly_values=explanation_results.shap_anomaly_values,
9181036
prepared_features=features_prepared,
@@ -927,6 +1045,12 @@ def add_top_shap_features_to_anomalies(
9271045
anomaly_detected_features=features
9281046
)
9291047

1048+
add_node_embedding_shap_sum(
1049+
shap_anomaly_values=explanation_results.shap_anomaly_values,
1050+
feature_names=feature_names,
1051+
anomaly_detected_features=features
1052+
)
1053+
9301054
if parameters.is_verbose():
9311055
print("tunedAnomalyDetectionExplained: Features with added anomaly score explanation columns:")
9321056
print(features[features["anomalyLabel"] == 1].sort_values(by='anomalyScore', ascending=False).head(10))
@@ -941,6 +1065,7 @@ def add_top_shap_features_to_anomalies(
9411065
'anomalyTopFeatureSHAPValue1': features['anomalyTopFeatureSHAPValue_1'],
9421066
'anomalyTopFeatureSHAPValue2': features['anomalyTopFeatureSHAPValue_2'],
9431067
'anomalyTopFeatureSHAPValue3': features['anomalyTopFeatureSHAPValue_3'],
1068+
'anomalyNodeEmbeddingSHAPSum': features['anomalyNodeEmbeddingSHAPSum'],
9441069
})
9451070
write_batch_data_into_database(data_to_write, parameters.get_projection_node_label(), verbose=parameters.is_verbose())
9461071

0 commit comments

Comments
 (0)