Add sum of node embedding SHAP values contributing to the anomaly score

JohT · JohT · commit d5065d78b6c9 · 2025-09-15T17:32:46.000+02:00
diff --git a/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb b/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb
@@ -1644,6 +1644,55 @@
     "    return anomaly_detected_features\n"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "486948e3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def add_node_embedding_shap_sum(\n",
+    "    shap_anomaly_values: np.ndarray,\n",
+    "    feature_names: list[str],\n",
+    "    anomaly_detected_features: pd.DataFrame,\n",
+    "    anomaly_label_column: str = \"anomalyLabel\",\n",
+    "    output_column_name: str = \"anomalyNodeEmbeddingSHAPSum\"\n",
+    ") -> pd.DataFrame:\n",
+    "    \"\"\"\n",
+    "    Adds a column with the sum of SHAP values for all features that start with 'nodeEmbedding'.\n",
+    "    The sum is signed, so that negative values contributing to an anomaly are reduced by positive numbers indicating \"normal\" tendencies.\n",
+    "\n",
+    "    Parameters:\n",
+    "    - shap_anomaly_values: SHAP values array with shape (n_samples, n_features).\n",
+    "    - feature_names: List of names corresponding to the features.\n",
+    "    - anomaly_detected_features: DataFrame containing the anomaly labels. It is modified in place (the new column is added to it).\n",
+    "    - anomaly_label_column: Name of the column indicating anomalies (1 = anomaly).\n",
+    "    - output_column_name: Name of the new column to store the SHAP sum.\n",
+    "\n",
+    "    Returns:\n",
+    "    - The given DataFrame (same object) with an additional column containing the summed SHAP values for nodeEmbedding features.\n",
+    "    \"\"\"\n",
+    "    # Convert SHAP values into a DataFrame for easier manipulation\n",
+    "    shap_values_dataframe = pd.DataFrame(shap_anomaly_values, columns=feature_names)\n",
+    "\n",
+    "    # Identify all features whose names start with \"nodeEmbedding\"\n",
+    "    node_embedding_features = [name for name in feature_names if name.startswith(\"nodeEmbedding\")]\n",
+    "\n",
+    "    # Default initialize new column\n",
+    "    anomaly_detected_features[output_column_name] = 0.0\n",
+    "\n",
+    "    # Get indices of rows marked as anomalies\n",
+    "    anomaly_indices = anomaly_detected_features[anomaly_detected_features[anomaly_label_column] == 1].index\n",
+    "\n",
+    "    # Compute raw signed sum of SHAP values for each anomaly row\n",
+    "    for row_index in anomaly_indices:\n",
+    "        row_shap_values = shap_values_dataframe.loc[row_index, node_embedding_features]\n",
+    "        shap_sum = row_shap_values.sum()  # signed sum\n",
+    "        anomaly_detected_features.at[row_index, output_column_name] = shap_sum\n",
+    "\n",
+    "    return anomaly_detected_features"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -1656,6 +1705,11 @@
     "    feature_names=java_package_anomaly_detection_feature_names,\n",
     "    anomaly_detected_features=java_package_anomaly_detection_features\n",
     ")\n",
+    "add_node_embedding_shap_sum(\n",
+    "    shap_anomaly_values=java_package_anomalies_explanation_results.shap_anomaly_values,\n",
+    "    feature_names=java_package_anomaly_detection_feature_names,\n",
+    "    anomaly_detected_features=java_package_anomaly_detection_features \n",
+    ")\n",
     "display(java_package_anomaly_detection_features[java_package_anomaly_detection_features[\"anomalyLabel\"] == 1].sort_values(by='anomalyScore', ascending=False).head(10))"
    ]
   },
diff --git a/domains/anomaly-detection/tunedAnomalyDetectionExplained.py b/domains/anomaly-detection/tunedAnomalyDetectionExplained.py
@@ -260,7 +260,7 @@ def query_data(input_parameters: Parameters = Parameters.example()) -> pd.DataFr
              ,codeUnit.centralityArticleRank                                AS articleRank
              ,codeUnit.centralityPageRank - codeUnit.centralityArticleRank  AS pageToArticleRankDifference
              ,codeUnit.centralityBetweenness                                AS betweenness
-             ,codeUnit.communityLocalClusteringCoefficient                  AS locallusteringCoefficient
+             ,codeUnit.communityLocalClusteringCoefficient                  AS localClusteringCoefficient
              ,1.0 - codeUnit.clusteringHDBSCANProbability                   AS clusterApproximateOutlierScore
              ,codeUnit.clusteringHDBSCANNoise                               AS clusterNoise
              ,codeUnit.clusteringHDBSCANRadiusAverage                       AS clusterRadiusAverage
@@ -754,6 +754,73 @@ def plot_shap_explained_beeswarm(
     plot.close()
 
 
+def plot_shap_explained_local_feature_importance(
+    index_to_explain,
+    anomalies_explanation_results: AnomaliesExplanationResults,
+    prepared_features: np.ndarray,
+    feature_names: list[str],
+    title: str,
+    plot_file_path: str,
+    rounding_precision: int = 4,
+):
+    """
+    Plots the local feature importance of a single data point as a SHAP force plot and saves it to plot_file_path.
+    The entry at index_to_explain is taken from the SHAP anomaly values and from the prepared features.
+    These values and the expected (base) anomaly value are rounded to rounding_precision decimals before plotting.
+    The force plot is a visual breakdown of how each feature contributes to the score of that data point.
+    Highly interpretable for debugging single nodes.
+    """
+    shap_anomaly_values = anomalies_explanation_results.shap_anomaly_values
+    expected_anomaly_value = anomalies_explanation_results.shap_expected_anomaly_value
+
+    shap_values_rounded = np.round(shap_anomaly_values[index_to_explain], rounding_precision)
+    prepared_features_rounded = prepared_features[index_to_explain].round(rounding_precision)
+    base_value_rounded = np.round(expected_anomaly_value, rounding_precision)
+
+    shap.force_plot(
+        base_value_rounded,
+        shap_values_rounded,
+        prepared_features_rounded,
+        feature_names=feature_names,
+        matplotlib=True,  # the resulting figure is fetched via plot.gcf() and post-processed below
+        show=False,  # do not display the plot here; it is adjusted and saved to a file below
+        contribution_threshold=0.06  # NOTE(review): presumably hides labels of features with a small share of the total contribution - confirm in the SHAP docs
+    )
+    current_figure = plot.gcf()
+
+    # Resize fonts manually (best effort, affects all text)
+    for text in current_figure.findobj(match=plot.Text):
+        text.set_fontsize(10)  # Set smaller font
+
+    plot.title(title, fontsize=16, loc='left', y=0.05)  # set after the font resize above, so the title keeps fontsize 16
+    plot.savefig(plot_file_path)
+    plot.close()
+
+
+def plot_all_shap_explained_local_feature_importance(
+        data: pd.DataFrame,
+        explanation_results: AnomaliesExplanationResults,
+        prepared_features: np.ndarray,
+        feature_names: list[str],
+        parameters: Parameters,
+        title_prefix: str = "",
+        code_unit_name_column: str = "codeUnitName"
+    ) -> None:
+    """Plots and saves one SHAP force plot (local feature importance) for every row in data, numbered from 1 in iteration order."""
+    index=0
+    for row_index, row in data.iterrows():
+        row_index = typing.cast(int, row_index)  # only narrows the type for static checkers; the index label is used as array position below
+        index=index+1  # 1-based running number used in the plot title and in the file name
+        plot_shap_explained_local_feature_importance(
+            index_to_explain=row_index,
+            anomalies_explanation_results=explanation_results,
+            prepared_features=prepared_features,
+            feature_names=feature_names,
+            title=f"{title_prefix} \"{row[code_unit_name_column]}\" anomaly #{index} explained",
+            plot_file_path=get_file_path(f"{title_prefix}_Anomaly_{index}_shap_explanation", parameters),
+        )
+
+
 def plot_shap_explained_top_10_feature_dependence(
     shap_anomaly_values: np.ndarray,
     prepared_features: np.ndarray,
@@ -838,6 +905,48 @@ def add_top_shap_features_to_anomalies(
     return anomaly_detected_features
 
 
+def add_node_embedding_shap_sum(
+    shap_anomaly_values: np.ndarray,
+    feature_names: list[str],
+    anomaly_detected_features: pd.DataFrame,
+    anomaly_label_column: str = "anomalyLabel",
+    output_column_name: str = "anomalyNodeEmbeddingSHAPSum"
+) -> pd.DataFrame:
+    """
+    Adds a column with the sum of the SHAP values of all features whose names start with 'nodeEmbedding'. Rows that are not marked as anomalies get 0.0.
+    The sum is signed, so that negative values contributing to an anomaly are reduced by positive numbers indicating "normal" tendencies.
+
+    Parameters:
+    - shap_anomaly_values: SHAP values array with shape (n_samples, n_features). Rows are looked up by the index labels of anomaly_detected_features.
+    - feature_names: List of names corresponding to the features (used as column names for the SHAP values).
+    - anomaly_detected_features: DataFrame containing the anomaly labels. It is modified in place (the new column is added to it).
+    - anomaly_label_column: Name of the column indicating anomalies (1 = anomaly).
+    - output_column_name: Name of the new column to store the SHAP sum.
+
+    Returns:
+    - The given DataFrame (same object) with an additional column containing the summed SHAP values for nodeEmbedding features.
+    """
+    # Convert SHAP values into a DataFrame (default integer index 0..n_samples-1) for easier manipulation
+    shap_values_dataframe = pd.DataFrame(shap_anomaly_values, columns=feature_names)
+
+    # Identify all features whose names start with "nodeEmbedding"
+    node_embedding_features = [name for name in feature_names if name.startswith("nodeEmbedding")]
+
+    # Initialize the new column with 0.0, which stays the value for all rows that are not marked as anomalies
+    anomaly_detected_features[output_column_name] = 0.0
+
+    # Get indices of rows marked as anomalies
+    anomaly_indices = anomaly_detected_features[anomaly_detected_features[anomaly_label_column] == 1].index
+
+    # Compute raw signed sum of SHAP values for each anomaly row (looked up by its index label)
+    for row_index in anomaly_indices:
+        row_shap_values = shap_values_dataframe.loc[row_index, node_embedding_features]
+        shap_sum = row_shap_values.sum()  # signed sum
+        anomaly_detected_features.at[row_index, output_column_name] = shap_sum
+
+    return anomaly_detected_features
+
+
 # ------------------------------------------------------------------------------------------------------------
 #  MAIN
 # ------------------------------------------------------------------------------------------------------------
@@ -913,6 +1022,15 @@ def add_top_shap_features_to_anomalies(
     plot_file_path=get_file_path(f"{plot_prefix}_Anomaly_feature_importance_explained", parameters)
 )
 
+plot_all_shap_explained_local_feature_importance(
+    data=get_top_10_anomalies(features),
+    explanation_results=explanation_results,
+    prepared_features=features_prepared,
+    feature_names=feature_names,
+    parameters=parameters,
+    title_prefix=plot_prefix
+)
+
 plot_shap_explained_top_10_feature_dependence(
     shap_anomaly_values=explanation_results.shap_anomaly_values,
     prepared_features=features_prepared,
@@ -927,6 +1045,12 @@ def add_top_shap_features_to_anomalies(
     anomaly_detected_features=features
 )
 
+add_node_embedding_shap_sum(
+    shap_anomaly_values=explanation_results.shap_anomaly_values,
+    feature_names=feature_names,
+    anomaly_detected_features=features
+)
+
 if parameters.is_verbose():
     print("tunedAnomalyDetectionExplained: Features with added anomaly score explanation columns:")
     print(features[features["anomalyLabel"] == 1].sort_values(by='anomalyScore', ascending=False).head(10))
@@ -941,6 +1065,7 @@ def add_top_shap_features_to_anomalies(
     'anomalyTopFeatureSHAPValue1': features['anomalyTopFeatureSHAPValue_1'],
     'anomalyTopFeatureSHAPValue2': features['anomalyTopFeatureSHAPValue_2'],
     'anomalyTopFeatureSHAPValue3': features['anomalyTopFeatureSHAPValue_3'],
+    'anomalyNodeEmbeddingSHAPSum': features['anomalyNodeEmbeddingSHAPSum'],
 })
 write_batch_data_into_database(data_to_write, parameters.get_projection_node_label(), verbose=parameters.is_verbose())