JohT
diff --git a/‎domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb‎
Lines changed: 208 additions & 97 deletions b/‎domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb‎
Lines changed: 208 additions & 97 deletions
@@ -59,6 +59,8 @@
     "from sklearn.decomposition import PCA\n",
     "from sklearn.ensemble import IsolationForest, RandomForestClassifier\n",
     "\n",
+    "import shap # Explainable AI tool\n",
+    "\n",
     "import matplotlib.pyplot as plot"
    ]
   },
@@ -410,10 +412,10 @@
     "        anomaly_label_column: str = 'anomalyLabel',\n",
     "        anomaly_score_column: str = 'anomalyScore',\n",
     ") -> pd.DataFrame:\n",
-    "    isolation_forest = IsolationForest(n_estimators=200, contamination=0.05, random_state=42)\n",
+    "    isolation_forest = IsolationForest(n_estimators=200, contamination='auto', random_state=42)\n",
     "    anomaly_score = isolation_forest.fit_predict(prepared_features)\n",
     "\n",
-    "    original_features[anomaly_label_column] = anomaly_score * -1 # 1 = anomaly, 0 = no anomaly\n",
+    "    original_features[anomaly_label_column] = (anomaly_score == -1).astype(int)  # 1 = anomaly, 0 = normal\n",
     "    original_features[anomaly_score_column] = isolation_forest.decision_function(prepared_features) * -1  # higher = more anomalous\n",
     "    return original_features"
    ]
@@ -440,7 +442,7 @@
     "        anomaly_label_column: str = \"anomalyLabel\",\n",
     "        anomaly_score_column: str = \"anomalyScore\"\n",
     ") -> pd.DataFrame:\n",
-    "    anomalies = anomaly_detected_features[anomaly_detected_features[anomaly_label_column] == -1]\n",
+    "    anomalies = anomaly_detected_features[anomaly_detected_features[anomaly_label_column] == 1]\n",
     "    return anomalies.sort_values(by=anomaly_score_column, ascending=False).reset_index(drop=True).head(10)"
    ]
   },
@@ -456,96 +458,18 @@
   },
   {
    "cell_type": "markdown",
-   "id": "efa822ca",
+   "id": "a3936d79",
    "metadata": {},
    "source": [
-    "### 1.4 Plot the 20 most influential features\n",
-    "\n",
-    "Use Random Forest as a proxy to estimate the importance of each feature contributing to the anomaly score."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "24427977",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def get_feature_importances(\n",
-    "        anomaly_detected_features: pd.DataFrame, \n",
-    "        prepared_features: numpy_typing.NDArray,\n",
-    "        anomaly_label_column: str = \"anomalyLabel\",\n",
-    ") -> numpy_typing.NDArray:\n",
-    "    \"\"\"\n",
-    "    Use Random Forest as a proxy model to find out which are the most important features for the anomaly detection model (Isolation Forest).\n",
-    "    This helps to see if embedding components dominate (top 10 filled with them), and then tune accordingly.\n",
-    "    \"\"\"\n",
-    "    # Use IsolationForest labels as a \"pseudo ground truth\"\n",
-    "    y_pseudo = (anomaly_detected_features[anomaly_label_column] == -1).astype(int)\n",
-    "\n",
-    "    # Fit classifier to match the IF model\n",
-    "    proxy_random_forest = RandomForestClassifier(n_estimators=100, random_state=42)\n",
-    "    proxy_random_forest.fit(prepared_features, y_pseudo)\n",
-    "\n",
-    "    return proxy_random_forest.feature_importances_"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "97b21d49",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "java_package_anomaly_detection_importances = get_feature_importances(java_package_anomaly_detection_features, java_package_anomaly_detection_features_prepared)\n",
-    "java_package_anomaly_detection_importances_series = pd.Series(java_package_anomaly_detection_importances, index=java_package_anomaly_detection_feature_names).sort_values(ascending=False)\n",
-    "#display(java_type_anomaly_detection_importances_series.head(10))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "14d0b03e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def plot_feature_importances(feature_importances_series: pd.Series, title_prefix: str) -> None:\n",
-    "    feature_importances_series.head(20).plot(\n",
-    "        kind='barh',\n",
-    "        figsize=(10, 6),\n",
-    "        color='skyblue',\n",
-    "        title=f\"{title_prefix}: Top 20 Feature Importances (Random Forest Proxy)\",\n",
-    "        xlabel=\"Importance\"\n",
-    "    )\n",
-    "    plot.gca().invert_yaxis() # Most important feature at the top\n",
-    "    plot.tight_layout()\n",
-    "    plot.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "974a2bae",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "plot_feature_importances(java_package_anomaly_detection_importances_series, title_prefix='Java Packages')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "c9dd6246",
-   "metadata": {},
-   "source": [
-    "### 1.5. Plot anomalies\n",
+    "### 1.4. Plot anomalies\n",
     "\n",
     "Plots clustered nodes and highlights anomalies."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "ab1e76ab",
+   "id": "c5604735",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -640,13 +564,177 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "aea29887",
+   "id": "61ec7904",
    "metadata": {},
    "outputs": [],
    "source": [
     "plot_anomalies(java_package_anomaly_detection_features, title_prefix=\"Java Package Anomalies\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "efa822ca",
+   "metadata": {},
+   "source": [
+    "### 1.5 Print the 10 most influential features\n",
+    "\n",
+    "Use Random Forest as a proxy to estimate the importance of each feature contributing to the anomaly score."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "24427977",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_proxy_random_forest(\n",
+    "        anomaly_detected_features: pd.DataFrame, \n",
+    "        prepared_features: numpy_typing.NDArray,\n",
+    "        anomaly_label_column: str = \"anomalyLabel\",\n",
+    ") -> RandomForestClassifier:\n",
+    "    \"\"\"\n",
+    "    Train a Random Forest classifier as a proxy for the anomaly detection model (Isolation Forest).\n",
+    "\n",
+    "    The labels in `anomaly_label_column` serve as a \"pseudo ground truth\" target, so the fitted proxy\n",
+    "    can be inspected (e.g. its `feature_importances_`) to see which features drive the anomaly labels\n",
+    "    and whether embedding components dominate them.\n",
+    "\n",
+    "    Returns the fitted classifier.\n",
+    "    \"\"\"\n",
+    "    pseudo_ground_truth = anomaly_detected_features[anomaly_label_column]\n",
+    "    classifier = RandomForestClassifier(n_estimators=100, random_state=42)\n",
+    "    # fit() returns the fitted estimator itself\n",
+    "    return classifier.fit(prepared_features, pseudo_ground_truth)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "97b21d49",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Fit the proxy model on the Isolation Forest labels and rank its feature importances\n",
+    "java_package_proxy_random_forest = get_proxy_random_forest(\n",
+    "    anomaly_detected_features=java_package_anomaly_detection_features,\n",
+    "    prepared_features=java_package_anomaly_detection_features_prepared,\n",
+    ")\n",
+    "java_package_anomaly_detection_importances = java_package_proxy_random_forest.feature_importances_\n",
+    "java_package_anomaly_detection_importances_series = pd.Series(\n",
+    "    java_package_anomaly_detection_importances,\n",
+    "    index=java_package_anomaly_detection_feature_names,\n",
+    ").sort_values(ascending=False)\n",
+    "print(java_package_anomaly_detection_importances_series.head(10))"
+   ]
+  },
+  {
+   "cell_type": "raw",
+   "id": "14d0b03e",
+   "metadata": {
+    "vscode": {
+     "languageId": "raw"
+    }
+   },
+   "source": [
+    "# TODO: Remove this cell once the SHAP summary plot has fully replaced this bar chart\n",
+    "def plot_feature_importances(feature_importances_series: pd.Series, title_prefix: str) -> None:\n",
+    "    feature_importances_series.head(20).plot(\n",
+    "        kind='barh',\n",
+    "        figsize=(10, 6),\n",
+    "        color='skyblue',\n",
+    "        title=f\"{title_prefix}: Top 20 Feature Importances (Random Forest Proxy)\",\n",
+    "        xlabel=\"Importance\"\n",
+    "    )\n",
+    "    plot.gca().invert_yaxis() # Most important feature at the top\n",
+    "    plot.tight_layout()\n",
+    "    plot.show()\n",
+    "\n",
+    "plot_feature_importances(java_package_anomaly_detection_importances_series, title_prefix='Java Packages')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "db03216e",
+   "metadata": {},
+   "source": [
+    "### 1.6 Use SHAP to explain the Isolation Forest anomalies via the Random Forest proxy model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e8c5905d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def explain_anomalies_with_shap(\n",
+    "    random_forest_model: RandomForestClassifier,\n",
+    "    anomaly_detected_features: pd.DataFrame,\n",
+    "    prepared_features: numpy_typing.NDArray,\n",
+    "    feature_names: list[str],\n",
+    "    title_prefix: str = \"\",\n",
+    "    anomaly_label_column: str = \"anomalyLabel\",\n",
+    "    anomaly_score_column: str = \"anomalyScore\",\n",
+    ") -> None:\n",
+    "    \"\"\"\n",
+    "    Explain the detected anomalies using SHAP values of the Random Forest proxy model.\n",
+    "\n",
+    "    - Plots a SHAP summary bar chart (top 20 features) for the anomaly rows only.\n",
+    "    - Adds the column \"anomalyLabelInfluentialFeatures\" to `anomaly_detected_features` (in place),\n",
+    "      containing the 3 features with the highest absolute SHAP value for every anomaly and None otherwise.\n",
+    "    - Displays the 10 anomalies with the highest anomaly score.\n",
+    "\n",
+    "    Parameters:\n",
+    "    - random_forest_model: Fitted proxy classifier trained on the anomaly labels (1 = anomaly).\n",
+    "    - anomaly_detected_features: One row per sample including the label and score columns.\n",
+    "    - prepared_features: Feature matrix with rows in the same order as `anomaly_detected_features`.\n",
+    "    - feature_names: Names of the columns of `prepared_features`.\n",
+    "    - title_prefix: Text put in front of the plot title.\n",
+    "    - anomaly_label_column: Name of the column holding the anomaly label (1 = anomaly).\n",
+    "    - anomaly_score_column: Name of the column the displayed anomalies are sorted by.\n",
+    "    \"\"\"\n",
+    "    influential_features_column = \"anomalyLabelInfluentialFeatures\"\n",
+    "    top_feature_count = 3\n",
+    "\n",
+    "    # Positional NumPy mask so that filtering never depends on the index of the DataFrame\n",
+    "    anomaly_rows = (anomaly_detected_features[anomaly_label_column] == 1).to_numpy()\n",
+    "\n",
+    "    # Always provide the result column, even if there is nothing to explain\n",
+    "    anomaly_detected_features[influential_features_column] = None\n",
+    "\n",
+    "    model_classes = list(random_forest_model.classes_)\n",
+    "    if not anomaly_rows.any() or 1 not in model_classes:\n",
+    "        print(f\"{title_prefix}: No anomalies to explain with SHAP\")\n",
+    "        return\n",
+    "    anomaly_class_index = model_classes.index(1)\n",
+    "\n",
+    "    # Use TreeExplainer for Random Forest\n",
+    "    explainer = shap.TreeExplainer(random_forest_model)\n",
+    "\n",
+    "    shap_values = explainer.shap_values(prepared_features)\n",
+    "    print(f\"Input shape: {anomaly_detected_features.shape}\")\n",
+    "    print(f\"SHAP shape: {np.shape(shap_values)}\")\n",
+    "\n",
+    "    # Older SHAP versions return one (samples, features) array per class,\n",
+    "    # newer versions return a single (samples, features, classes) array.\n",
+    "    if isinstance(shap_values, list):\n",
+    "        anomaly_class_shap_values = np.asarray(shap_values[anomaly_class_index])\n",
+    "    else:\n",
+    "        shap_values = np.asarray(shap_values)\n",
+    "        if shap_values.ndim != 3:\n",
+    "            raise ValueError(f\"Unexpected SHAP values shape {shap_values.shape}, expected (samples, features, classes)\")\n",
+    "        anomaly_class_shap_values = shap_values[:, :, anomaly_class_index]\n",
+    "\n",
+    "    anomaly_shap_values = anomaly_class_shap_values[anomaly_rows]\n",
+    "\n",
+    "    shap.summary_plot(\n",
+    "        anomaly_shap_values,  # SHAP values of the anomaly class for anomaly rows only\n",
+    "        prepared_features[anomaly_rows],\n",
+    "        feature_names=feature_names,\n",
+    "        plot_type=\"bar\",\n",
+    "        max_display=20,\n",
+    "        plot_size=(12, 6),  # (width, height) in inches\n",
+    "        show=False,  # Keep the figure open so that the title can be set explicitly\n",
+    "    )\n",
+    "    plot.title(f\"{title_prefix} Anomalies explained using SHAP\")\n",
+    "    plot.show()\n",
+    "\n",
+    "    # Get top 3 features per anomaly (by absolute SHAP value)\n",
+    "    ranked_feature_indices = np.argsort(-np.abs(anomaly_shap_values), axis=1, kind=\"stable\")[:, :top_feature_count]\n",
+    "    top_features_per_anomaly = iter(\n",
+    "        [feature_names[feature_index] for feature_index in ranked_row]\n",
+    "        for ranked_row in ranked_feature_indices\n",
+    "    )\n",
+    "\n",
+    "    # Add top 3 influential features to every anomaly row (None for all other rows)\n",
+    "    anomaly_detected_features[influential_features_column] = pd.Series(\n",
+    "        [next(top_features_per_anomaly) if is_anomaly else None for is_anomaly in anomaly_rows],\n",
+    "        index=anomaly_detected_features.index,\n",
+    "        dtype=object,\n",
+    "    )\n",
+    "\n",
+    "    display(anomaly_detected_features[anomaly_rows].sort_values(by=anomaly_score_column, ascending=False).head(10))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7d671e71",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Also adds the column \"anomalyLabelInfluentialFeatures\" to the features DataFrame (in place)\n",
+    "explain_anomalies_with_shap(\n",
+    "    random_forest_model=java_package_proxy_random_forest,\n",
+    "    anomaly_detected_features=java_package_anomaly_detection_features,\n",
+    "    prepared_features=java_package_anomaly_detection_features_prepared,\n",
+    "    feature_names=java_package_anomaly_detection_feature_names,\n",
+    "    title_prefix=\"Java Package\",\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "5682bb64",
@@ -767,48 +855,71 @@
     "display(get_top_10_anomalies(java_type_anomaly_detection_features))"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "68a00628",
+   "metadata": {},
+   "source": [
+    "### 2.4. Plot anomalies\n",
+    "\n",
+    "Plots clustered nodes and highlights anomalies."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4ecc9fb4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_anomalies(java_type_anomaly_detection_features, title_prefix=\"Java Type Anomalies\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "4e565f84",
    "metadata": {},
    "source": [
-    "### 2.4 Plot the 20 most influential features\n",
+    "### 2.5 Print the 10 most influential features\n",
     "\n",
     "Use Random Forest as a proxy to estimate the importance of each feature contributing to the anomaly score."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "1b97f299",
+   "id": "86945e66",
    "metadata": {},
    "outputs": [],
    "source": [
-    "java_type_anomaly_detection_importances = get_feature_importances(java_type_anomaly_detection_features, java_type_anomaly_detection_features_prepared)\n",
+    "java_type_proxy_random_forest = get_proxy_random_forest(java_type_anomaly_detection_features, java_type_anomaly_detection_features_prepared)\n",
+    "java_type_anomaly_detection_importances = java_type_proxy_random_forest.feature_importances_\n",
     "java_type_anomaly_detection_importances_series = pd.Series(java_type_anomaly_detection_importances, index=java_type_anomaly_detection_feature_names).sort_values(ascending=False)\n",
-    "#display(java_type_anomaly_detection_importances_series.head(10))\n",
-    "\n",
-    "plot_feature_importances(java_type_anomaly_detection_importances_series, title_prefix='Java Types')"
+    "print(java_type_anomaly_detection_importances_series.head(10))"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "68a00628",
+   "id": "b12a0379",
    "metadata": {},
    "source": [
-    "### 2.5. Plot anomalies\n",
-    "\n",
-    "Plots clustered nodes and highlights anomalies."
+    "### 2.6 Use SHAP to explain the Isolation Forest anomalies via the Random Forest proxy model"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "4ecc9fb4",
+   "id": "2d4b35c6",
    "metadata": {},
    "outputs": [],
    "source": [
-    "plot_anomalies(java_type_anomaly_detection_features, title_prefix=\"Java Type Anomalies\")"
+    "# Also adds the column \"anomalyLabelInfluentialFeatures\" to the features DataFrame (in place)\n",
+    "explain_anomalies_with_shap(\n",
+    "    random_forest_model=java_type_proxy_random_forest,\n",
+    "    anomaly_detected_features=java_type_anomaly_detection_features,\n",
+    "    prepared_features=java_type_anomaly_detection_features_prepared,\n",
+    "    feature_names=java_type_anomaly_detection_feature_names,\n",
+    "    title_prefix=\"Java Type\",\n",
+    ")"
    ]
   }
  ],