|
10 | 10 | # - Provide the password for Neo4j in the environment variable "NEO4J_INITIAL_PASSWORD". |
11 | 11 | # - Requires "tunedLeidenCommunityDetection.py", "tunedNodeEmbeddingClustering.py" and "umap2dNodeEmbedding.py" to be executed before this script to provide the necessary data. |
12 | 12 |
|
13 | 14 | import typing |
14 | 15 | import numpy.typing as numpy_typing |
15 | 16 |
|
@@ -947,6 +948,58 @@ def add_node_embedding_shap_sum( |
947 | 948 | return anomaly_detected_features |
948 | 949 |
|
949 | 950 |
|
| 951 | +def output_top_shap_explained_global_features_as_markdown_table( |
| 952 | + shap_anomaly_values: np.ndarray, |
| 953 | + feature_names: list[str], |
| 954 | + output_file_path: str, |
| 955 | + top_n_features: int = 10 |
| 956 | +): |
| 957 | + # Compute the mean absolute SHAP value across all samples for each feature (importance ranking)
| 958 | + mean_absolute_shap_values = np.abs(shap_anomaly_values).mean(axis=0) |
| 959 | + |
| 960 | + # Create a DataFrame with feature names and mean absolute SHAP values
| 961 | + feature_importance = pd.DataFrame({ |
| 962 | + "Feature": feature_names, |
| 963 | + "Mean absolute SHAP value": mean_absolute_shap_values |
| 964 | + }) |
| 965 | + |
| 966 | + # Aggregate all nodeEmbedding* features |
| 967 | + mask = feature_importance["Feature"].str.startswith("nodeEmbedding") |
| 968 | + node_embedding_sum = feature_importance.loc[mask, "Mean absolute SHAP value"].sum() |
| 969 | + |
| 970 | + # Append aggregated feature |
| 971 | + feature_importance = pd.concat([ |
| 972 | + feature_importance, |
| 973 | + pd.DataFrame([{ |
| 974 | + "Feature": "*Node embeddings aggregated*", |
| 975 | + "Mean absolute SHAP value": node_embedding_sum |
| 976 | + }]) |
| 977 | + ]) |
| 978 | + |
| 979 | + # Sort by importance; keep one extra row because the aggregated node-embedding row was appended
| 980 | + top_features = feature_importance.sort_values("Mean absolute SHAP value", ascending=False).head(top_n_features + 1)
| 981 | + |
| 982 | + # Build markdown table manually using column names |
| 983 | + headers = list(top_features.columns) |
| 984 | + rows = top_features.values.tolist() |
| 985 | + |
| 986 | + markdown_header_row = "| " + " | ".join(headers) + " |\n" |
| 987 | + markdown_table = markdown_header_row |
| 988 | + |
| 989 | + markdown_header_separator_row = "| " + " | ".join(["---"] * len(headers)) + " |\n" |
| 990 | + markdown_table += markdown_header_separator_row |
| 991 | + |
| 992 | + for row in rows: |
| 993 | + markdown_data_row = "| " + " | ".join([str(row[0]), f"{row[1]:.6f}"]) + " |\n" |
| 994 | + markdown_table += markdown_data_row |
| 995 | + |
| 996 | + # Save to file |
| 997 | + with open(output_file_path, "w", encoding="utf-8") as f:
| 998 | + f.write(markdown_table) |
| 999 | + |
| 1000 | + print(f"tunedAnomalyDetectionExplained: Markdown table with top {top_n_features} SHAP explained features saved to {output_file_path}") |
| 1001 | + |
| 1002 | + |
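For illustration, the file this function writes would look like the table below; the feature names and values here are made up, and the `*Node embeddings aggregated*` row competes with the individual features for a spot in the ranking:

```markdown
| Feature | Mean absolute SHAP value |
| --- | --- |
| *Node embeddings aggregated* | 0.412305 |
| pageRank | 0.201114 |
| degree | 0.154201 |
| nodeEmbedding17 | 0.023409 |
```

Building the rows by hand also keeps the script free of `DataFrame.to_markdown`, which would pull in the optional `tabulate` dependency.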
950 | 1003 | # ------------------------------------------------------------------------------------------------------------ |
951 | 1004 | # MAIN |
952 | 1005 | # ------------------------------------------------------------------------------------------------------------ |
@@ -1051,6 +1104,12 @@ def add_node_embedding_shap_sum( |
1051 | 1104 | anomaly_detected_features=features |
1052 | 1105 | ) |
1053 | 1106 |
|
| 1107 | +output_top_shap_explained_global_features_as_markdown_table( |
| 1108 | + shap_anomaly_values=explanation_results.shap_anomaly_values, |
| 1109 | + feature_names=feature_names, |
| 1110 | + output_file_path=get_file_path(f"{plot_prefix}_Top_anomaly_features", parameters, 'md') |
| 1111 | +) |
| 1112 | + |
1054 | 1113 | if parameters.is_verbose(): |
1055 | 1114 | print("tunedAnomalyDetectionExplained: Features with added anomaly score explanation columns:") |
1056 | 1115 | print(features[features["anomalyLabel"] == 1].sort_values(by='anomalyScore', ascending=False).head(10)) |
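For anyone trying the new function outside this script, a minimal sketch follows. The Isolation Forest detector, the random stand-in data, and every name in it are illustrative assumptions, not this script's actual pipeline:

```python
# Minimal sketch (assumptions: an Isolation Forest detector and random
# stand-in data; the real script wires in its own features and explainer).
import numpy as np
import pandas as pd
import shap
from sklearn.ensemble import IsolationForest

rng = np.random.default_rng(42)
feature_names = ["degree", "pageRank", "nodeEmbedding0", "nodeEmbedding1"]
features = pd.DataFrame(rng.normal(size=(200, len(feature_names))), columns=feature_names)

# Fit the detector and let SHAP attribute its scores to the input features.
model = IsolationForest(random_state=42).fit(features)
shap_anomaly_values = shap.TreeExplainer(model).shap_values(features)  # shape (200, 4)

output_top_shap_explained_global_features_as_markdown_table(
    shap_anomaly_values=shap_anomaly_values,
    feature_names=feature_names,
    output_file_path="Top_anomaly_features.md",
    top_n_features=3,
)
```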
|