@@ -260,7 +260,7 @@ def query_data(input_parameters: Parameters = Parameters.example()) -> pd.DataFr
     ,codeUnit.centralityArticleRank AS articleRank
     ,codeUnit.centralityPageRank - codeUnit.centralityArticleRank AS pageToArticleRankDifference
     ,codeUnit.centralityBetweenness AS betweenness
-    ,codeUnit.communityLocalClusteringCoefficient AS locallusteringCoefficient
+    ,codeUnit.communityLocalClusteringCoefficient AS localClusteringCoefficient
     ,1.0 - codeUnit.clusteringHDBSCANProbability AS clusterApproximateOutlierScore
     ,codeUnit.clusteringHDBSCANNoise AS clusterNoise
     ,codeUnit.clusteringHDBSCANRadiusAverage AS clusterRadiusAverage
@@ -754,6 +754,73 @@ def plot_shap_explained_beeswarm(
     plot.close()


+def plot_shap_explained_local_feature_importance(
+    index_to_explain,
+    anomalies_explanation_results: AnomaliesExplanationResults,
+    prepared_features: np.ndarray,
+    feature_names: list[str],
+    title: str,
+    plot_file_path: str,
+    rounding_precision: int = 4,
+):
766+ """
767+ Uses the SHAP values for anomalies to visualize the local feature importance for a specific anomaly.
768+ This function generates a force plot showing how each feature contributes to the anomaly score for a specific anomaly instance.
769+ The force plot is a powerful visualization that helps to understand the impact of each feature for each as anomaly classified data point.
770+ Visual breakdown of how each feature contributes to the score.
771+ Highly interpretable for debugging single nodes.
772+ """
+    shap_anomaly_values = anomalies_explanation_results.shap_anomaly_values
+    expected_anomaly_value = anomalies_explanation_results.shap_expected_anomaly_value
+
+    shap_values_rounded = np.round(shap_anomaly_values[index_to_explain], rounding_precision)
+    prepared_features_rounded = prepared_features[index_to_explain].round(rounding_precision)
+    base_value_rounded = np.round(expected_anomaly_value, rounding_precision)
+
+    shap.force_plot(
+        base_value_rounded,
+        shap_values_rounded,
+        prepared_features_rounded,
+        feature_names=feature_names,
+        matplotlib=True,
+        show=False,
+        contribution_threshold=0.06
+    )
+    current_figure = plot.gcf()
+
+    # Resize fonts manually (best effort, affects all text)
+    for text in current_figure.findobj(match=plot.Text):
+        text.set_fontsize(10)  # Set smaller font
+
+    plot.title(title, fontsize=16, loc='left', y=0.05)
+    plot.savefig(plot_file_path)
+    plot.close()
+
+
+def plot_all_shap_explained_local_feature_importance(
+    data: pd.DataFrame,
+    explanation_results: AnomaliesExplanationResults,
+    prepared_features: np.ndarray,
+    feature_names: list[str],
+    parameters: Parameters,
+    title_prefix: str = "",
+    code_unit_name_column: str = "codeUnitName"
+) -> None:
+
+    index = 0
+    for row_index, row in data.iterrows():
+        row_index = typing.cast(int, row_index)
+        index = index + 1
+        plot_shap_explained_local_feature_importance(
+            index_to_explain=row_index,
+            anomalies_explanation_results=explanation_results,
+            prepared_features=prepared_features,
+            feature_names=feature_names,
+            title=f"{title_prefix} \"{row[code_unit_name_column]}\" anomaly #{index} explained",
+            plot_file_path=get_file_path(f"{title_prefix}_Anomaly_{index}_shap_explanation", parameters),
+        )
+
+
 def plot_shap_explained_top_10_feature_dependence(
     shap_anomaly_values: np.ndarray,
     prepared_features: np.ndarray,
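The helpers above consume SHAP values (shap_anomaly_values) and a SHAP base value (shap_expected_anomaly_value) that this diff reads from AnomaliesExplanationResults without showing how they are computed. As a minimal sketch, assuming an Isolation Forest as the anomaly detector and using illustrative data and variable names that are not part of this commit, such values could be obtained with the shap library like this:

    import numpy as np
    import shap
    from sklearn.ensemble import IsolationForest

    # Hypothetical stand-ins for the prepared feature matrix and feature names used above.
    prepared_features = np.random.default_rng(42).normal(size=(100, 5))
    feature_names = [f"feature_{index}" for index in range(5)]

    # Fit an Isolation Forest and explain its decision function with a tree explainer.
    model = IsolationForest(random_state=42).fit(prepared_features)
    explainer = shap.TreeExplainer(model)
    shap_anomaly_values = explainer.shap_values(prepared_features)  # shape: (n_samples, n_features)
    shap_expected_anomaly_value = explainer.expected_value          # base value of the explanation

    # Force plot for a single sample, mirroring the call in plot_shap_explained_local_feature_importance.
    shap.force_plot(
        shap_expected_anomaly_value,
        shap_anomaly_values[0],
        prepared_features[0],
        feature_names=feature_names,
        matplotlib=True,
        show=False,
    )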
@@ -838,6 +905,48 @@ def add_top_shap_features_to_anomalies(
     return anomaly_detected_features


+def add_node_embedding_shap_sum(
+    shap_anomaly_values: np.ndarray,
+    feature_names: list[str],
+    anomaly_detected_features: pd.DataFrame,
+    anomaly_label_column: str = "anomalyLabel",
+    output_column_name: str = "anomalyNodeEmbeddingSHAPSum"
+) -> pd.DataFrame:
915+ """
916+ Adds a column with the sum of SHAP values for all features that start with 'nodeEmbedding'.
917+ The sum is signed, so that negative values contributing to an anomaly are reduced by positive numbers indicating "normal" tendencies.
918+
919+ Parameters:
920+ - shap_anomaly_values: SHAP values array with shape (n_samples, n_features).
921+ - feature_names: List of names corresponding to the features.
922+ - anomaly_detected_features: Original DataFrame containing anomaly labels.
923+ - anomaly_label_column: Name of the column indicating anomalies (1 = anomaly).
924+ - output_column_name: Name of the new column to store the SHAP sum.
925+
926+ Returns:
927+ - DataFrame with an additional column containing the summed SHAP values for nodeEmbedding features.
928+ """
+    # Convert SHAP values into a DataFrame for easier manipulation
+    shap_values_dataframe = pd.DataFrame(shap_anomaly_values, columns=feature_names)
+
+    # Identify all features whose names start with "nodeEmbedding"
+    node_embedding_features = [name for name in feature_names if name.startswith("nodeEmbedding")]
+
+    # Default initialize the new column
+    anomaly_detected_features[output_column_name] = 0.0
+
+    # Get indices of rows marked as anomalies
+    anomaly_indices = anomaly_detected_features[anomaly_detected_features[anomaly_label_column] == 1].index
+
+    # Compute the raw signed sum of SHAP values for each anomaly row
+    for row_index in anomaly_indices:
+        row_shap_values = shap_values_dataframe.loc[row_index, node_embedding_features]
+        shap_sum = row_shap_values.sum()  # signed sum
+        anomaly_detected_features.at[row_index, output_column_name] = shap_sum
+
+    return anomaly_detected_features
+
+
 # ------------------------------------------------------------------------------------------------------------
 # MAIN
 # ------------------------------------------------------------------------------------------------------------
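For larger graphs, the per-row loop in add_node_embedding_shap_sum could also be replaced by a single vectorized step. The following sketch is an illustrative alternative, not part of the commit, and assumes (as the loop above does) that SHAP rows and DataFrame rows are aligned:

    import numpy as np
    import pandas as pd

    def add_node_embedding_shap_sum_vectorized(
        shap_anomaly_values: np.ndarray,
        feature_names: list[str],
        anomaly_detected_features: pd.DataFrame,
        anomaly_label_column: str = "anomalyLabel",
        output_column_name: str = "anomalyNodeEmbeddingSHAPSum",
    ) -> pd.DataFrame:
        # Signed sum over all "nodeEmbedding*" SHAP columns, computed for every row at once.
        shap_values_dataframe = pd.DataFrame(
            shap_anomaly_values, columns=feature_names, index=anomaly_detected_features.index
        )
        node_embedding_features = [name for name in feature_names if name.startswith("nodeEmbedding")]
        node_embedding_shap_sum = shap_values_dataframe[node_embedding_features].sum(axis=1)

        # Keep the sum only for rows labelled as anomalies; all other rows stay at 0.0.
        is_anomaly = anomaly_detected_features[anomaly_label_column] == 1
        anomaly_detected_features[output_column_name] = node_embedding_shap_sum.where(is_anomaly, 0.0)
        return anomaly_detected_features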
@@ -913,6 +1022,15 @@ def add_top_shap_features_to_anomalies(
     plot_file_path=get_file_path(f"{plot_prefix}_Anomaly_feature_importance_explained", parameters)
 )

+plot_all_shap_explained_local_feature_importance(
+    data=get_top_10_anomalies(features),
+    explanation_results=explanation_results,
+    prepared_features=features_prepared,
+    feature_names=feature_names,
+    parameters=parameters,
+    title_prefix=plot_prefix
+)
+
 plot_shap_explained_top_10_feature_dependence(
     shap_anomaly_values=explanation_results.shap_anomaly_values,
     prepared_features=features_prepared,
@@ -927,6 +1045,12 @@ def add_top_shap_features_to_anomalies(
     anomaly_detected_features=features
 )

+add_node_embedding_shap_sum(
+    shap_anomaly_values=explanation_results.shap_anomaly_values,
+    feature_names=feature_names,
+    anomaly_detected_features=features
+)
+
 if parameters.is_verbose():
     print("tunedAnomalyDetectionExplained: Features with added anomaly score explanation columns:")
     print(features[features["anomalyLabel"] == 1].sort_values(by='anomalyScore', ascending=False).head(10))
@@ -941,6 +1065,7 @@ def add_top_shap_features_to_anomalies(
     'anomalyTopFeatureSHAPValue1': features['anomalyTopFeatureSHAPValue_1'],
     'anomalyTopFeatureSHAPValue2': features['anomalyTopFeatureSHAPValue_2'],
     'anomalyTopFeatureSHAPValue3': features['anomalyTopFeatureSHAPValue_3'],
+    'anomalyNodeEmbeddingSHAPSum': features['anomalyNodeEmbeddingSHAPSum'],
 })
 write_batch_data_into_database(data_to_write, parameters.get_projection_node_label(), verbose=parameters.is_verbose())
