@@ -98,7 +98,7 @@ def __is_code_language_available(self) -> bool:
9898 def __get_projection_language (self ) -> str :
9999 return self .query_parameters_ ["projection_language" ] if self .__is_code_language_available () else ""
100100
101- def get_plot_prefix (self ) -> str :
101+ def get_title_prefix (self ) -> str :
102102 if self .__is_code_language_available ():
103103 return self .__get_projection_language () + " " + self .__get_projection_node_label ()
104104 return self .__get_projection_node_label ()
@@ -378,37 +378,40 @@ def annotate_outliers(outliers: pd.DataFrame) -> None:
378378 plot .savefig (plot_file_path )
379379
380380
381- def plot_clustering_coefficient_distribution ( clustering_coefficients : pd .Series , title : str , plot_file_path : str ) -> None :
381+ def plot_feature_distribution ( feature_values : pd .Series , feature_name : str , title : str , plot_file_path : str ) -> None :
382382 """
383- Plots the distribution of clustering coefficients .
384-
383+ Plots the distribution of a feature's values.
384+
385385 Parameters
386386 ----------
387- clustering_coefficients : pd.Series
388- Series containing clustering coefficient values.
387+ feature_values : pd.Series
388+ Series containing feature values.
389+ feature_name : str
390+ Name of the feature, used as the x-axis label.
389391 """
390- if clustering_coefficients .empty :
392+ if feature_values .empty :
391393 print ("No data available to plot." )
392394 return
393395
394396 plot .figure (figsize = (10 , 6 ))
395397 plot .figure (figsize = (10 , 6 ))
396- plot .hist (clustering_coefficients , bins = 40 , color = 'blue' , alpha = 0.7 , edgecolor = 'black' )
398+ plot .hist (feature_values , bins = 40 , color = 'blue' , alpha = 0.7 , edgecolor = 'black' )
397399 plot .title (title , pad = 20 )
398- plot .xlabel ('Clustering Coefficient' )
400+ plot .xlabel (feature_name )
399401 plot .ylabel ('Frequency' )
400- plot .xlim (left = clustering_coefficients .min (), right = clustering_coefficients .max ())
402+ plot .xlim (left = feature_values .min (), right = feature_values .max ())
401403 # plot.yscale('log') # Use logarithmic scale for better visibility of differences
402404 plot .grid (True )
403- plot .tight_layout ()
404405
405- mean = clustering_coefficients .mean ()
406- standard_deviation = clustering_coefficients .std ()
406+ mean = feature_values .mean ()
407+ standard_deviation = feature_values .std ()
407408
408409 # Vertical line for the mean
409410 plot_standard_deviation_lines ('red' , mean , standard_deviation , standard_deviation_factor = 0 )
410411 # Vertical line for 1 x standard deviations + mean (=z-score of 1)
411- plot_standard_deviation_lines ('green' , mean , standard_deviation , standard_deviation_factor = 1 )
412+ plot_standard_deviation_lines ('orange' , mean , standard_deviation , standard_deviation_factor = 1 )
413+ # Vertical line for 2 x standard deviations + mean (=z-score of 2)
414+ plot_standard_deviation_lines ('green' , mean , standard_deviation , standard_deviation_factor = 2 )
412415
413416 plot .tight_layout ()
414417 plot .savefig (plot_file_path )
@@ -812,7 +815,7 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
812815# ------------------------------------------------------------------------------------------------------------
813816
814817parameters = parse_input_parameters ()
815- plot_prefix = parameters .get_plot_prefix ()
818+ title_prefix = parameters .get_title_prefix ()
816819report_directory = parameters .get_report_directory ()
817820
818821driver = get_graph_database_driver ()
@@ -825,31 +828,39 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
825828 data ['pageRank' ],
826829 data ['articleRank' ],
827830 data ['shortCodeUnitName' ],
828- title = f"{ plot_prefix } distribution of PageRank - ArticleRank differences" ,
829- plot_file_path = get_file_path (f" { plot_prefix } _PageRank_Minus_ArticleRank_Distribution " , parameters )
831+ title = f"{ title_prefix } distribution of PageRank - ArticleRank differences" ,
832+ plot_file_path = get_file_path ("PageRank_Minus_ArticleRank_Distribution " , parameters )
830833)
831834
832- plot_clustering_coefficient_distribution (
833- data ['clusteringCoefficient' ],
834- title = f"{ plot_prefix } distribution of clustering coefficients" ,
835- plot_file_path = get_file_path (f"{ plot_prefix } _ClusteringCoefficient_distribution" , parameters )
835+ plot_feature_distribution (
836+ feature_values = data ['clusteringCoefficient' ],
837+ feature_name = 'Clustering Coefficient' ,
838+ title = f"{ title_prefix } clustering coefficient distribution" ,
839+ plot_file_path = get_file_path ("ClusteringCoefficient_distribution" , parameters )
840+ )
841+
842+ plot_feature_distribution (
843+ feature_values = data ['betweenness' ],
844+ feature_name = 'Betweenness' ,
845+ title = f"{ title_prefix } betweenness centrality distribution" ,
846+ plot_file_path = get_file_path ("BetweennessCentrality_distribution" , parameters )
836847)
837848
838849plot_clustering_coefficient_vs_page_rank (
839850 data ['clusteringCoefficient' ],
840851 data ['pageRank' ],
841852 data ['shortCodeUnitName' ],
842853 data ['clusterNoise' ],
843- title = f"{ plot_prefix } clustering coefficient versus PageRank" ,
844- plot_file_path = get_file_path (f" { plot_prefix } _ClusteringCoefficient_versus_PageRank " , parameters )
854+ title = f"{ title_prefix } clustering coefficient versus PageRank" ,
855+ plot_file_path = get_file_path ("ClusteringCoefficient_versus_PageRank " , parameters )
845856)
846857
847858if (overall_cluster_count < 20 ):
848859 print (f"anomalyDetectionFeaturePlots: Less than 20 clusters: { overall_cluster_count } . Only one plot containing all clusters will be created." )
849860 plot_clusters (
850861 clustering_visualization_dataframe = data ,
851- title = f"{ plot_prefix } all clusters overall (less than 20) " ,
852- plot_file_path = get_file_path (f" { plot_prefix } _Clusters_Overall " , parameters )
862+ title = f"{ title_prefix } all clusters overall" ,
863+ plot_file_path = get_file_path ("Clusters_Overall " , parameters )
853864 )
854865else :
855866 print (f"anomalyDetectionFeaturePlots: More than 20 clusters: { overall_cluster_count } . Different plots focussing on different features like cluster size will be created." )
@@ -858,57 +869,57 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
858869 )
859870 plot_clusters (
860871 clustering_visualization_dataframe = clusters_by_largest_size ,
861- title = f"{ plot_prefix } clusters with the largest size" ,
862- plot_file_path = get_file_path (f" { plot_prefix } _Clusters_largest_size " , parameters )
872+ title = f"{ title_prefix } clusters with the largest size" ,
873+ plot_file_path = get_file_path ("Clusters_largest_size " , parameters )
863874 )
864875
865876 clusters_by_largest_max_radius = get_clusters_by_criteria (
866877 data , by = 'clusterRadiusMax' , ascending = False , cluster_count = 20
867878 )
868879 plot_clusters (
869880 clustering_visualization_dataframe = clusters_by_largest_max_radius ,
870- title = f"{ plot_prefix } clusters with the largest max radius" ,
871- plot_file_path = get_file_path (f" { plot_prefix } _Clusters_largest_max_radius " , parameters )
881+ title = f"{ title_prefix } clusters with the largest max radius" ,
882+ plot_file_path = get_file_path ("Clusters_largest_max_radius " , parameters )
872883 )
873884
874885 clusters_by_largest_average_radius = get_clusters_by_criteria (
875886 data , by = 'clusterRadiusAverage' , ascending = False , cluster_count = 20
876887 )
877888 plot_clusters (
878889 clustering_visualization_dataframe = clusters_by_largest_average_radius ,
879- title = f"{ plot_prefix } clusters with the largest average radius" ,
880- plot_file_path = get_file_path (f" { plot_prefix } _Clusters_largest_average_radius " , parameters )
890+ title = f"{ title_prefix } clusters with the largest average radius" ,
891+ plot_file_path = get_file_path ("Clusters_largest_average_radius " , parameters )
881892 )
882893
883894plot_clusters_probabilities (
884895 clustering_visualization_dataframe = data ,
885- title = f"{ plot_prefix } clustering probabilities (red=high uncertainty)" ,
886- plot_file_path = get_file_path (f" { plot_prefix } _Cluster_probabilities " , parameters )
896+ title = f"{ title_prefix } clustering probabilities (red=high uncertainty)" ,
897+ plot_file_path = get_file_path ("Cluster_probabilities " , parameters )
887898)
888899
889900plot_cluster_noise (
890901 clustering_visualization_dataframe = data ,
891- title = f"{ plot_prefix } clustering noise points that are surprisingly central (red) or popular (size)" ,
902+ title = f"{ title_prefix } clustering noise points that are surprisingly central (red) or popular (size)" ,
892903 size_column_name = 'degree' ,
893904 color_column_name = 'pageRank' ,
894- plot_file_path = get_file_path (f" { plot_prefix } _ClusterNoise_highly_central_and_popular " , parameters )
905+ plot_file_path = get_file_path ("ClusterNoise_highly_central_and_popular " , parameters )
895906)
896907
897908plot_cluster_noise (
898909 clustering_visualization_dataframe = data ,
899- title = f"{ plot_prefix } clustering noise points that bridge flow (red) and are poorly integrated (size)" ,
910+ title = f"{ title_prefix } clustering noise points that bridge flow (red) and are poorly integrated (size)" ,
900911 size_column_name = 'inverseClusteringCoefficient' ,
901912 color_column_name = 'betweenness' ,
902- plot_file_path = get_file_path (f" { plot_prefix } _ClusterNoise_poorly_integrated_bridges " , parameters ),
913+ plot_file_path = get_file_path ("ClusterNoise_poorly_integrated_bridges " , parameters ),
903914 downscale_normal_sizes = 0.4
904915)
905916
906917plot_cluster_noise (
907918 clustering_visualization_dataframe = data ,
908- title = f"{ plot_prefix } clustering noise points with role inversion (size) possibly violating layering or dependency direction (red)" ,
919+ title = f"{ title_prefix } clustering noise points with role inversion (size) possibly violating layering or dependency direction (red)" ,
909920 size_column_name = 'pageToArticleRankDifference' ,
910921 color_column_name = 'betweenness' ,
911- plot_file_path = get_file_path (f" { plot_prefix } _ClusterNoise_role_inverted_bridges " , parameters )
922+ plot_file_path = get_file_path ("ClusterNoise_role_inverted_bridges " , parameters )
912923)
913924
914925driver .close ()
0 commit comments