44# This is useful for understanding code structure, detecting modular boundaries, and identifying anomalies or outliers in large software systems without requiring manual labeling.
55# It takes the code structure as a graph in Neo4j and generates node embeddings using Fast Random Projection (FastRP).
66# These embeddings capture structural similarity and are clustered using HDBSCAN to assign labels or detect noise.
7- # For visualization, the embeddings are reduced to 2D using t-SNE.
87# All results — including embeddings and cluster labels — are written back to Neo4j for further use.
98
109# Prerequisite:
2524
2625from neo4j import GraphDatabase , Driver
2726
28- from openTSNE .sklearn import TSNE
29-
30- from sklearn .base import BaseEstimator
27+ # from sklearn.base import BaseEstimator # Extend from sklearn BaseEstimator to use e.g. GridSearchCV for hyperparameter tuning.
3128from sklearn .metrics import adjusted_rand_score , adjusted_mutual_info_score , normalized_mutual_info_score
3229from sklearn .cluster import HDBSCAN # type: ignore
3330
3835
3936
4037class Parameters :
41- required_parameters_ = ["projection_name" , "projection_node_label" , "projection_weight_property" , "community_property" ]
38+ required_parameters_ = ["projection_name" , "projection_node_label" , "projection_weight_property" , "community_property" , "embedding_property" ]
4239
4340 def __init__ (self , input_parameters : typing .Dict [str , str ], verbose : bool = False ):
4441 self .query_parameters_ = input_parameters .copy () # defensive copy: later changes to the caller's dict do not affect the stored parameters
@@ -63,9 +60,6 @@ def log_dependency_versions_() -> None:
6360 from sklearn import __version__ as sklearn_version
6461 print ('scikit-learn version: {}' .format (sklearn_version ))
6562
66- from openTSNE import __version__ as openTSNE_version
67- print ('openTSNE version: {}' .format (openTSNE_version ))
68-
6963 from neo4j import __version__ as neo4j_version
7064 print ('neo4j version: {}' .format (neo4j_version ))
7165
@@ -116,6 +110,9 @@ def get_projection_name(self) -> str:
116110 def get_projection_node_label (self ) -> str :
117111 return self .query_parameters_ ["projection_node_label" ]
118112
113+ def get_embedding_property (self ) -> str :
114+ return self .query_parameters_ ["embedding_property" ]
115+
119116 def is_verbose (self ) -> bool :
120117 return self .verbose_
121118
@@ -513,7 +510,8 @@ def __init__(self,
513510 forth_iteration_weight : float = 1.0 ,
514511 ):
515512 self .parameters_ = parameters
516- self .verbose = parameters .is_verbose ()
513+ self .verbose_ = parameters .is_verbose ()
514+ self .write_property_ = parameters .get_embedding_property ()
517515
518516 self .embedding_dimension = embedding_dimension
519517 self .random_seed = random_seed
@@ -526,15 +524,15 @@ def __to_algorithm_parameters(self) -> typing.Dict['str', 'str']:
526524 "normalization_strength" : str (self .normalization_strength ),
527525 "forth_iteration_weight" : str (self .forth_iteration_weight ),
528526 "embedding_random_seed" : str (self .random_seed ),
529- "write_property" : "embeddingsFastRandomProjectionForClustering" ,
527+ "write_property" : str ( self . write_property_ ) ,
530528 ** self .parameters_ .get_query_parameters ()
531529 }
532530
533531 def __run_algorithm (self ) -> pd .DataFrame :
534532 algorithm_parameters = self .__to_algorithm_parameters ()
535533 # For Debugging:
536534 # print("Generating embeddings using Neo4j Graph Data Science with the following parameters: " + str(algorithm_parameters))
537- if self .verbose :
535+ if self .verbose_ :
538536 return query_cypher_to_data_frame (self .cypher_query_for_generating_embeddings_ , parameters = algorithm_parameters )
539537
540538 return query_cypher_to_data_frame_suppress_warnings (self .cypher_query_for_generating_embeddings_ , parameters = algorithm_parameters )
@@ -568,12 +566,12 @@ def write_embeddings(self) -> typing.Self:
568566 This is useful for further processing or analysis of the embeddings.
569567 """
570568 algorithm_parameters = self .__to_algorithm_parameters ()
571- if self .verbose :
569+ if self .verbose_ :
572570 print ("" )
573571 print ("Writing embeddings to Neo4j with the following parameters: " + str (algorithm_parameters ))
574572 print ("" )
575573
576- if self .verbose :
574+ if self .verbose_ :
577575 query_cypher_to_data_frame (self .cypher_query_for_writing_embeddings_ , parameters = algorithm_parameters )
578576 else :
579577 query_cypher_to_data_frame_suppress_warnings (self .cypher_query_for_writing_embeddings_ , parameters = algorithm_parameters )
@@ -633,63 +631,27 @@ def objective(trial):
633631 return TuneableFastRandomProjectionNodeEmbeddings (parameters , ** study .best_params ).fit ()
634632
635633
636- def prepare_node_embeddings_for_2d_visualization (embeddings : pd .DataFrame ) -> pd .DataFrame :
637- """
638- Reduces the dimensionality of the node embeddings (e.g. 64 floating point numbers in an array)
639- to two dimensions for 2D visualization.
640- see https://opentsne.readthedocs.io
641- """
642-
643- if embeddings .empty :
644- print ("No projected data for node embeddings dimensionality reduction available" )
645- return embeddings
646-
647- # Calling the fit_transform method just with a list doesn't work.
648- # It leads to an error with the following message: 'list' object has no attribute 'shape'
649- # This can be solved by converting the list to a numpy array using np.array(..).
650- # See https://bobbyhadz.com/blog/python-attributeerror-list-object-has-no-attribute-shape
651- embeddings_as_numpy_array = np .array (embeddings .embedding .to_list ())
652-
653- # Use t-distributed Stochastic Neighbor Embedding (t-SNE) to reduce the dimensionality
654- # of the previously calculated node embeddings to 2 dimensions for visualization
655- t_distributed_stochastic_neighbor_embedding = TSNE (n_components = 2 , verbose = False , random_state = 47 )
656- two_dimension_node_embeddings = t_distributed_stochastic_neighbor_embedding .fit_transform (embeddings_as_numpy_array )
657- # display(two_dimension_node_embeddings.shape) # Display the shape of the t-SNE result
658-
659- # Create a new DataFrame with the results of the 2 dimensional node embeddings
660- # and the code unit and artifact name of the query above as preparation for the plot
661- embeddings ['embeddingVisualizationX' ] = [value [0 ] for value in two_dimension_node_embeddings ]
662- embeddings ['embeddingVisualizationY' ] = [value [1 ] for value in two_dimension_node_embeddings ]
663-
664- return embeddings
665-
666-
667- def execute_tuned_node_embeddings_clustering (parameters : Parameters ) -> None :
668- tuned_fast_random_projection = get_tuned_fast_random_projection_node_embeddings (parameters )
669- embeddings = tuned_fast_random_projection .get_embeddings ()
670- clustering_results = coordinate_tuned_hierarchical_density_based_spatial_clustering (embeddings )
671- if parameters .is_verbose ():
672- print ("HDBSCAN clustered labels by their size descending (top 10):" , clustering_results .clustering_results_distribution .head (10 ))
673- print ("HDBSCAN clustered labels by their probability descending (top 10):" , clustering_results .clustering_results_distribution .sort_values (by = 'probability' , ascending = False ).head (10 ))
674-
675- embeddings = prepare_node_embeddings_for_2d_visualization (clustering_results .embeddings )
676-
677- tuned_fast_random_projection .write_embeddings ()
678- data_to_write = pd .DataFrame (data = {
679- 'nodeElementId' : embeddings ["nodeElementId" ],
680- 'clusteringHDBSCANLabel' : embeddings ['clusteringTunedHDBSCANLabel' ],
681- 'clusteringHDBSCANProbability' : embeddings ['clusteringTunedHDBSCANProbability' ],
682- 'clusteringHDBSCANNoise' : (embeddings ['clusteringTunedHDBSCANLabel' ] == - 1 ).astype (int ),
683- 'embeddingFastRandomProjectionVisualizationX' : embeddings ["embeddingVisualizationX" ],
684- 'embeddingFastRandomProjectionVisualizationY' : embeddings ["embeddingVisualizationY" ],
685- })
686- write_batch_data_into_database (data_to_write , parameters .get_projection_node_label ())
687-
688634# ------------------------------------------------------------------------------------------------------------
689635# MAIN
690636# ------------------------------------------------------------------------------------------------------------
691637
692638
693639parameters = parse_input_parameters ()
694640driver = get_graph_database_driver ()
695- execute_tuned_node_embeddings_clustering (parameters )
641+
642+ tuned_fast_random_projection = get_tuned_fast_random_projection_node_embeddings (parameters )
643+ embeddings = tuned_fast_random_projection .get_embeddings ()
644+
645+ clustering_results = coordinate_tuned_hierarchical_density_based_spatial_clustering (embeddings )
646+ if parameters .is_verbose ():
647+ print ("HDBSCAN clustered labels by their size descending (top 10):" , clustering_results .clustering_results_distribution .head (10 ))
648+ print ("HDBSCAN clustered labels by their probability descending (top 10):" , clustering_results .clustering_results_distribution .sort_values (by = 'probability' , ascending = False ).head (10 ))
649+
650+ tuned_fast_random_projection .write_embeddings ()
651+ data_to_write = pd .DataFrame (data = {
652+ 'nodeElementId' : embeddings ["nodeElementId" ],
653+ 'clusteringHDBSCANLabel' : embeddings ['clusteringTunedHDBSCANLabel' ],
654+ 'clusteringHDBSCANProbability' : embeddings ['clusteringTunedHDBSCANProbability' ],
655+ 'clusteringHDBSCANNoise' : (embeddings ['clusteringTunedHDBSCANLabel' ] == - 1 ).astype (int ),
656+ })
657+ write_batch_data_into_database (data_to_write , parameters .get_projection_node_label ())
0 commit comments