Use already calculated node embeddings if available

JohT · JohT · commit 468e210218e8 · 2024-05-04T13:51:24.000+02:00
diff --git a/cypher/Node_Embeddings/Node_Embeddings_0a_Query_Calculated.cypher b/cypher/Node_Embeddings/Node_Embeddings_0a_Query_Calculated.cypher
@@ -0,0 +1,18 @@
+// Queries node embeddings that were already calculated and written to the node property named by $dependencies_projection_write_property, for nodes with the label given in $dependencies_projection_node, including their communityId and centrality. Variables: dependencies_projection_node, dependencies_projection_write_property
+
+ MATCH (codeUnit)
+ WHERE $dependencies_projection_node IN LABELS(codeUnit)
+   AND codeUnit[$dependencies_projection_write_property] IS NOT NULL
+   // AND codeUnit.notExistingToForceRecalculation IS NOT NULL // uncomment this line to force recalculation
+OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit)
+   WITH *, replace(last(split(artifact.fileName, '/')), '.jar', '') AS artifactName
+OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit)
+   WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName   
+ RETURN DISTINCT 
+        coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName
+       ,coalesce(codeUnit.name, replace(last(split(codeUnit.fileName, '/')), '.jar', ''))                AS shortCodeUnitName
+       ,coalesce(artifactName, projectName)                                                              AS projectName
+       ,coalesce(codeUnit.communityLeidenId, 0)           AS communityId
+       ,coalesce(codeUnit.centralityPageRank, 0.01)       AS centrality
+       ,codeUnit[$dependencies_projection_write_property] AS embedding
+  ORDER BY communityId
diff --git a/cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher b/cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher
@@ -9,9 +9,14 @@ CALL gds.fastRP.stream(
 YIELD nodeId, embedding
  WITH gds.util.asNode(nodeId) AS codeUnit
      ,embedding
-OPTIONAL MATCH (artifact:Artifact)-[:CONTAINS]->(codeUnit)
-RETURN DISTINCT coalesce(codeUnit.fqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName
-     ,coalesce(codeUnit.communityLeidenId, 0) AS communityId
-     ,coalesce(codeUnit.centralityPageRank, 0.01) AS centrality
-     ,replace(last(split(artifact.fileName, '/')), '.jar', '') AS artifactName
-     ,embedding
+OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit)
+   WITH *, replace(last(split(artifact.fileName, '/')), '.jar', '') AS artifactName
+OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit)
+   WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName   
+ RETURN DISTINCT 
+        coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName
+       ,coalesce(codeUnit.name, replace(last(split(codeUnit.fileName, '/')), '.jar', '')) AS shortCodeUnitName
+       ,coalesce(artifactName, projectName) AS projectName
+       ,coalesce(codeUnit.communityLeidenId, 0) AS communityId
+       ,coalesce(codeUnit.centralityPageRank, 0.01) AS centrality
+       ,embedding
diff --git a/cypher/Node_Embeddings/Node_Embeddings_2d_Hash_GNN_Stream.cypher b/cypher/Node_Embeddings/Node_Embeddings_2d_Hash_GNN_Stream.cypher
@@ -14,9 +14,14 @@ CALL gds.beta.hashgnn.stream(
 YIELD nodeId, embedding
  WITH gds.util.asNode(nodeId) AS codeUnit
      ,embedding
-OPTIONAL MATCH (artifact:Artifact)-[:CONTAINS]->(codeUnit)
-RETURN DISTINCT coalesce(codeUnit.fqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName
+OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit)
+   WITH *, replace(last(split(artifact.fileName, '/')), '.jar', '') AS artifactName
+OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit)
+   WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName   
+ RETURN DISTINCT 
+        coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName
+       ,coalesce(codeUnit.name, replace(last(split(codeUnit.fileName, '/')), '.jar', '')) AS shortCodeUnitName
+       ,coalesce(artifactName, projectName) AS projectName
      ,coalesce(codeUnit.communityLeidenId, 0) AS communityId
      ,coalesce(codeUnit.centralityPageRank, 0.01) AS centrality
-     ,replace(last(split(artifact.fileName, '/')), '.jar', '') AS artifactName
      ,embedding
diff --git a/cypher/Node_Embeddings/Node_Embeddings_3d_Node2Vec_Stream.cypher b/cypher/Node_Embeddings/Node_Embeddings_3d_Node2Vec_Stream.cypher
@@ -10,9 +10,14 @@ CALL gds.node2vec.stream(
 YIELD nodeId, embedding
  WITH gds.util.asNode(nodeId) AS codeUnit
      ,embedding
-OPTIONAL MATCH (artifact:Artifact)-[:CONTAINS]->(codeUnit)
-RETURN DISTINCT coalesce(codeUnit.fqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName
+OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit)
+   WITH *, replace(last(split(artifact.fileName, '/')), '.jar', '') AS artifactName
+OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit)
+   WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName   
+ RETURN DISTINCT 
+        coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName
+       ,coalesce(codeUnit.name, replace(last(split(codeUnit.fileName, '/')), '.jar', '')) AS shortCodeUnitName
+       ,coalesce(artifactName, projectName) AS projectName
      ,coalesce(codeUnit.communityLeidenId, 0) AS communityId
      ,coalesce(codeUnit.centralityPageRank, 0.01) AS centrality
-     ,replace(last(split(artifact.fileName, '/')), '.jar', '') AS artifactName
      ,embedding
diff --git a/jupyter/NodeEmbeddings.ipynb b/jupyter/NodeEmbeddings.ipynb
@@ -108,6 +108,28 @@
     "    return pd.DataFrame([r.values() for r in records], columns=keys)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bd1d9775",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def query_first_non_empty_cypher_to_data_frame(*filenames : str, parameters: typ.Optional[typ.Dict[str, typ.Any]] = None):\n",
+    "    \"\"\"\n",
+    "    Executes the Cypher queries of the given files and returns the first result that is not empty.\n",
+    "    If all given file names result in empty results, the last (empty) result will be returned.\n",
+    "    The given \"parameters\" are passed on unchanged to every query that gets executed.\n",
+    "    \"\"\"\n",
+    "    result=pd.DataFrame()\n",
+    "    for filename in filenames:\n",
+    "        result=query_cypher_to_data_frame(filename, parameters)\n",
+    "        if not result.empty:\n",
+    "            print(\"The results have been provided by the query filename: \" + filename)\n",
+    "            return result\n",
+    "    return result"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -177,7 +199,8 @@
     "        empty_result = pd.DataFrame(columns=[\"codeUnitName\", 'projectName', 'communityId', 'centrality', 'embedding'])\n",
     "        return empty_result\n",
     "\n",
-    "    embeddings = query_cypher_to_data_frame(cypher_file_name, parameters)\n",
+    "    existing_embeddings_query_filename=\"../cypher/Node_Embeddings/Node_Embeddings_0a_Query_Calculated.cypher\"\n",
+    "    embeddings = query_first_non_empty_cypher_to_data_frame(existing_embeddings_query_filename, cypher_file_name, parameters=parameters)\n",
     "    display(embeddings.head()) # Display the first entries of the table\n",
     "    return embeddings"
    ]
@@ -210,7 +233,7 @@
     "    # and the code unit and artifact name of the query above as preparation for the plot\n",
     "    node_embeddings_for_visualization = pd.DataFrame(data = {\n",
     "        \"codeUnit\": embeddings.codeUnitName,\n",
-    "        \"artifact\": embeddings.artifactName,\n",
+    "        \"artifact\": embeddings.projectName,\n",
     "        \"communityId\": embeddings.communityId,\n",
     "        \"centrality\": embeddings.centrality,\n",
     "        \"x\": [value[0] for value in two_dimension_node_embeddings],\n",
@@ -316,7 +339,8 @@
     "    \"dependencies_projection\": \"java-package-embeddings-notebook\",\n",
     "    \"dependencies_projection_node\": \"Package\",\n",
     "    \"dependencies_projection_weight_property\": \"weight25PercentInterfaces\",\n",
-    "    \"dependencies_projection_embedding_dimension\":\"64\" \n",
+    "    \"dependencies_projection_write_property\": \"embeddingsFastRandomProjection\",\n",
+    "    \"dependencies_projection_embedding_dimension\":\"64\"\n",
     "}\n",
     "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher\", java_package_embeddings_parameters)\n"
    ]
@@ -396,6 +420,7 @@
     "    \"dependencies_projection\": \"typescript-module-embeddings-notebook\",\n",
     "    \"dependencies_projection_node\": \"Module\",\n",
     "    \"dependencies_projection_weight_property\": \"lowCouplingElement25PercentWeight\",\n",
+    "    \"dependencies_projection_write_property\": \"embeddingsFastRandomProjection\",\n",
     "    \"dependencies_projection_embedding_dimension\":\"64\" \n",
     "}\n",
     "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher\", typescript_module_embeddings_parameters)\n"