From 9eec0cc704ab185030f0688e8ea7449386f2ff4b Mon Sep 17 00:00:00 2001 From: JohT Date: Thu, 6 Jun 2024 09:01:53 +0200 Subject: [PATCH 1/2] Ignore local conda environment data --- .gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index d4830a7c3..ba8fd3cce 100644 --- a/.gitignore +++ b/.gitignore @@ -91,4 +91,7 @@ coverage/ # Jupyter Notebook .ipynb_checkpoints -*.nbconvert* \ No newline at end of file +*.nbconvert* + +# Python environments +.conda \ No newline at end of file From 74364b5a60f3ff1bdf1b8a0a6e4aea34412cbd07 Mon Sep 17 00:00:00 2001 From: JohT Date: Thu, 6 Jun 2024 09:26:05 +0200 Subject: [PATCH 2/2] Migrate from sklearn to openTSNE --- README.md | 2 +- jupyter/NodeEmbeddingsJava.ipynb | 27 +++++++++++--------------- jupyter/NodeEmbeddingsTypescript.ipynb | 25 ++++++++++-------------- jupyter/environment.yml | 2 +- 4 files changed, 23 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index 0b51db815..3df79e209 100644 --- a/README.md +++ b/README.md @@ -120,7 +120,7 @@ The [Code Structure Analysis Pipeline](./.github/workflows/java-code-analysis.ym - [pip](https://pip.pypa.io/en/stable) - [monotonic](https://github.com/atdt/monotonic) - [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver) - - [sklearn](https://scikit-learn.org) + - [openTSNE](https://github.com/pavlin-policar/openTSNE) - [wordcloud](https://github.com/amueller/word_cloud) - [Graph Visualization](./graph-visualization/README.md) uses [node.js](https://nodejs.org/de) and the dependencies listed in [package.json](./graph-visualization/package.json). 
diff --git a/jupyter/NodeEmbeddingsJava.ipynb b/jupyter/NodeEmbeddingsJava.ipynb index d2c4afac7..071b28efc 100644 --- a/jupyter/NodeEmbeddingsJava.ipynb +++ b/jupyter/NodeEmbeddingsJava.ipynb @@ -58,7 +58,7 @@ "import matplotlib.pyplot as plot\n", "import typing as typ\n", "import numpy as np\n", - "from sklearn.manifold import TSNE\n", + "from openTSNE.sklearn import TSNE\n", "from neo4j import GraphDatabase" ] }, @@ -69,9 +69,9 @@ "metadata": {}, "outputs": [], "source": [ - "import sklearn\n", - "print('The scikit-learn version is {}.'.format(sklearn.__version__))\n", - "print('The pandas version is {}.'.format(pd.__version__))\n" + "from openTSNE import __version__ as openTSNE_version\n", + "print('The openTSNE version is: {}'.format(openTSNE_version))\n", + "print('The pandas version is: {}'.format(pd.__version__))\n" ] }, { @@ -231,7 +231,7 @@ "\n", "> It converts similarities between data points to joint probabilities and tries to minimize the Kullback-Leibler divergence between the joint probabilities of the low-dimensional embedding and the high-dimensional data.\n", "\n", - "(see https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html#sklearn.manifold.TSNE)" + "(see https://opentsne.readthedocs.io)" ] }, { @@ -245,7 +245,7 @@ " \"\"\"\n", " Reduces the dimensionality of the node embeddings (e.g. 
64 floating point numbers in an array)\n", " to two dimensions for 2D visualization.\n", - " see https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html#sklearn.manifold.TSNE\n", + " see https://opentsne.readthedocs.io\n", " \"\"\"\n", "\n", " if embeddings.empty: \n", @@ -258,16 +258,9 @@ " # See https://bobbyhadz.com/blog/python-attributeerror-list-object-has-no-attribute-shape\n", " embeddings_as_numpy_array = np.array(embeddings.embedding.to_list())\n", "\n", - " # The parameter \"perplexity\" needs to be smaller than the sample size\n", - " # See https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html\n", - " number_of_nodes=embeddings.shape[0]\n", - " perplexity = min(number_of_nodes - 1.0, 30.0)\n", - " print(\"t-SNE: Sample size (Number of nodes)={size}\".format(size = number_of_nodes))\n", - " print(\"t-SNE: perplexity={perplexity}\".format(perplexity=perplexity))\n", - "\n", " # Use t-distributed stochastic neighbor embedding (t-SNE) to reduce the dimensionality \n", " # of the previously calculated node embeddings to 2 dimensions for visualization\n", - " t_distributed_stochastic_neighbor_embedding = TSNE(n_components=2, perplexity=perplexity, verbose=1, random_state=50)\n", + " t_distributed_stochastic_neighbor_embedding = TSNE(n_components=2, verbose=1, random_state=47)\n", " two_dimension_node_embeddings = t_distributed_stochastic_neighbor_embedding.fit_transform(embeddings_as_numpy_array)\n", " display(two_dimension_node_embeddings.shape) # Display the shape of the t-SNE result\n", "\n", @@ -365,7 +358,9 @@ "source": [ "### 1.1 Generate Node Embeddings using Fast Random Projection (Fast RP) for Java Packages\n", "\n", - "[Fast Random Projection](https://neo4j.com/docs/graph-data-science/current/machine-learning/node-embeddings/fastrp) is used to reduce the dimensionality of the node feature space while preserving most of the distance information. 
Nodes with similar neighborhood result in node embedding with similar vectors." + "[Fast Random Projection](https://neo4j.com/docs/graph-data-science/current/machine-learning/node-embeddings/fastrp) is used to reduce the dimensionality of the node feature space while preserving most of the distance information. Nodes with similar neighborhood result in node embedding with similar vectors.\n", + "\n", + "**👉 Hint:** To skip existing node embeddings and always calculate them based on the parameters below, edit `Node_Embeddings_0a_Query_Calculated` so that it won't return any results." ] }, { @@ -511,7 +506,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.11.9" }, "title": "Object Oriented Design Quality Metrics for Java with Neo4j" }, diff --git a/jupyter/NodeEmbeddingsTypescript.ipynb b/jupyter/NodeEmbeddingsTypescript.ipynb index 775c77733..e7b3b5df9 100644 --- a/jupyter/NodeEmbeddingsTypescript.ipynb +++ b/jupyter/NodeEmbeddingsTypescript.ipynb @@ -58,7 +58,7 @@ "import matplotlib.pyplot as plot\n", "import typing as typ\n", "import numpy as np\n", - "from sklearn.manifold import TSNE\n", + "from openTSNE.sklearn import TSNE\n", "from neo4j import GraphDatabase" ] }, @@ -69,8 +69,8 @@ "metadata": {}, "outputs": [], "source": [ - "import sklearn\n", - "print('The scikit-learn version is {}.'.format(sklearn.__version__))\n", + "from openTSNE import __version__ as openTSNE_version\n", + "print('The openTSNE version is: {}'.format(openTSNE_version))\n", "print('The pandas version is {}.'.format(pd.__version__))\n" ] }, @@ -231,7 +231,7 @@ "\n", "> It converts similarities between data points to joint probabilities and tries to minimize the Kullback-Leibler divergence between the joint probabilities of the low-dimensional embedding and the high-dimensional data.\n", "\n", - "(see https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html#sklearn.manifold.TSNE)" + "(see 
https://opentsne.readthedocs.io)" ] }, { @@ -245,7 +245,7 @@ " \"\"\"\n", " Reduces the dimensionality of the node embeddings (e.g. 32 floating point numbers in an array)\n", " to two dimensions for 2D visualization.\n", - " see https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html#sklearn.manifold.TSNE\n", + " see https://opentsne.readthedocs.io\n", " \"\"\"\n", "\n", " if embeddings.empty: \n", @@ -258,16 +258,9 @@ " # See https://bobbyhadz.com/blog/python-attributeerror-list-object-has-no-attribute-shape\n", " embeddings_as_numpy_array = np.array(embeddings.embedding.to_list())\n", "\n", - " # The parameter \"perplexity\" needs to be smaller than the sample size\n", - " # See https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html\n", - " number_of_nodes=embeddings.shape[0]\n", - " perplexity = min(number_of_nodes - 1.0, 30.0)\n", - " print(\"t-SNE: Sample size (Number of nodes)={size}\".format(size = number_of_nodes))\n", - " print(\"t-SNE: perplexity={perplexity}\".format(perplexity=perplexity))\n", - "\n", " # Use t-distributed stochastic neighbor embedding (t-SNE) to reduce the dimensionality \n", " # of the previously calculated node embeddings to 2 dimensions for visualization\n", - " t_distributed_stochastic_neighbor_embedding = TSNE(n_components=2, perplexity=perplexity, verbose=1, random_state=50)\n", + " t_distributed_stochastic_neighbor_embedding = TSNE(n_components=2, verbose=1, random_state=47)\n", " two_dimension_node_embeddings = t_distributed_stochastic_neighbor_embedding.fit_transform(embeddings_as_numpy_array)\n", " display(two_dimension_node_embeddings.shape) # Display the shape of the t-SNE result\n", "\n", @@ -365,7 +358,9 @@ "source": [ "### 1.1 Generate Node Embeddings for Typescript Modules using Fast Random Projection (Fast RP)\n", "\n", - "[Fast Random Projection](https://neo4j.com/docs/graph-data-science/current/machine-learning/node-embeddings/fastrp) is used to reduce the dimensionality of 
the node feature space while preserving most of the distance information. Nodes with similar neighborhood result in node embedding with similar vectors." + "[Fast Random Projection](https://neo4j.com/docs/graph-data-science/current/machine-learning/node-embeddings/fastrp) is used to reduce the dimensionality of the node feature space while preserving most of the distance information. Nodes with similar neighborhood result in node embedding with similar vectors.\n", + "\n", + "**👉 Hint:** To skip existing node embeddings and always calculate them based on the parameters below, edit `Node_Embeddings_0a_Query_Calculated` so that it won't return any results." ] }, { @@ -514,7 +509,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.11.9" }, "title": "Object Oriented Design Quality Metrics for Java with Neo4j" }, diff --git a/jupyter/environment.yml b/jupyter/environment.yml index fcbace35c..9f412aae8 100644 --- a/jupyter/environment.yml +++ b/jupyter/environment.yml @@ -10,7 +10,7 @@ dependencies: - numpy=1.23.* - pandas=1.5.* - pip=22.3.* - - scikit-learn=1.3.* # NodeEmbeddings.ipynb uses sklearn.manifold.TSNE + - opentsne=1.0.* # to visualize node embeddings in 2D (t-SNE dimensionality reduction) - pip: - monotonic==1.* - wordcloud==1.9.*