diff --git a/.github/workflows/check-links-in-documentation.yml b/.github/workflows/check-links-in-documentation.yml new file mode 100644 index 000000000..1e37a93ab --- /dev/null +++ b/.github/workflows/check-links-in-documentation.yml @@ -0,0 +1,29 @@ +name: Check links in documentation + +on: + pull_request: + branches: + - main + # Only watch root level Markdown documentation file changes + paths: + - 'README.md' + - 'COMMANDS.md' + - 'GETTING_STARTED.md' + - '.github/workflows/check-links-in-documentation.yml' # also run when this file was changed + schedule: + - cron: "15 6 1 * *" # On the first day of each month at 6:15 o'clock + +jobs: + reports: + runs-on: ubuntu-latest + steps: + - name: Checkout GIT Repository + uses: actions/checkout@v4 + + - name: Setup node.js + uses: actions/setup-node@v4 + with: + node-version-file: '.nvmrc' + + - name: Check links in top level documentation Markdown files + run: npx --yes markdown-link-check --config=markdown-lint-check-config.json README.md COMMANDS.md GETTING_STARTED.md diff --git a/.github/workflows/java-code-analysis.yml b/.github/workflows/java-code-analysis.yml index 1d78b8fc1..7b98fb1c4 100644 --- a/.github/workflows/java-code-analysis.yml +++ b/.github/workflows/java-code-analysis.yml @@ -45,7 +45,6 @@ jobs: java: 17 python: 3.11 mambaforge: 24.3.0-0 - node: 18 env: CI_COMMIT_MESSAGE: Automated code structure analysis reports (CI) @@ -66,12 +65,12 @@ jobs: distribution: 'adopt' java-version: ${{ matrix.java }} - - name: Setup node.js ${{ matrix.node }} for Graph Visualization + - name: Setup Node.js for Graph Visualization uses: actions/setup-node@v4 with: - node-version: ${{ matrix.node }} + node-version-file: 'graph-visualization/.nvmrc' - - name: Install nodes packages for Graph Visualization + - name: Install Node packages for Graph Visualization working-directory: graph-visualization run: npm ci diff --git a/.github/workflows/typescript-code-analysis.yml b/.github/workflows/typescript-code-analysis.yml 
index 2b2d579ab..7a6f851a0 100644 --- a/.github/workflows/typescript-code-analysis.yml +++ b/.github/workflows/typescript-code-analysis.yml @@ -45,7 +45,6 @@ jobs: java: 17 python: 3.11 mambaforge: 24.3.0-0 - node: 18 env: CI_COMMIT_MESSAGE: Automated code structure analysis reports (CI) @@ -66,12 +65,12 @@ jobs: distribution: 'adopt' java-version: ${{ matrix.java }} - - name: Setup node.js ${{ matrix.node }} for Graph Visualization + - name: Setup Node.js for Graph Visualization uses: actions/setup-node@v4 with: - node-version: ${{ matrix.node }} + node-version-file: 'graph-visualization/.nvmrc' - - name: Install nodes packages for Graph Visualization + - name: Install Node packages for Graph Visualization working-directory: graph-visualization run: npm ci diff --git a/.nvmrc b/.nvmrc new file mode 100644 index 000000000..5871e601c --- /dev/null +++ b/.nvmrc @@ -0,0 +1 @@ +v20.12.1 \ No newline at end of file diff --git a/COMMANDS.md b/COMMANDS.md index 23b032401..5c39bdd71 100644 --- a/COMMANDS.md +++ b/COMMANDS.md @@ -1,9 +1,53 @@ # Code Graph Analysis Pipeline - Commands -## Start an analysis + + +- [Start an Analysis](#start-an-analysis) + - [Command Line Options](#command-line-options) + - [Notes](#notes) + - [Examples](#examples) + - [Start an analysis with CSV reports only](#start-an-analysis-with-csv-reports-only) + - [Start an analysis with Jupyter reports only](#start-an-analysis-with-jupyter-reports-only) + - [Start an analysis with PDF generation](#start-an-analysis-with-pdf-generation) + - [Only run setup and explore the Graph manually](#only-run-setup-and-explore-the-graph-manually) +- [Generate Markdown References](#generate-markdown-references) + - [Generate Cypher Reference](#generate-cypher-reference) + - [Generate Script Reference](#generate-script-reference) + - [Generate CSV Cypher Query Report Reference](#generate-csv-cypher-query-report-reference) + - [Generate Jupyter Notebook Report Reference](#generate-jupyter-notebook-report-reference) + 
- [Generate Image Reference](#generate-image-reference) + - [Generate Environment Variable Reference](#generate-environment-variable-reference) +- [Validate Links in Markdown](#validate-links-in-markdown) +- [Manual Setup](#manual-setup) + - [Setup Neo4j Graph Database](#setup-neo4j-graph-database) + - [Start Neo4j Graph Database](#start-neo4j-graph-database) + - [Setup jQAssistant Java Code Analyzer](#setup-jqassistant-java-code-analyzer) + - [Download Maven Artifacts to Analyze](#download-maven-artifacts-to-analyze) + - [Reset the database and scan the java artifacts](#reset-the-database-and-scan-the-java-artifacts) +- [Database Queries](#database-queries) + - [Cypher Shell](#cypher-shell) + - [HTTP API](#http-api) + - [executeQueryFunctions.sh](#executequeryfunctionssh) +- [Stop Neo4j](#stop-neo4j) +- [Jupyter Notebook](#jupyter-notebook) + - [Create a report with executeJupyterNotebookReport.sh](#create-a-report-with-executejupyternotebookreportsh) + - [Data Availability Validation](#data-availability-validation) + - [Execute a Notebook with executeJupyterNotebook.sh](#execute-a-notebook-with-executejupyternotebooksh) + - [Manually setup the environment using Conda](#manually-setup-the-environment-using-conda) + - [Executing Jupyter Notebooks with nbconvert](#executing-jupyter-notebooks-with-nbconvert) +- [References](#references) +- [Other Commands](#other-commands) + - [Information about a process that listens to a specific local port](#information-about-a-process-that-listens-to-a-specific-local-port) + - [Kill process that listens to a specific local port](#kill-process-that-listens-to-a-specific-local-port) + - [Memory Estimation](#memory-estimation) + + + +## Start an Analysis An analysis is started with the script [analyze.sh](./scripts/analysis/analyze.sh). 
To run all analysis steps simple execute the following command: + ```shell ./../../scripts/analysis/analyze.sh ``` @@ -55,7 +99,7 @@ Note: Generating a PDF from a Jupyter notebook using [nbconvert](https://nbconve ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION=true ./../../scripts/analysis/analyze.sh ``` -#### Setup everything to explore the graph manually +#### Only run setup and explore the Graph manually To prepare everything for analysis including installation, configuration and preparation queries to explore the graph manually without report generation use this command: @@ -123,6 +167,14 @@ Change into the [scripts](./scripts/) directory e.g. with `cd scripts` and then ./documentation/generateEnvironmentVariableReference.sh ``` +## Validate Links in Markdown + +The following command shows how to use [markdown-link-check](https://github.com/tcort/markdown-link-check) to for example check the links in the [README.md](./README.md) file: + +```script +npx --yes markdown-link-check --quiet --progress --config=markdown-lint-check-config.json README.md COMMANDS.md GETTING_STARTED.md +``` + ## Manual Setup The manual setup is only documented for completeness. It isn't needed since the analysis also covers download, installation and configuration of all needed tools. @@ -141,7 +193,7 @@ It runs the script with a temporary `NEO4J_HOME` environment variable to not int ### Setup jQAssistant Java Code Analyzer -Use [setupJQAssistant.sh](./scripts/setupJQAssistant.sh) to download [jQAssistant](https://jqassistant.org/get-started). +Use [setupJQAssistant.sh](./scripts/setupJQAssistant.sh) to download [jQAssistant](https://jqassistant.github.io/jqassistant/doc). ### Download Maven Artifacts to analyze @@ -200,7 +252,7 @@ Query parameters can be added as arguments after the file name. 
Here is an examp ./scripts/executeQuery.sh ./cypher/Get_Graph_Data_Science_Library_Version.cypher a=1 ``` -### executeQueryFunctions +### executeQueryFunctions.sh The script [executeQueryFunctions.sh](./scripts/executeQueryFunctions.sh) contains functions to simplify the call of [executeQuery.sh](./scripts/executeQuery.sh) for different purposes. For example, `execute_cypher_summarized` @@ -221,7 +273,41 @@ Use [stopNeo4j.sh](./scripts/stopNeo4j.sh) to stop the locally running Neo4j Gra ## Jupyter Notebook -### Commands +### Create a report with executeJupyterNotebookReport.sh + +The script [executeJupyterNotebookReport.sh](./scripts/executeJupyterNotebookReport.sh) combines: + +- creating a directory within the "reports" directory +- data availability validation using [executeQueryFunctions.sh](#executequeryfunctionssh) +- executing and converting the given Notebook using [executeJupyterNotebook.sh](#execute-a-notebook-with-executejupyternotebooksh) + +Here is an example on how to run the report [Wordcloud.ipynb](./jupyter/Wordcloud.ipynb): + +```shell +./scripts/executeJupyterNotebookReport.sh --jupyterNotebook Wordcloud.ipynb +``` + +#### Data Availability Validation + +[Jupyter Notebooks](https://jupyter.org) can have additional custom tags within their [metadata section](https://ipython.readthedocs.io/en/3.x/notebook/nbformat.html#metadata). Opening these files with a text editor unveils that typically at the end of the file. Some editors also support editing them directly. Here, the optional metadata property `code_graph_analysis_pipeline_data_validation` is used to specify which data validation query in the [cypher/Validation](./cypher/Validation/) directory should be used. Without this property, the data validation step is skipped. If a validation is specified, it will be executed before the Jupyter Notebook is executed. If the query has at least one result, the validation is seen as successful. Otherwise, the Jupyter Notebook will not be executed. 
+ + This is helpful for Jupyter Notebook reports that are specific to a programming language or other specific data prerequisites. The Notebook will be skipped if there is no data available which would otherwise lead to confusing and distracting reports with empty tables and figures. + + You can search the messages `Validation succeeded` or `Validation failed` inside the log to get detailed information about which Notebook had been skipped for which reason. + + ### Execute a Notebook with executeJupyterNotebook.sh + + [executeJupyterNotebook.sh](./scripts/executeJupyterNotebook.sh) executes a Jupyter Notebook in the command line and converts it to different formats like Markdown and PDF (optionally). It takes care of [setting up the environment](#manually-setup-the-environment-using-conda) and [uses nbconvert](#executing-jupyter-notebooks-with-nbconvert) to execute the notebook and convert it to other file formats under the hood. + + Here is an example on how to use [executeJupyterNotebook.sh](./scripts/executeJupyterNotebook.sh) to for example run [Wordcloud.ipynb](./jupyter/Wordcloud.ipynb): + + ```shell + ./scripts/executeJupyterNotebook.sh ./jupyter/Wordcloud.ipynb + ``` + + ### Manually setup the environment using Conda + + [Conda](https://conda.io) provides package, dependency, and environment management for any language. Here, it is used to set up the environment for Jupyter Notebooks. 
- Setup environment @@ -230,10 +316,10 @@ Use [stopNeo4j.sh](./scripts/stopNeo4j.sh) to stop the locally running Neo4j Gra conda activate codegraph ``` - or by using the environment file [codegraph-environment.yml](./jupyter/codegraph-environment.yml): + or by using the environment file [codegraph-environment.yml](./jupyter/environment.yml): ```shell - conda env create --file ./jupyter/codegraph-environment.yml + conda env create --file ./jupyter/environment.yml conda activate codegraph ``` @@ -246,9 +332,13 @@ Use [stopNeo4j.sh](./scripts/stopNeo4j.sh) to stop the locally running Neo4j Gra - Export only explicit environment.yml ```shell - conda env export --from-history --name codegraph | grep -v "^prefix: " > codegraph-environment.yml + conda env export --from-history --name codegraph | grep -v "^prefix: " > explicit-codegraph-environment.yml ``` +### Executing Jupyter Notebooks with nbconvert + +[nbconvert](https://nbconvert.readthedocs.io) converts Jupyter Notebooks to other static formats including HTML, LaTeX, PDF, Markdown, reStructuredText, and more. 
+ - Install pandoc used by nbconvert for LaTeX support (Mac) ```shell @@ -273,23 +363,19 @@ Use [stopNeo4j.sh](./scripts/stopNeo4j.sh) to stop the locally running Neo4j Gra jupyter nbconvert --to pdf ./jupyter/first-neo4j-tryout.nbconvert.ipynb ``` -- Shell script to execute and convert a Jupyter notebook file - - Use [executeJupyterNotebook.sh](./scripts/executeJupyterNotebook.sh) like this: - - ```shell - ./scripts/executeJupyterNotebook.sh ./jupyter/first-neo4j-tryout.ipynb - ``` - ## References -- [Managing environments with Conda](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) +- [Conda](https://conda.io) +- [jQAssistant](https://jqassistant.github.io/jqassistant/doc) +- [Jupyter Notebook](https://jupyter.org) - [Jupyter Notebook - Using as a command line tool](https://nbconvert.readthedocs.io/en/latest/usage.html) - [Jupyter Notebook - Installing TeX for PDF conversion](https://nbconvert.readthedocs.io/en/latest/install.html#installing-tex) -- [Integrate Neo4j with Jupyter notebook](https://medium.com/@technologydata25/connect-neo4j-to-jupyter-notebook-c178f716d6d5) +- [Jupyter Notebook Format - Metadata](https://ipython.readthedocs.io/en/3.x/notebook/nbformat.html#metadata) +- [Integrate Neo4j with Jupyter Notebook](https://medium.com/@technologydata25/connect-neo4j-to-jupyter-notebook-c178f716d6d5) - [Hello World](https://nicolewhite.github.io/neo4j-jupyter/hello-world.html) -- [py2neo](https://pypi.org/project/py2neo/) -- [The Py2neo Handbook](https://py2neo.org/2021.1/) +- [Managing environments with Conda](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) +- [Neo4j - Download](https://neo4j.com/download-center) +- [Neo4j - HTTP API](https://neo4j.com/docs/http-api/current/query) - [How to Use Conda With Github Actions](https://autobencoder.com/2020-08-24-conda-actions) - [Older database download link (neo4j 
community)](https://community.neo4j.com/t/older-database-download-link/43334/9) diff --git a/GETTING_STARTED.md b/GETTING_STARTED.md index f66fcb409..fe6aa8ff5 100644 --- a/GETTING_STARTED.md +++ b/GETTING_STARTED.md @@ -6,7 +6,7 @@ For more details on how the commands work in detail see [COMMANDS](./COMMANDS.md ## πŸ›  Prerequisites -Please read through the [Prerequisites](./README.md#πŸ› -prerequisites) in the [README](./README.md) file for what is required to run the scripts. +Please read through the [Prerequisites](./README.md#hammer_and_wrench-prerequisites) in the [README](./README.md) file for what is required to run the scripts. ## Start an analysis @@ -44,7 +44,7 @@ Please read through the [Prerequisites](./README.md#πŸ› -prerequisites) in the [ ./../../scripts/downloader/downloadAxonFramework.sh ``` -1. Optionally use a script to download artifacts from Maven ([details](#download-maven-artifacts-to-analyze)) +1. Optionally use a script to download artifacts from Maven ([details](./COMMANDS.md#download-maven-artifacts-to-analyze)) 1. Start the analysis diff --git a/README.md b/README.md index 27f087351..6674c0b23 100644 --- a/README.md +++ b/README.md @@ -2,11 +2,11 @@ -Contained within this repository is a comprehensive and automated code graph analysis pipeline. While initially designed to support Java through the utilization of [jQAssistant](https://jqassistant.org/get-started), it now also [supports Typescript](https://github.com/jqassistant-plugin/jqassistant-typescript-plugin) and is open to extension for further programming languages. The graph database [Neo4j](https://neo4j.com) serves as the foundation for storing and querying the graph, which encompasses all the structural intricacies of the analyzed code. Additionally, Neo4j's [Graph Data Science](https://neo4j.com/product/graph-data-science) provides additional algorithms like community detection to analyze the code structure. 
The generated reports offer flexibility, ranging from simple query results presented as CSV files to more elaborate Jupyter Notebooks converted to Markdown or PDF formats. +Contained within this repository is a comprehensive and automated code graph analysis pipeline. While initially designed to support Java through the utilization of [jQAssistant](https://jqassistant.github.io/jqassistant/doc), it now also [supports Typescript](https://github.com/jqassistant-plugin/jqassistant-typescript-plugin) and is open to extension for further programming languages. The graph database [Neo4j](https://neo4j.com) serves as the foundation for storing and querying the graph, which encompasses all the structural intricacies of the analyzed code. Additionally, Neo4j's [Graph Data Science](https://neo4j.com/product/graph-data-science) provides additional algorithms like community detection to analyze the code structure. The generated reports offer flexibility, ranging from simple query results presented as CSV files to more elaborate Jupyter Notebooks converted to Markdown or PDF formats. --- -## ✨ Features +## :sparkles: Features - Analyze static code structure as a graph - **🌟New🌟:** Also supports Typescript @@ -22,38 +22,38 @@ Contained within this repository is a comprehensive and automated code graph ana - Comprehensive list of [Cypher queries](./cypher/CYPHER.md) - Example analysis for [AxonFramework](https://github.com/AxonFramework/AxonFramework) -### πŸ“– Jupyter Notebook Reports +### :book: Jupyter Notebook Reports -Here is an overview of reports made with [Jupyter Notebooks](https://jupyter.org). For a detailed reference see [Jupyter Notebook Report Reference](#πŸ“ˆ-jupyter-notebook-report-reference) below. +Here is an overview of reports made with [Jupyter Notebooks](https://jupyter.org). 
For a detailed reference see [Jupyter Notebook Report Reference](#page_with_curl-jupyter-notebook-report-reference) -- [External Dependencies](./results/AxonFramework-4.9.3/external-dependencies/ExternalDependencies.md) contains detailed information about external library usage ([Notebook](./jupyter/ExternalDependencies.ipynb)). -- [Internal Dependencies](./results/AxonFramework-4.9.3/internal-dependencies/InternalDependencies.md) is based on [Analyze java package metrics in a graph database](https://joht.github.io/johtizen/data/2023/04/21/java-package-metrics-analysis.html) and also includes cyclic dependencies ([Notebook](./jupyter/InternalDependencies.ipynb)). -- [Method Metrics](./results/AxonFramework-4.9.3/method-metrics/MethodMetrics.md) shows how the effective number of lines of code and the cyclomatic complexity are distributed across the methods in the code ([Notebook](./jupyter/MethodMetrics.ipynb)). -- [Node Embeddings](./results/AxonFramework-4.9.3/node-embeddings/NodeEmbeddings.md) shows how to generate node embeddings and to further reduce their dimensionality to be able to visualize them in a 2D plot ([Notebook](./jupyter/NodeEmbeddings.ipynb)). -- [Object Oriented Design Quality Metrics](./results/AxonFramework-4.9.3/object-oriented-design-metrics/ObjectOrientedDesignMetrics.md) is based on [OO Design Quality Metrics by Robert Martin](https://www.semanticscholar.org/paper/OO-Design-Quality-Metrics-Martin-October/18acd7eb21b918c8a5f619157f7e4f6d451d18f8) ([Notebook](./jupyter/ObjectOrientedDesignMetrics.ipynb)). -- [Overview](./results/AxonFramework-4.9.3/overview/Overview.md) contains overall statistics and details about methods and their complexity. ([Notebook](./jupyter/Overview.ipynb)). 
-- [Visibility Metrics](./results/AxonFramework-4.9.3/visibility-metrics/VisibilityMetrics.md) is based on [Visibility Metrics and the Importance of Hiding Things](https://dzone.com/articles/visibility-metrics-and-the-importance-of-hiding-th) ([Notebook](./jupyter/VisibilityMetrics.ipynb)). +- [External Dependencies](./results/AxonFramework-4.9.3/external-dependencies/ExternalDependencies.md) contains detailed information about external library usage ([Notebook](./jupyter/ExternalDependenciesJava.ipynb)). +- [Internal Dependencies](./results/AxonFramework-4.9.3/internal-dependencies/InternalDependencies.md) is based on [Analyze java package metrics in a graph database](https://joht.github.io/johtizen/data/2023/04/21/java-package-metrics-analysis.html) and also includes cyclic dependencies ([Notebook](./jupyter/InternalDependenciesJava.ipynb)). +- [Method Metrics](./results/AxonFramework-4.9.3/method-metrics/MethodMetrics.md) shows how the effective number of lines of code and the cyclomatic complexity are distributed across the methods in the code ([Notebook](./jupyter/MethodMetricsJava.ipynb)). +- [Node Embeddings](./results/AxonFramework-4.9.3/node-embeddings/NodeEmbeddings.md) shows how to generate node embeddings and to further reduce their dimensionality to be able to visualize them in a 2D plot ([Notebook](./jupyter/NodeEmbeddingsJava.ipynb)). +- [Object Oriented Design Quality Metrics](./results/AxonFramework-4.9.3/object-oriented-design-metrics/ObjectOrientedDesignMetrics.md) is based on [OO Design Quality Metrics by Robert Martin](https://api.semanticscholar.org/CorpusID:18246616) ([Notebook](./jupyter/ObjectOrientedDesignMetricsJava.ipynb)). +- [Overview](./results/AxonFramework-4.9.3/overview/Overview.md) contains overall statistics and details about methods and their complexity. ([Notebook](./jupyter/OverviewJava.ipynb)). 
+- [Visibility Metrics](./results/AxonFramework-4.9.3/visibility-metrics/VisibilityMetrics.md) ([Notebook](./jupyter/VisibilityMetricsJava.ipynb)). - [Wordcloud](./results/AxonFramework-4.9.3/wordcloud/Wordcloud.md) contains a visual representation of package and class names ([Notebook](./jupyter/Wordcloud.ipynb)). -### πŸ“– Graph Data Science Reports +### :book: Graph Data Science Reports -Here are some reports that utilize Neo4j's [Graph Data Science Library](https://neo4j.com/product/graph-data-science). For a detailed reference of all CSV reports see [CSV Cypher Query Report Reference](#πŸ“ƒ-csv-cypher-query-report-reference) below. +Here are some reports that utilize Neo4j's [Graph Data Science Library](https://neo4j.com/product/graph-data-science). For a detailed reference of all CSV reports see [CSV Cypher Query Report Reference](#page_with_curl-csv-cypher-query-report-reference) -- [Centrality with Page Rank](./results/AxonFramework-4.9.3/centrality-csv/Centrality_Page_Rank.csv) ([Source Script](./scripts/reports/CentralityCsv.sh)) -- [Community Detection with Leiden](./results/AxonFramework-4.9.3/community-csv/Leiden_Communities.csv) ([Source Script](./scripts/reports/CommunityCsv.sh)) +- [Centrality with Page Rank](./results/AxonFramework-4.9.3/centrality-csv/Package_Centrality_Page_Rank.csv) ([Source Script](./scripts/reports/CentralityCsv.sh)) +- [Community Detection with Leiden](./results/AxonFramework-4.9.3/community-csv/Package_communityLeidenId_Community__Metrics.csv) ([Source Script](./scripts/reports/CommunityCsv.sh)) - [Node Embeddings with HashGNN](./results/AxonFramework-4.9.3/node-embeddings-csv/Package_Embeddings_HashGNN.csv) ([Source Script](./scripts/reports/NodeEmbeddingsCsv.sh)) -- [Similarity with Jaccard](./results/AxonFramework-4.9.3/similarity-csv/Similarity_Jaccard.csv) ([Source Script](./scripts/reports/SimilarityCsv.sh)) -- [Topology Sort](./results/AxonFramework-4.9.3/artifact-topology-csv/TopologicalSortedArtifacts.csv) ([Source 
Script](./scripts/reports/TopologicalSortCsv.sh)) +- [Similarity with Jaccard](./results/AxonFramework-4.9.3/similarity-csv/Package_Similarity.csv) ([Source Script](./scripts/reports/SimilarityCsv.sh)) +- [Topology Sort](./results/AxonFramework-4.9.3/topology-csv/Package_Topological_Sort.csv) ([Source Script](./scripts/reports/TopologicalSortCsv.sh)) -## πŸ“– Blog Articles +## :book: Blog Articles - [Analyze java dependencies with jQAssistant](https://joht.github.io/johtizen/data/2021/02/21/java-jar-dependency-analysis.html) - [Analyze java package metrics in a graph database (Part 2)](https://joht.github.io/johtizen/data/2023/04/21/java-package-metrics-analysis.html) -## πŸ›  Prerequisites +## :hammer_and_wrench: Prerequisites - Java 17 is [required for Neo4j](https://neo4j.com/docs/operations-manual/current/installation/requirements/#deployment-requirements-software) (Neo4j 5.x requirement). -- On Windows it is recommended to use the git bash provided by [git for windows](https://gitforwindows.org). +- On Windows it is recommended to use the git bash provided by [git for windows](https://github.com/git-guides/install-git#install-git-on-windows). - [jq](https://github.com/jqlang/jq) the "lightweight and flexible command-line JSON processor" needs to be installed. Latest releases: https://github.com/jqlang/jq/releases/latest. Check using `jq --version`. - Set environment variable `NEO4J_INITIAL_PASSWORD` to a password of your choice. For example: @@ -93,11 +93,11 @@ This could be as simple as running the following command in your Typescript proj - Copy the resulting json file (e.g. `.reports/jqa/ts-output.json`) into the "artifacts" directory for your analysis work directory. Custom subdirectories within "artifacts" are also supported. -## πŸš€ Getting Started +## :rocket: Getting Started See [GETTING_STARTED.md](./GETTING_STARTED.md) on how to get started on your local machine. 
-## πŸ— Pipeline and Tools +## :building_construction: Pipeline and Tools The [Code Structure Analysis Pipeline](./.github/workflows/java-code-analysis.yml) utilizes [GitHub Actions](https://docs.github.com/de/actions) to automate the whole analysis process: @@ -107,7 +107,7 @@ The [Code Structure Analysis Pipeline](./.github/workflows/java-code-analysis.ym - [Setup Python with Conda](https://github.com/conda-incubator/setup-miniconda) package manager [Mambaforge](https://github.com/conda-forge/miniforge#mambaforge) - Download artifacts that contain the code to be analyzed [scripts/artifacts](./scripts/downloader/) - Setup [Neo4j](https://neo4j.com) Graph Database ([analysis.sh](./scripts/analysis/analyze.sh)) -- Setup [jQAssistant](https://jqassistant.org/get-started) for Java and [Typescript](https://github.com/jqassistant-plugin/jqassistant-typescript-plugin) analysis ([analysis.sh](./scripts/analysis/analyze.sh)) +- Setup [jQAssistant](https://jqassistant.github.io/jqassistant/doc) for Java and [Typescript](https://github.com/jqassistant-plugin/jqassistant-typescript-plugin) analysis ([analysis.sh](./scripts/analysis/analyze.sh)) - Start [Neo4j](https://neo4j.com) Graph Database ([analysis.sh](./scripts/analysis/analyze.sh)) - Generate CSV Reports [scripts/reports](./scripts/reports) using the command line JSON parser [jq](https://jqlang.github.io/jq) - Generate [Jupyter Notebook](https://jupyter.org) reports using these libraries specified in the [environment.yml](./jupyter/environment.yml): @@ -119,46 +119,46 @@ The [Code Structure Analysis Pipeline](./.github/workflows/java-code-analysis.ym - [pandas](https://pandas.pydata.org) - [pip](https://pip.pypa.io/en/stable) - [monotonic](https://github.com/atdt/monotonic) - - [py2neo](https://py2neo.org) + - [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver) - [sklearn](https://scikit-learn.org) - [wordcloud](https://github.com/amueller/word_cloud) - [Graph 
Visualization](./graph-visualization/README.md) uses [node.js](https://nodejs.org/de) and the dependencies listed in [package.json](./graph-visualization/package.json). **Big shout-out** πŸ“£ to all the creators and contributors of these great libraries πŸ‘. Projects like this wouldn't be possible without them. Feel free to [create an issue](https://github.com/JohT/code-graph-analysis-pipeline/issues/new/choose) if something is missing or wrong in the list. -## πŸ›  Command Reference +## :runner: Command Reference [COMMANDS.md](./COMMANDS.md) contains further details on commands and how to do a manual setup. -## πŸ“ƒ CSV Cypher Query Report Reference +## :page_with_curl: CSV Cypher Query Report Reference [CSV_REPORTS.md](./results/CSV_REPORTS.md) lists all CSV Cypher query result reports inside the [results](./results) directory. It can be generated as described in [Generate CSV Report Reference](./COMMANDS.md#generate-csv-cypher-query-report-reference). -## πŸ“ˆ Jupyter Notebook Report Reference +## :page_with_curl: Jupyter Notebook Report Reference [JUPYTER_REPORTS.md](./results/JUPYTER_REPORTS.md) lists all Jupyter Notebook reports inside the [results](./results) directory. It can be generated as described in [Generate Jupyter Notebook Report Reference](./COMMANDS.md#generate-jupyter-notebook-report-reference). -## πŸ“ˆ Image Reference +## :camera: Image Reference [IMAGES.md](./results/IMAGES.md) lists all PNG images inside the [results](./results) directory. It can be generated as described in [Generate Image Reference](./COMMANDS.md#generate-image-reference). -## βš™οΈ Script Reference +## :gear: Script Reference [SCRIPTS.md](./scripts/SCRIPTS.md) lists all shell scripts of this repository including their first comment line as a description. It can be generated as described in [Generate Script Reference](./COMMANDS.md#generate-script-reference). 
-## πŸ”Ž Cypher Query Reference +## :mag: Cypher Query Reference [CYPHER.md](./cypher/CYPHER.md) lists all Cypher queries of this repository including their first comment line as a description. It can be generated as described in [Generate Cypher Reference](./COMMANDS.md#generate-cypher-reference). > [Cypher](https://neo4j.com/docs/getting-started/cypher-intro) is Neo4j’s graph query language that lets you retrieve data from the graph. -## βš™οΈ Environment Variable Reference +## :globe_with_meridians: Environment Variable Reference [ENVIRONMENT_VARIABLES.md](./scripts/ENVIRONMENT_VARIABLES.md) contains all environment variables that are supported by the scripts including default values and description. It can be generated as described in [Generate Environment Variable Reference](./COMMANDS.md#generate-environment-variable-reference). -## πŸ€” Questions & Answers +## :thinking: Questions & Answers - How can i run an analysis locally? - πŸ‘‰ Check the [prerequisites](#πŸ› -prerequisites). + πŸ‘‰ Check the [prerequisites](#hammer_and_wrench-prerequisites). πŸ‘‰ See [Start an analysis](./COMMANDS.md#start-an-analysis) in the [Commands Reference](./COMMANDS.md). πŸ‘‰ To get started from scratch see [GETTING_STARTED.md](./GETTING_STARTED.md). @@ -179,10 +179,10 @@ The [Code Structure Analysis Pipeline](./.github/workflows/java-code-analysis.ym πŸ‘‰ Create a new artifacts download script in the [scripts/downloader](./scripts/downloader/) directory. Take for example [downloadAxonFramework.sh](./scripts/downloader/downloadAxonFramework.sh) as a reference. πŸ‘‰ Run the script separately before executing [analyze.sh](./scripts/analysis/analyze.sh) also in the [pipeline](./.github/workflows/java-code-analysis.yml). -- How can i trigger a full rescan of all artifacts? +- How can i trigger a full re-scan of all artifacts? πŸ‘‰ Delete the file `artifactsChangeDetectionHash.txt` in the `artifacts` directory. 
-- How can PDF generation for Jupyter Notebooks be enabled (depends on chromium, takes more time)? +- How can i enable PDF generation for Jupyter Notebooks (depends on chromium, takes more time)? πŸ‘‰ Set environment variable `ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION` to anything except an empty string. Example: ```shell @@ -195,6 +195,10 @@ The [Code Structure Analysis Pipeline](./.github/workflows/java-code-analysis.ym ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION=true ./../../scripts/analysis/analyze.sh ``` +- Why are some Jupyter Notebook reports skipped? + πŸ‘‰ The custom Jupyter Notebook metadata property `code_graph_analysis_pipeline_data_validation` can be set to choose a query from [cypher/Validation](./cypher/Validation) that will be executed preliminary to the notebook. If the query leads to at least one result, the validation succeeds and the notebook will be run. If the query leads to no result, the notebook will be skipped. + For more details see [Data Availability Validation](./COMMANDS.md#data-availability-validation). 
+ ## πŸ•Έ Web References - [Graph Data Science 101: Understanding Graphs and Graph Data Science](https://techfirst.medium.com/graph-data-science-101-understanding-graphs-and-graph-data-science-c25055a9db01) @@ -205,4 +209,4 @@ The [Code Structure Analysis Pipeline](./.github/workflows/java-code-analysis.ym - [Graph Data Science Community Detection Algorithms](https://neo4j.com/docs/graph-data-science/2.5/algorithms/community) - [Graph Data Science Community Similarity Algorithms](https://neo4j.com/docs/graph-data-science/2.5/algorithms/similarity) - [Graph Data Science Community Topological Sort Algorithm](https://neo4j.com/docs/graph-data-science/2.5/algorithms/dag/topological-sort) -- [Node embeddings for Beginners](https://towardsdatascience.com/node-embeddings-for-beginners-554ab1625d98) \ No newline at end of file +- [Node embeddings for Beginners](https://towardsdatascience.com/node-embeddings-for-beginners-554ab1625d98) diff --git a/cypher/List_all_existing_artifacts.cypher b/cypher/List_all_existing_java_artifacts.cypher similarity index 80% rename from cypher/List_all_existing_artifacts.cypher rename to cypher/List_all_existing_java_artifacts.cypher index b2ed60035..d8d8c11ce 100644 --- a/cypher/List_all_existing_artifacts.cypher +++ b/cypher/List_all_existing_java_artifacts.cypher @@ -1,6 +1,6 @@ // List all existing artifacts -MATCH (artifact:Artifact:Archive)-[:CONTAINS]->(package:Package)-[:CONTAINS]->(type:Type) +MATCH (artifact:Java:Artifact)-[:CONTAINS]->(package:Java:Package)-[:CONTAINS]->(type:Java:Type) WITH last(split(artifact.fileName, '/')) AS artifactName ,artifact.incomingDependencies AS incomingDependencies ,artifact.outgoingDependencies AS outgoingDependencies diff --git a/cypher/Validation/ValidateJavaArtifactDependencies.cypher b/cypher/Validation/ValidateJavaArtifactDependencies.cypher new file mode 100644 index 000000000..fd32278d5 --- /dev/null +++ b/cypher/Validation/ValidateJavaArtifactDependencies.cypher @@ -0,0 +1,5 @@ +// Check 
if there is at least one Java Artifact dependency. + + MATCH (source:Java:Artifact:Archive)-[dependency:DEPENDS_ON]->(target:Java:Artifact:Archive) +RETURN elementId(source) AS sourceElementId + LIMIT 1 \ No newline at end of file diff --git a/cypher/Validation/ValidateJavaExternalDependencies.cypher b/cypher/Validation/ValidateJavaExternalDependencies.cypher new file mode 100644 index 000000000..4318661e6 --- /dev/null +++ b/cypher/Validation/ValidateJavaExternalDependencies.cypher @@ -0,0 +1,5 @@ +// Check if there is at least one external Java Type dependency. + + MATCH (source:Java:Package)-[:CONTAINS]->(type:Java:Type)-[:DEPENDS_ON]->(externalType:Java:ExternalType) +RETURN elementId(source) AS sourceElementId + LIMIT 1 \ No newline at end of file diff --git a/cypher/Validation/ValidateJavaInternalDependencies.cypher b/cypher/Validation/ValidateJavaInternalDependencies.cypher new file mode 100644 index 000000000..f276fcee2 --- /dev/null +++ b/cypher/Validation/ValidateJavaInternalDependencies.cypher @@ -0,0 +1,5 @@ +// Check if there is at least one Java Artifact containing a Java Package with at least one Java Type. + +MATCH (source:Java:Artifact)-[:CONTAINS]->(package:Java:Package)-[:CONTAINS]->(type:Java:Type) +RETURN elementId(source) AS sourceElementId + LIMIT 1 \ No newline at end of file diff --git a/cypher/Validation/ValidateJavaMethods.cypher b/cypher/Validation/ValidateJavaMethods.cypher new file mode 100644 index 000000000..8d8b7b5bb --- /dev/null +++ b/cypher/Validation/ValidateJavaMethods.cypher @@ -0,0 +1,5 @@ +// Check if there is at least one Java Method, its Type and an Artifact it belongs to. 
+ + MATCH (source:Java:Artifact)-[:CONTAINS]->(type:Java:Type)-[:DECLARES]->(method:Java:Method) +RETURN elementId(source) AS sourceElementId + LIMIT 1 \ No newline at end of file diff --git a/cypher/Validation/ValidateJavaPackageDependencies.cypher b/cypher/Validation/ValidateJavaPackageDependencies.cypher new file mode 100644 index 000000000..e66cf6233 --- /dev/null +++ b/cypher/Validation/ValidateJavaPackageDependencies.cypher @@ -0,0 +1,5 @@ +// Check if there is at least one Java Package dependency. + + MATCH (source:Java:Package)-[dependency:DEPENDS_ON]->(target:Java:Package) +RETURN elementId(source) AS sourceElementId + LIMIT 1 \ No newline at end of file diff --git a/cypher/Validation/ValidateJavaTypes.cypher b/cypher/Validation/ValidateJavaTypes.cypher new file mode 100644 index 000000000..38f08862a --- /dev/null +++ b/cypher/Validation/ValidateJavaTypes.cypher @@ -0,0 +1,5 @@ +// Check if there is at least one Java Artifact containing a Java Package with at least one Java Type. + + MATCH (source:Java:Artifact)-[:CONTAINS]->(package:Java:Package)-[:CONTAINS]->(type:Java:Type) +RETURN elementId(source) AS sourceElementId + LIMIT 1 \ No newline at end of file diff --git a/cypher/Validation/ValidateTypescriptModuleDependencies.cypher b/cypher/Validation/ValidateTypescriptModuleDependencies.cypher new file mode 100644 index 000000000..192334c96 --- /dev/null +++ b/cypher/Validation/ValidateTypescriptModuleDependencies.cypher @@ -0,0 +1,5 @@ +// Check if there is at least one Typescript Module dependency.
+ + MATCH (source:TS:Module)-[dependency:DEPENDS_ON]->(target:TS:Module) +RETURN elementId(source) AS sourceElementId + LIMIT 1 \ No newline at end of file diff --git a/graph-visualization/.nvmrc b/graph-visualization/.nvmrc new file mode 100644 index 000000000..5871e601c --- /dev/null +++ b/graph-visualization/.nvmrc @@ -0,0 +1 @@ +v20.12.1 \ No newline at end of file diff --git a/jupyter/DependenciesGraphJava.ipynb b/jupyter/DependenciesGraphJava.ipynb new file mode 100644 index 000000000..3aa4fcfad --- /dev/null +++ b/jupyter/DependenciesGraphJava.ipynb @@ -0,0 +1,321 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2f0eabc4", + "metadata": {}, + "source": [ + "## Artifact Dependencies\n", + "\n", + "This report includes graph visualization(s) using JavaScript and might not be exportable to some document formats.\n", + "\n", + "### References\n", + "\n", + "- [neovis.js (GitHub)](https://github.com/neo4j-contrib/neovis.js)\n", + "- [vis-network (GitHub)](https://github.com/visjs/vis-network)\n", + "- [vis network documentation](https://visjs.github.io/vis-network/docs/network)\n", + "- [Neo4j Graph Algorithms Jupyter Notebooks (GitHub)](https://github.com/neo4j-graph-analytics/graph-algorithms-notebooks)\n", + "- [Neo4j Graph Data Science Topological Sort](https://neo4j.com/docs/graph-data-science/current/algorithms/alpha/topological-sort)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8678cc2d", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from neo4j import GraphDatabase\n", + "from IPython.core.display import Javascript, HTML\n", + "import json" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "437937a1", + "metadata": {}, + "outputs": [], + "source": [ + "# Please set the environment variable \"NEO4J_INITIAL_PASSWORD\" in your shell \n", + "# before starting jupyter notebook to provide the password for the user \"neo4j\". 
\n", + "# It is not recommended to hardcode the password into jupyter notebook for security reasons.\n", + "\n", + "neo4jUri = \"bolt://localhost:7687\"\n", + "neo4jUser = \"neo4j\"\n", + "neo4jPassword = os.environ.get(\"NEO4J_INITIAL_PASSWORD\")\n", + "\n", + "# Create the database driver to validate the connection\n", + "with GraphDatabase.driver(uri=neo4jUri, auth=(neo4jUser, neo4jPassword)) as driver:\n", + " driver.verify_connectivity()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f8435ee", + "metadata": {}, + "outputs": [], + "source": [ + "def neo4j_server_configuration(password, uri=\"bolt://localhost:7687\", user=\"neo4j\"):\n", + " return {\n", + " \"neo4j\": {\n", + " \"serverUrl\": uri,\n", + " \"serverUser\": user,\n", + " \"serverPassword\": password\n", + " }\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3dd385c7", + "metadata": {}, + "outputs": [], + "source": [ + "def visualization_configuration(node_distance: int = 200):\n", + " return {\n", + " \"visConfig\": {\n", + " \"nodes\": {\n", + " \"shape\": \"hexagon\",\n", + " \"shadow\": False,\n", + " \"font\": {\n", + " \"strokeWidth\": 4,\n", + " \"strokeColor\": \"#F2F2FF\",\n", + " \"size\": 12\n", + " },\n", + " \"size\": 22,\n", + " \"borderWidth\": 2,\n", + " \"widthConstraint\": {\n", + " \"maximum\": 60\n", + " }\n", + " },\n", + " \"edges\": {\n", + " \"arrows\": {\n", + " \"to\": { \n", + " \"enabled\": True,\n", + " \"scaleFactor\": 0.3\n", + " }\n", + " },\n", + " \"scaling\": {\n", + " \"max\": 6\n", + " }\n", + " },\n", + " \"physics\": {\n", + " \"hierarchicalRepulsion\": {\n", + " \"nodeDistance\": node_distance, # 120\n", + " \"centralGravity\": 0.2, # 0.0\n", + " \"springLength\": 100, # 100\n", + " \"springConstant\": 0.02, # 0.01\n", + " \"damping\": 0.09, # 0.09\n", + " \"avoidOverlap\": 0.9 # 0\n", + " },\n", + " \"solver\": \"hierarchicalRepulsion\" # barnesHut\n", + " },\n", + " \"layout\": {\n", + " 
\"hierarchical\": {\n", + " \"enabled\": True,\n", + " \"sortMethod\": \"directed\"\n", + " }\n", + " }\n", + " }\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f15d92e", + "metadata": {}, + "outputs": [], + "source": [ + "cssTemplate = \"\"\"\n", + ".graph-visualization {\n", + " width: 660px;\n", + " height: 660px;\n", + " border: 1px solid lightgray;\n", + "}\n", + "div.vis-tooltip {\n", + " font-size: 6px;\n", + "}\n", + "\"\"\"\n", + "\n", + "# Use JavaScript library neovis.js to render the graph into the HTML above\n", + "javascriptTemplate = \"\"\"\n", + "function draw(NeoVis) {\n", + " configuration.labels[NeoVis.NEOVIS_DEFAULT_CONFIG] = {\n", + " [NeoVis.NEOVIS_ADVANCED_CONFIG]: {\n", + " function: {\n", + " title: NeoVis.objectToTitleHtml // Show all node properties in the tooltip\n", + " }\n", + " }\n", + " }\n", + " configuration.relationships[NeoVis.NEOVIS_DEFAULT_CONFIG] = {\n", + " [NeoVis.NEOVIS_ADVANCED_CONFIG]: {\n", + " function: {\n", + " title: NeoVis.objectToTitleHtml // Show all relationship properties in the tooltip\n", + " }\n", + " }\n", + " }\n", + " configuration.labels.Artifact = {\n", + " [NeoVis.NEOVIS_ADVANCED_CONFIG]: {\n", + " function: {\n", + " // Use \"fileName\" as label. 
Remove leading slash, trailing \".jar\" and version number.\n", + " // TODO Enrich the Graph so that there is a distinct property for the \"cleaned up\" artifact name\n", + " label: (node) => node.properties.fileName.replace('/', '').replace('.jar', '').replace(/-[\\d\\\\.]+/, '')\n", + " }\n", + " }\n", + " }\n", + " console.debug(configuration)\n", + " const neoViz = new NeoVis.default(configuration);\n", + " neoViz.render();\n", + "}\n", + "\n", + "// Use JavaScript library neovis.js to render the graph into the HTML above\n", + "requirejs(['https://unpkg.com/neovis.js@2.1.0'], function(NeoVis){ \n", + " draw(NeoVis);\n", + "}, function (err) {\n", + " throw new Error(\"Failed to load NeoVis:\" + err);\n", + "});\n", + "\"\"\"\n", + "\n", + "htmlTemplate=\"\"\"\n", + "\n", + "\n", + "\n", + " Jupyter Notebook embedded neovis.js visualization\n", + " \n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "id": "112ee31b", + "metadata": {}, + "source": [ + "## Dependencies Hierarchy\n", + "\n", + "The following hierarchical graphs shows dependencies with the most used and shared elements at the bottom and the ones that use the most dependencies on top. The visualization is limited to the first 20 nodes and their direct dependency ordered descending by their layer (\"maxDistanceFromSource\"). \n", + "\n", + "For the whole list of topologically sorted elements including the hierarchical layer see the detailed report `TopologicalSorted....csv`. It is for example helpful to find out in which order Artifacts need to be build/assembled in case of breaking changes." + ] + }, + { + "cell_type": "markdown", + "id": "cb500716", + "metadata": {}, + "source": [ + "### Hierarchical Java Artifact Dependencies\n", + "\n", + "The following Graph shows up to 60 Java Artifact dependencies in hierarchical form sorted by their topology." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "923e8a0a", + "metadata": {}, + "outputs": [], + "source": [ + "def java_artifact_query_configuration():\n", + " query = \"\"\"\n", + " MATCH (artifact:Java:Artifact:Archive)-[dependency:DEPENDS_ON]->(dependent:Java:Artifact:Archive)\n", + " WHERE artifact.maxDistanceFromSource IS NOT NULL\n", + " AND dependent.maxDistanceFromSource > artifact.maxDistanceFromSource\n", + " RETURN artifact, dependency, dependent\n", + " ORDER BY artifact.maxDistanceFromSource DESC\n", + " ,artifact.maxDistanceFromSource ASC\n", + " ,artifact.topologicalSortIndex ASC\n", + " ,dependent.topologicalSortIndex ASC\n", + " LIMIT 20 \n", + " \"\"\"\n", + " \n", + " return {\n", + " \"initialCypher\": query,\n", + " \"labels\": {\n", + " \"Artifact\": {\n", + " \"label\": \"fileName\"\n", + " },\n", + " },\n", + " \"relationships\": {\n", + " \"DEPENDS_ON\": {\n", + " \"value\": \"weight\",\n", + " \"label\": False\n", + " }\n", + " }\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "48cd3f44", + "metadata": {}, + "outputs": [], + "source": [ + "htmlElement = {\"containerId\": \"graph-visualization-java-artifacts\"}\n", + "serverConfiguration = neo4j_server_configuration(uri=neo4jUri, user=neo4jUser,password=neo4jPassword)\n", + "\n", + "# Assemble the neovis.js configuration by joining the different parts of it\n", + "graphVisualizationConfiguration = {**htmlElement, **visualization_configuration(), **serverConfiguration, **java_artifact_query_configuration()}\n", + "\n", + "# Assemble the HTML by including CSS and Javascript\n", + "jsonConfiguration = json.dumps(graphVisualizationConfiguration)\n", + "javascriptContent=\"configuration=\" + jsonConfiguration +\"; \" + javascriptTemplate\n", + "htmlContent = htmlTemplate.format(script=javascriptContent, css=cssTemplate, containerId=htmlElement[\"containerId\"])\n", + "\n", + "# Display the HTML\n", + "display(HTML(htmlContent))" + ] + }, 
+ { + "cell_type": "code", + "execution_count": null, + "id": "35a24daa", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "time.sleep(6)" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "JohT" + } + ], + "code_graph_analysis_pipeline_data_validation": "ValidateJavaArtifactDependencies", + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + }, + "title": "Neo4j Java Code-Structure Graph" + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/jupyter/ArtifactDependencies.ipynb b/jupyter/DependenciesGraphTypescript.ipynb similarity index 81% rename from jupyter/ArtifactDependencies.ipynb rename to jupyter/DependenciesGraphTypescript.ipynb index 3c6578503..3ca0f09a6 100644 --- a/jupyter/ArtifactDependencies.ipynb +++ b/jupyter/DependenciesGraphTypescript.ipynb @@ -210,74 +210,6 @@ "For the whole list of topologically sorted elements including the hierarchical layer see the detailed report `TopologicalSorted....csv`. It is for example helpful to find out in which order Artifacts need to be build/assembled in case of breaking changes." ] }, - { - "cell_type": "markdown", - "id": "cb500716", - "metadata": {}, - "source": [ - "### Hierarchical Java Artifact Dependencies\n", - "\n", - "The following Graph shows up to 60 Java Artifact dependencies in hierarchical form sorted by their topology." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "923e8a0a", - "metadata": {}, - "outputs": [], - "source": [ - "def java_artifact_query_configuration():\n", - " query = \"\"\"\n", - " MATCH (artifact:Java:Artifact:Archive)-[dependency:DEPENDS_ON]->(dependent:Java:Artifact:Archive)\n", - " WHERE artifact.maxDistanceFromSource IS NOT NULL\n", - " AND dependent.maxDistanceFromSource > artifact.maxDistanceFromSource\n", - " RETURN artifact, dependency, dependent\n", - " ORDER BY artifact.maxDistanceFromSource DESC\n", - " ,artifact.maxDistanceFromSource ASC\n", - " ,artifact.topologicalSortIndex ASC\n", - " ,dependent.topologicalSortIndex ASC\n", - " LIMIT 20 \n", - " \"\"\"\n", - " \n", - " return {\n", - " \"initialCypher\": query,\n", - " \"labels\": {\n", - " \"Artifact\": {\n", - " \"label\": \"fileName\"\n", - " },\n", - " },\n", - " \"relationships\": {\n", - " \"DEPENDS_ON\": {\n", - " \"value\": \"weight\",\n", - " \"label\": False\n", - " }\n", - " }\n", - " }" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "48cd3f44", - "metadata": {}, - "outputs": [], - "source": [ - "htmlElement = {\"containerId\": \"graph-visualization-java-artifacts\"}\n", - "serverConfiguration = neo4j_server_configuration(uri=neo4jUri, user=neo4jUser,password=neo4jPassword)\n", - "\n", - "# Assemble the neovis.js configuration by joining the different parts of it\n", - "graphVisualizationConfiguration = {**htmlElement, **visualization_configuration(), **serverConfiguration, **java_artifact_query_configuration()}\n", - "\n", - "# Assemble the HTML by including CSS and Javascript\n", - "jsonConfiguration = json.dumps(graphVisualizationConfiguration)\n", - "javascriptContent=\"configuration=\" + jsonConfiguration +\"; \" + javascriptTemplate\n", - "htmlContent = htmlTemplate.format(script=javascriptContent, css=cssTemplate, containerId=htmlElement[\"containerId\"])\n", - "\n", - "# Display the HTML\n", - "display(HTML(htmlContent))" - ] - }, 
{ "cell_type": "markdown", "id": "7a97adb1", @@ -366,6 +298,7 @@ "name": "JohT" } ], + "code_graph_analysis_pipeline_data_validation": "ValidateTypescriptModuleDependencies", "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", diff --git a/jupyter/ExternalDependencies.ipynb b/jupyter/ExternalDependenciesJava.ipynb similarity index 99% rename from jupyter/ExternalDependencies.ipynb rename to jupyter/ExternalDependenciesJava.ipynb index 7ea81ed19..e68572be1 100644 --- a/jupyter/ExternalDependencies.ipynb +++ b/jupyter/ExternalDependenciesJava.ipynb @@ -1503,6 +1503,7 @@ "name": "JohT" } ], + "code_graph_analysis_pipeline_data_validation": "ValidateJavaExternalDependencies", "celltoolbar": "Tags", "kernelspec": { "display_name": "Python 3 (ipykernel)", diff --git a/jupyter/InternalDependencies.ipynb b/jupyter/InternalDependenciesJava.ipynb similarity index 99% rename from jupyter/InternalDependencies.ipynb rename to jupyter/InternalDependenciesJava.ipynb index 9c0def20b..ddf240d61 100644 --- a/jupyter/InternalDependencies.ipynb +++ b/jupyter/InternalDependenciesJava.ipynb @@ -111,7 +111,7 @@ "List the artifacts this notebook is based on. Different sorting variations help finding artifacts by their features and support larger code bases where the list of all artifacts gets too long.\n", "\n", "Only the top 30 entries are shown. 
The whole table can be found in the following CSV report: \n", - "`List_all_existing_artifacts`" + "`List_all_existing_java_artifacts`" ] }, { @@ -121,7 +121,7 @@ "metadata": {}, "outputs": [], "source": [ - "artifacts = query_cypher_to_data_frame(\"../cypher/List_all_existing_artifacts.cypher\")" + "artifacts = query_cypher_to_data_frame(\"../cypher/List_all_existing_java_artifacts.cypher\")" ] }, { @@ -570,6 +570,7 @@ "name": "JohT" } ], + "code_graph_analysis_pipeline_data_validation": "ValidateJavaInternalDependencies", "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", diff --git a/jupyter/MethodMetrics.ipynb b/jupyter/MethodMetricsJava.ipynb similarity index 99% rename from jupyter/MethodMetrics.ipynb rename to jupyter/MethodMetricsJava.ipynb index 4a215429b..f543149f4 100644 --- a/jupyter/MethodMetrics.ipynb +++ b/jupyter/MethodMetricsJava.ipynb @@ -465,6 +465,7 @@ "name": "JohT" } ], + "code_graph_analysis_pipeline_data_validation": "ValidateJavaMethods", "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", diff --git a/jupyter/NodeEmbeddings.ipynb b/jupyter/NodeEmbeddingsJava.ipynb similarity index 80% rename from jupyter/NodeEmbeddings.ipynb rename to jupyter/NodeEmbeddingsJava.ipynb index 8ea11c07b..b1335be55 100644 --- a/jupyter/NodeEmbeddings.ipynb +++ b/jupyter/NodeEmbeddingsJava.ipynb @@ -37,9 +37,12 @@ "- [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver/current)\n", "- [Tutorial: Applied Graph Embeddings](https://neo4j.com/developer/graph-data-science/applied-graph-embeddings)\n", "- [Visualizing the embeddings in 2D](https://github.com/openai/openai-cookbook/blob/main/examples/Visualizing_embeddings_in_2D.ipynb)\n", - "- [Fast Random Projection](https://neo4j.com/docs/graph-data-science/current/machine-learning/node-embeddings/fastrp)\n", "- [scikit-learn TSNE](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html#sklearn.manifold.TSNE)\n", - "- 
[AttributeError: 'list' object has no attribute 'shape'](https://bobbyhadz.com/blog/python-attributeerror-list-object-has-no-attribute-shape)" + "- [AttributeError: 'list' object has no attribute 'shape'](https://bobbyhadz.com/blog/python-attributeerror-list-object-has-no-attribute-shape)\n", + "- [Fast Random Projection (neo4j)](https://neo4j.com/docs/graph-data-science/current/machine-learning/node-embeddings/fastrp)\n", + "- [HashGNN (neo4j)](https://neo4j.com/docs/graph-data-science/2.6/machine-learning/node-embeddings/hashgnn)\n", + "- [node2vec (neo4j)](https://neo4j.com/docs/graph-data-science/current/machine-learning/node-embeddings/node2vec) computes a vector representation of a node based on second order random walks in the graph. \n", + "- [Complete guide to understanding Node2Vec algorithm](https://towardsdatascience.com/complete-guide-to-understanding-node2vec-algorithm-4e9a35e5d147)" ] }, { @@ -477,144 +480,6 @@ " \"Java Package positioned by their dependency relationships (node2vec node embeddings + t-SNE)\"\n", ")" ] - }, - { - "cell_type": "markdown", - "id": "0b42163d", - "metadata": {}, - "source": [ - "## 2. Typescript Modules" - ] - }, - { - "cell_type": "markdown", - "id": "3b468bae", - "metadata": {}, - "source": [ - "### 2.1 Generate Node Embeddings for Typescript Modules using Fast Random Projection (Fast RP)\n", - "\n", - "See section 1.1 for some background about node embeddings." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1ecc41b1", - "metadata": {}, - "outputs": [], - "source": [ - "typescript_module_embeddings_parameters={\n", - " \"dependencies_projection\": \"typescript-module-embeddings-notebook\",\n", - " \"dependencies_projection_node\": \"Module\",\n", - " \"dependencies_projection_weight_property\": \"lowCouplingElement25PercentWeight\",\n", - " \"dependencies_projection_write_property\": \"embeddingsFastRandomProjection\",\n", - " \"dependencies_projection_embedding_dimension\":\"32\" \n", - "}\n", - "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher\", typescript_module_embeddings_parameters)\n" - ] - }, - { - "cell_type": "markdown", - "id": "ad17607c", - "metadata": {}, - "source": [ - "### 2.2 Dimensionality reduction with t-distributed stochastic neighbor embedding (t-SNE)\n", - "\n", - "See section 1.2 for some background about t-SNE." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "58575e2f", - "metadata": {}, - "outputs": [], - "source": [ - "node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)" - ] - }, - { - "cell_type": "markdown", - "id": "20084589", - "metadata": {}, - "source": [ - "### 2.3 Plot the node embeddings reduced to two dimensions for Typescript" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fdda2869", - "metadata": {}, - "outputs": [], - "source": [ - "plot_2d_node_embeddings(\n", - " node_embeddings_for_visualization, \n", - " \"Typescript Modules positioned by their dependency relationships (FastRP node embeddings + t-SNE)\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "6cac9be7", - "metadata": {}, - "source": [ - "### 2.4 Node Embeddings for Typescript Modules using HashGNN\n", - "\n", - "[HashGNN](https://neo4j.com/docs/graph-data-science/2.6/machine-learning/node-embeddings/hashgnn) resembles Graph 
Neural Networks (GNN) but does not include a model or require training. It combines ideas of GNNs and fast randomized algorithms. For more details see [HashGNN](https://neo4j.com/docs/graph-data-science/2.6/machine-learning/node-embeddings/hashgnn). Here, the latter 3 steps are combined into one for HashGNN." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8fe68eca", - "metadata": {}, - "outputs": [], - "source": [ - "typescript_module_embeddings_parameters={\n", - " \"dependencies_projection\": \"typescript-module-embeddings-notebook\",\n", - " \"dependencies_projection_node\": \"Module\",\n", - " \"dependencies_projection_weight_property\": \"lowCouplingElement25PercentWeight\",\n", - " \"dependencies_projection_write_property\": \"embeddingsHashGNN\",\n", - " \"dependencies_projection_embedding_dimension\":\"64\"\n", - "}\n", - "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_2d_Hash_GNN_Stream.cypher\", typescript_module_embeddings_parameters)\n", - "node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n", - "plot_2d_node_embeddings(\n", - " node_embeddings_for_visualization, \n", - " \"Typescript Modules positioned by their dependency relationships (HashGNN node embeddings + t-SNE)\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "0a7d66f5", - "metadata": {}, - "source": [ - "### 2.5 Node Embeddings for Typescript Modules using node2vec" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ea6c52ca", - "metadata": {}, - "outputs": [], - "source": [ - "typescript_module_embeddings_parameters={\n", - " \"dependencies_projection\": \"typescript-module-embeddings-notebook\",\n", - " \"dependencies_projection_node\": \"Module\",\n", - " \"dependencies_projection_weight_property\": \"lowCouplingElement25PercentWeight\",\n", - " \"dependencies_projection_write_property\": \"embeddingsNode2Vec\",\n", - " 
\"dependencies_projection_embedding_dimension\":\"32\"\n", - "}\n", - "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_3d_Node2Vec_Stream.cypher\", typescript_module_embeddings_parameters)\n", - "node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n", - "plot_2d_node_embeddings(\n", - " node_embeddings_for_visualization, \n", - " \"Typescript Modules positioned by their dependency relationships (node2vec node embeddings + t-SNE)\"\n", - ")" - ] } ], "metadata": { @@ -623,6 +488,7 @@ "name": "JohT" } ], + "code_graph_analysis_pipeline_data_validation": "ValidateJavaPackageDependencies", "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", diff --git a/jupyter/NodeEmbeddingsTypescript.ipynb b/jupyter/NodeEmbeddingsTypescript.ipynb new file mode 100644 index 000000000..7524d6661 --- /dev/null +++ b/jupyter/NodeEmbeddingsTypescript.ipynb @@ -0,0 +1,516 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "2f0eabc4", + "metadata": {}, + "source": [ + "# Node Embeddings\n", + "\n", + "This notebook demonstrates different methods for node embeddings and how to further reduce their dimensionality to be able to visualize them in a 2D plot. \n", + "\n", + "Node embeddings are essentially an array of floating point numbers (length = embedding dimension) that can be used as \"features\" in machine learning. These numbers approximate the relationship and similarity information of each node and can also be seen as a way to encode the topology of the graph.\n", + "\n", + "## Considerations\n", + "\n", + "Due to dimensionality reduction some information gets lost, especially when visualizing node embeddings in two dimensions. Nevertheless, it helps to get an intuition on what node embeddings are and how much of the similarity and neighborhood information is retained. 
The latter can be observed by how well nodes of the same color and therefore same community are placed together and how much bigger nodes with a high centrality score influence them. \n", + "\n", + "If the visualization doesn't show a somehow clear separation between the communities (colors) here are some ideas for tuning: \n", + "- Clean the data, e.g. filter out very few nodes with extremely high degree that aren't actually that important\n", + "- Try directed vs. undirected projections\n", + "- Tune the embedding algorithm, e.g. use a higher dimensionality\n", + "- Tune t-SNE that is used to reduce the node embeddings dimension to two dimensions for visualization. \n", + "\n", + "It could also be the case that the node embeddings are good enough and well suited the way they are despite their visualization for the down stream task like node classification or link prediction. In that case it makes sense to see how the whole pipeline performs before tuning the node embeddings in detail. \n", + "\n", + "## Note about data dependencies\n", + "\n", + "PageRank centrality and Leiden community are also fetched from the Graph and need to be calculated first.\n", + "This makes it easier to see if the embeddings approximate the structural information of the graph in the plot.\n", + "If these properties are missing you will only see black dots all of the same size.\n", + "\n", + "
\n", + "\n", + "### References\n", + "- [jqassistant](https://jqassistant.org)\n", + "- [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver/current)\n", + "- [Tutorial: Applied Graph Embeddings](https://neo4j.com/developer/graph-data-science/applied-graph-embeddings)\n", + "- [Visualizing the embeddings in 2D](https://github.com/openai/openai-cookbook/blob/main/examples/Visualizing_embeddings_in_2D.ipynb)\n", + "- [scikit-learn TSNE](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html#sklearn.manifold.TSNE)\n", + "- [AttributeError: 'list' object has no attribute 'shape'](https://bobbyhadz.com/blog/python-attributeerror-list-object-has-no-attribute-shape)\n", + "- [Fast Random Projection (neo4j)](https://neo4j.com/docs/graph-data-science/current/machine-learning/node-embeddings/fastrp)\n", + "- [HashGNN (neo4j)](https://neo4j.com/docs/graph-data-science/2.6/machine-learning/node-embeddings/hashgnn)\n", + "- [node2vec (neo4j)](https://neo4j.com/docs/graph-data-science/current/machine-learning/node-embeddings/node2vec) computes a vector representation of a node based on second order random walks in the graph. 
\n", + "- [Complete guide to understanding Node2Vec algorithm](https://towardsdatascience.com/complete-guide-to-understanding-node2vec-algorithm-4e9a35e5d147)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4191f259", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from IPython.display import display\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plot\n", + "import typing as typ\n", + "import numpy as np\n", + "from sklearn.manifold import TSNE\n", + "from neo4j import GraphDatabase" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8ef41ff", + "metadata": {}, + "outputs": [], + "source": [ + "import sklearn\n", + "print('The scikit-learn version is {}.'.format(sklearn.__version__))\n", + "print('The pandas version is {}.'.format(pd.__version__))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c5dab37", + "metadata": {}, + "outputs": [], + "source": [ + "# Please set the environment variable \"NEO4J_INITIAL_PASSWORD\" in your shell \n", + "# before starting jupyter notebook to provide the password for the user \"neo4j\". 
\n", + "# It is not recommended to hardcode the password into jupyter notebook for security reasons.\n", + "\n", + "driver = GraphDatabase.driver(uri=\"bolt://localhost:7687\", auth=(\"neo4j\", os.environ.get(\"NEO4J_INITIAL_PASSWORD\")))\n", + "driver.verify_connectivity()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1db254b", + "metadata": {}, + "outputs": [], + "source": [ + "def get_cypher_query_from_file(filename):\n", + " with open(filename) as file:\n", + " return ' '.join(file.readlines())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59310f6f", + "metadata": {}, + "outputs": [], + "source": [ + "def query_cypher_to_data_frame(filename, parameters_: typ.Optional[typ.Dict[str, typ.Any]] = None):\n", + " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename),parameters_=parameters_)\n", + " return pd.DataFrame([r.values() for r in records], columns=keys)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd1d9775", + "metadata": {}, + "outputs": [], + "source": [ + "def query_first_non_empty_cypher_to_data_frame(*filenames : str, parameters: typ.Optional[typ.Dict[str, typ.Any]] = None):\n", + " \"\"\"\n", + " Executes the Cypher queries of the given files and returns the first result that is not empty.\n", + " If all given file names result in empty results, the last (empty) result will be returned.\n", + " By additionally specifying \"limit=\" the \"LIMIT\" keyword will appended to query so that only the first results get returned.\n", + " \"\"\"\n", + " result=pd.DataFrame()\n", + " for filename in filenames:\n", + " result=query_cypher_to_data_frame(filename, parameters)\n", + " if not result.empty:\n", + " print(\"The results have been provided by the query filename: \" + filename)\n", + " return result\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d2e62d6", + "metadata": {}, + "outputs": [], + 
"source": [ + "# TODO option to choose between directed and undirected projection\n", + "\n", + "def create_undirected_projection(parameters: dict) -> bool: \n", + " \"\"\"\n", + " Creates an undirected homogenous in-memory Graph projection for/with Neo4j Graph Data Science Plugin.\n", + " It returns True if there is data available for the given parameter and False otherwise.\n", + " Parameters\n", + " ----------\n", + " dependencies_projection : str\n", + " The name prefix for the in-memory projection for dependencies. Example: \"java-package-embeddings-notebook\"\n", + " dependencies_projection_node : str\n", + " The label of the nodes that will be used for the projection. Example: \"Package\"\n", + " dependencies_projection_weight_property : str\n", + " The name of the node property that contains the dependency weight. Example: \"weight25PercentInterfaces\"\n", + " dependencies_projection_embedding_dimension : str\n", + " The number of the dimensions and therefore size of the resulting array of floating point numbers\n", + " \"\"\"\n", + " \n", + " is_data_missing=query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_0_Check_Projectable.cypher\", parameters).empty\n", + " if is_data_missing: return False\n", + "\n", + " query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_1_Delete_Projection.cypher\", parameters)\n", + " query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_2_Delete_Subgraph.cypher\", parameters)\n", + " # To include the direction of the relationships use the following line to create the projection:\n", + " # query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_3_Create_Projection.cypher\", parameters)\n", + " query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_4_Create_Undirected_Projection.cypher\", parameters)\n", + " query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_5_Create_Subgraph.cypher\", 
parameters)\n", + " return True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1aa86093", + "metadata": {}, + "outputs": [], + "source": [ + "# Feature ideas\n", + "# TODO option to choose between directed and undirected projection\n", + "# TODO option to not read already existing node embeddings to experiment with different (hyper) parameters\n", + "# TODO run a community detection algorithm co-located in here when \"communityId\" is missing\n", + "# TODO run a centrality algorithm co-located in here when \"centrality\" score is missing\n", + "\n", + "def create_node_embeddings(cypher_file_name: str, parameters: dict) -> pd.DataFrame: \n", + " \"\"\"\n", + " Creates an in-memory Graph projection by calling \"create_undirected_projection\", \n", + " runs the cypher Query given as cypher_file_name parameter to calculate and stream the node embeddings\n", + " and returns a DataFrame with the results.\n", + " \n", + " cypher_file_name\n", + " ----------\n", + " Name of the file containing the Cypher query that executes node embeddings procedure.\n", + "\n", + " parameters\n", + " ----------\n", + " dependencies_projection : str\n", + " The name prefix for the in-memory projection for dependencies. Example: \"typescript-module-embeddings-notebook\"\n", + " dependencies_projection_node : str\n", + " The label of the nodes that will be used for the projection. Example: \"Module\"\n", + " dependencies_projection_weight_property : str\n", + " The name of the node property that contains the dependency weight. 
Example: \"lowCouplingElement25PercentWeight\"\n", + " dependencies_projection_embedding_dimension : str\n", + " The number of the dimensions and therefore size of the resulting array of floating point numbers\n", + " \"\"\"\n", + " \n", + " is_data_available=create_undirected_projection(parameters)\n", + " \n", + " if not is_data_available:\n", + " print(\"No projected data for node embeddings calculation available\")\n", + " empty_result = pd.DataFrame(columns=[\"codeUnitName\", 'projectName', 'communityId', 'centrality', 'embedding'])\n", + " return empty_result\n", + "\n", + " existing_embeddings_query_filename=\"../cypher/Node_Embeddings/Node_Embeddings_0a_Query_Calculated.cypher\"\n", + " embeddings = query_first_non_empty_cypher_to_data_frame(existing_embeddings_query_filename, cypher_file_name, parameters=parameters)\n", + " display(embeddings.head()) # Display the first entries of the table\n", + " return embeddings" + ] + }, + { + "cell_type": "markdown", + "id": "f6ec6a9b", + "metadata": {}, + "source": [ + "### Dimensionality reduction with t-distributed stochastic neighbor embedding (t-SNE)\n", + "\n", + "The following function takes the original node embeddings with a higher dimensionality, e.g. 64 floating point numbers, and reduces them into a two dimensional array for visualization. \n", + "\n", + "> It converts similarities between data points to joint probabilities and tries to minimize the Kullback-Leibler divergence between the joint probabilities of the low-dimensional embedding and the high-dimensional data.\n", + "\n", + "(see https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html#sklearn.manifold.TSNE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "720aebd3", + "metadata": {}, + "outputs": [], + "source": [ + "def prepare_node_embeddings_for_2d_visualization(embeddings: pd.DataFrame) -> pd.DataFrame:\n", + " \"\"\"\n", + " Reduces the dimensionality of the node embeddings (e.g. 
64 floating point numbers in an array)\n", + " to two dimensions for 2D visualization.\n", + " see https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html#sklearn.manifold.TSNE\n", + " \"\"\"\n", + "\n", + " if embeddings.empty: \n", + " print(\"No projected data for node embeddings dimensionality reduction available\")\n", + " return embeddings\n", + " \n", + " # Calling the fit_transform method just with a list doesn't seem to work (anymore?). \n", + " # It leads to an error with the following message: 'list' object has no attribute 'shape'\n", + " # This can be solved by converting the list to a numpy array using np.array(..).\n", + " # See https://bobbyhadz.com/blog/python-attributeerror-list-object-has-no-attribute-shape\n", + " embeddings_as_numpy_array = np.array(embeddings.embedding.to_list())\n", + "\n", + " # Use t-distributed stochastic neighbor embedding (t-SNE) to reduce the dimensionality \n", + " # of the previously calculated node embeddings to 2 dimensions for visualization\n", + " t_distributed_stochastic_neighbor_embedding = TSNE(n_components=2, verbose=1, random_state=50)\n", + " two_dimension_node_embeddings = t_distributed_stochastic_neighbor_embedding.fit_transform(embeddings_as_numpy_array)\n", + " display(two_dimension_node_embeddings.shape) # Display the shape of the t-SNE result\n", + "\n", + " # Create a new DataFrame with the results of the 2 dimensional node embeddings\n", + " # and the code unit and artifact name of the query above as preparation for the plot\n", + " node_embeddings_for_visualization = pd.DataFrame(data = {\n", + " \"codeUnit\": embeddings.codeUnitName,\n", + " \"artifact\": embeddings.projectName,\n", + " \"communityId\": embeddings.communityId,\n", + " \"centrality\": embeddings.centrality,\n", + " \"x\": [value[0] for value in two_dimension_node_embeddings],\n", + " \"y\": [value[1] for value in two_dimension_node_embeddings]\n", + " })\n", + " display(node_embeddings_for_visualization.head()) 
# Display the first line of the results\n", + " return node_embeddings_for_visualization\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d937e26e", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_2d_node_embeddings(node_embeddings_for_visualization: pd.DataFrame, title: str):\n", + " if node_embeddings_for_visualization.empty:\n", + " print(\"No projected data to plot available\")\n", + " return\n", + "\n", + " plot.scatter(\n", + " x=node_embeddings_for_visualization.x,\n", + " y=node_embeddings_for_visualization.y,\n", + " s=node_embeddings_for_visualization.centrality * 300,\n", + " c=node_embeddings_for_visualization.communityId,\n", + " cmap=main_color_map,\n", + " )\n", + " plot.title(title)\n", + " plot.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da9e8edb", + "metadata": {}, + "outputs": [], + "source": [ + "#The following cell uses the built-in %html \"magic\" to override the CSS style for tables to a much smaller size.\n", + "#This is especially needed for PDF export of tables with multiple columns." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9deaabce", + "metadata": {}, + "outputs": [], + "source": [ + "%%html\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2496caf", + "metadata": {}, + "outputs": [], + "source": [ + "# Main Colormap\n", + "main_color_map = 'nipy_spectral'" + ] + }, + { + "cell_type": "markdown", + "id": "0b42163d", + "metadata": {}, + "source": [ + "## 1. Typescript Modules" + ] + }, + { + "cell_type": "markdown", + "id": "3b468bae", + "metadata": {}, + "source": [ + "### 1.1 Generate Node Embeddings for Typescript Modules using Fast Random Projection (Fast RP)\n", + "\n", + "[Fast Random Projection](https://neo4j.com/docs/graph-data-science/current/machine-learning/node-embeddings/fastrp) is used to reduce the dimensionality of the node feature space while preserving most of the distance information. 
Nodes with similar neighborhood result in node embedding with similar vectors." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ecc41b1", + "metadata": {}, + "outputs": [], + "source": [ + "typescript_module_embeddings_parameters={\n", + " \"dependencies_projection\": \"typescript-module-embeddings-notebook\",\n", + " \"dependencies_projection_node\": \"Module\",\n", + " \"dependencies_projection_weight_property\": \"lowCouplingElement25PercentWeight\",\n", + " \"dependencies_projection_write_property\": \"embeddingsFastRandomProjection\",\n", + " \"dependencies_projection_embedding_dimension\":\"32\" \n", + "}\n", + "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher\", typescript_module_embeddings_parameters)\n" + ] + }, + { + "cell_type": "markdown", + "id": "ad17607c", + "metadata": {}, + "source": [ + "### 1.2 Dimensionality reduction with t-distributed stochastic neighbor embedding (t-SNE)\n", + "\n", + "This step takes the original node embeddings with a higher dimensionality, e.g. 64 floating point numbers, and reduces them into a two dimensional array for visualization. For more details look up the function declaration for \"prepare_node_embeddings_for_2d_visualization\"." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58575e2f", + "metadata": {}, + "outputs": [], + "source": [ + "node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)" + ] + }, + { + "cell_type": "markdown", + "id": "20084589", + "metadata": {}, + "source": [ + "### 1.3 Plot the node embeddings reduced to two dimensions for Typescript" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fdda2869", + "metadata": {}, + "outputs": [], + "source": [ + "plot_2d_node_embeddings(\n", + " node_embeddings_for_visualization, \n", + " \"Typescript Modules positioned by their dependency relationships (FastRP node embeddings + t-SNE)\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "6cac9be7", + "metadata": {}, + "source": [ + "### 1.4 Node Embeddings for Typescript Modules using HashGNN\n", + "\n", + "[HashGNN](https://neo4j.com/docs/graph-data-science/2.6/machine-learning/node-embeddings/hashgnn) resembles Graph Neural Networks (GNN) but does not include a model or require training. It combines ideas of GNNs and fast randomized algorithms. For more details see [HashGNN](https://neo4j.com/docs/graph-data-science/2.6/machine-learning/node-embeddings/hashgnn). Here, the latter 3 steps are combined into one for HashGNN." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8fe68eca", + "metadata": {}, + "outputs": [], + "source": [ + "typescript_module_embeddings_parameters={\n", + " \"dependencies_projection\": \"typescript-module-embeddings-notebook\",\n", + " \"dependencies_projection_node\": \"Module\",\n", + " \"dependencies_projection_weight_property\": \"lowCouplingElement25PercentWeight\",\n", + " \"dependencies_projection_write_property\": \"embeddingsHashGNN\",\n", + " \"dependencies_projection_embedding_dimension\":\"64\"\n", + "}\n", + "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_2d_Hash_GNN_Stream.cypher\", typescript_module_embeddings_parameters)\n", + "node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n", + "plot_2d_node_embeddings(\n", + " node_embeddings_for_visualization, \n", + " \"Typescript Modules positioned by their dependency relationships (HashGNN node embeddings + t-SNE)\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "0a7d66f5", + "metadata": {}, + "source": [ + "### 1.5 Node Embeddings for Typescript Modules using node2vec\n", + "\n", + "[node2vec](https://neo4j.com/docs/graph-data-science/current/machine-learning/node-embeddings/node2vec) computes a vector representation of a node based on second order random walks in the graph. \n", + "The [node2vec](https://towardsdatascience.com/complete-guide-to-understanding-node2vec-algorithm-4e9a35e5d147) algorithm is a transductive node embedding algorithm, meaning that it needs the whole graph to be available to learn the node embeddings." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea6c52ca", + "metadata": {}, + "outputs": [], + "source": [ + "typescript_module_embeddings_parameters={\n", + " \"dependencies_projection\": \"typescript-module-embeddings-notebook\",\n", + " \"dependencies_projection_node\": \"Module\",\n", + " \"dependencies_projection_weight_property\": \"lowCouplingElement25PercentWeight\",\n", + " \"dependencies_projection_write_property\": \"embeddingsNode2Vec\",\n", + " \"dependencies_projection_embedding_dimension\":\"32\"\n", + "}\n", + "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_3d_Node2Vec_Stream.cypher\", typescript_module_embeddings_parameters)\n", + "node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n", + "plot_2d_node_embeddings(\n", + " node_embeddings_for_visualization, \n", + " \"Typescript Modules positioned by their dependency relationships (node2vec node embeddings + t-SNE)\"\n", + ")" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "JohT" + } + ], + "code_graph_analysis_pipeline_data_validation": "ValidateTypescriptModuleDependencies", + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + }, + "title": "Object Oriented Design Quality Metrics for Java with Neo4j" + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/jupyter/ObjectOrientedDesignMetrics.ipynb b/jupyter/ObjectOrientedDesignMetricsJava.ipynb similarity index 86% rename from jupyter/ObjectOrientedDesignMetrics.ipynb rename to jupyter/ObjectOrientedDesignMetricsJava.ipynb index 33e03791e..a3e32b78b 100644 --- a/jupyter/ObjectOrientedDesignMetrics.ipynb +++ 
b/jupyter/ObjectOrientedDesignMetricsJava.ipynb @@ -15,6 +15,7 @@ "- [jqassistant](https://jqassistant.org)\n", "- [notebook walks through examples for integrating various packages with Neo4j](https://nicolewhite.github.io/neo4j-jupyter/hello-world.html)\n", "- [OO Design Quality Metrics](https://api.semanticscholar.org/CorpusID:18246616)\n", + "- [A Validation of Martin's Metric](https://www.researchgate.net/publication/31598248_A_Validation_of_Martin's_Metric)\n", "- [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver/current)" ] }, @@ -184,27 +185,6 @@ " \"../cypher/Metrics/Set_Incoming_Java_Package_Dependencies_Including_Subpackages.cypher\", limit=20)" ] }, - { - "cell_type": "markdown", - "id": "b27f225d", - "metadata": {}, - "source": [ - "#### Table 1c\n", - "- Show the top 20 Typescript modules with the most incoming dependencies\n", - "- Set the property \"incomingDependencies\" on Module nodes if not already done." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "98974b46", - "metadata": {}, - "outputs": [], - "source": [ - "query_first_non_empty_cypher_to_data_frame(\"../cypher/Metrics/Get_Incoming_Typescript_Module_Dependencies.cypher\",\n", - " \"../cypher/Metrics/Set_Incoming_Typescript_Module_Dependencies.cypher\", limit=20)" - ] - }, { "attachments": {}, "cell_type": "markdown", @@ -265,28 +245,6 @@ " \"../cypher/Metrics/Set_Outgoing_Java_Package_Dependencies_Including_Subpackages.cypher\", limit=20)" ] }, - { - "cell_type": "markdown", - "id": "747cb31d", - "metadata": {}, - "source": [ - "#### Table 2c\n", - "\n", - "- Show the top 20 Typescript modules with the most outgoing dependencies\n", - "- Set the \"outgoingDependencies\" properties on Module nodes if not already done" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1be12a2e", - "metadata": {}, - "outputs": [], - "source": [ - 
"query_first_non_empty_cypher_to_data_frame(\"../cypher/Metrics/Get_Outgoing_Typescript_Module_Dependencies.cypher\",\n", - " \"../cypher/Metrics/Set_Outgoing_Typescript_Module_Dependencies.cypher\", limit=20)" - ] - }, { "attachments": {}, "cell_type": "markdown", @@ -350,28 +308,6 @@ " \"../cypher/Metrics/Calculate_and_set_Instability_for_Java_Including_Subpackages.cypher\", limit=20)" ] }, - { - "cell_type": "markdown", - "id": "17c081d0", - "metadata": {}, - "source": [ - "#### Table 3c\n", - "\n", - "- Show the top 20 Typescript modules with the lowest *Instability*\n", - "- Set the property \"instability\" on Module nodes if not already done" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "77862c9e", - "metadata": {}, - "outputs": [], - "source": [ - "query_first_non_empty_cypher_to_data_frame(\"../cypher/Metrics/Get_Instability_for_Typescript.cypher\",\n", - " \"../cypher/Metrics/Calculate_and_set_Instability_for_Typescript.cypher\", limit=20)" - ] - }, { "attachments": {}, "cell_type": "markdown", @@ -475,28 +411,6 @@ "abstractnessIncludingSubpackages.sort_values(by=['abstractness', 'maxSubpackageDepth', 'numberTypes'], ascending=[False, False, False]).head(30)" ] }, - { - "cell_type": "markdown", - "id": "6f95770e", - "metadata": {}, - "source": [ - "#### Table 4e\n", - "\n", - "- Show the top 30 Typescript modules with the lowest *Abstractness*\n", - "- Set the property \"abstractness\" on Module nodes if not already done." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "65c36080", - "metadata": {}, - "outputs": [], - "source": [ - "query_first_non_empty_cypher_to_data_frame(\"../cypher/Metrics/Get_Abstractness_for_Typescript.cypher\",\n", - " \"../cypher/Metrics/Calculate_and_set_Abstractness_for_Typescript.cypher\", limit=20)" - ] - }, { "attachments": {}, "cell_type": "markdown", @@ -546,27 +460,6 @@ "instabilityPerAbstractnessIncludingSubpackages.head(30)" ] }, - { - "cell_type": "markdown", - "id": "c4fdbb1d", - "metadata": {}, - "source": [ - "#### Table 5c\n", - "\n", - "- Show the top 30 Typescript modules with the highest distance from the \"main sequence\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0a1d2d24", - "metadata": {}, - "outputs": [], - "source": [ - "instabilityPerAbstractnessTypescript = query_cypher_to_data_frame(\"../cypher/Metrics/Calculate_distance_between_abstractness_and_instability_for_Typescript.cypher\")\n", - "instabilityPerAbstractnessTypescript.head(30)" - ] - }, { "attachments": {}, "cell_type": "markdown", @@ -706,7 +599,7 @@ "id": "e9a98a47", "metadata": {}, "source": [ - "#### Figure 1a - Packages without their sub-packages" + "#### Figure 5a - Packages without their sub-packages" ] }, { @@ -725,7 +618,7 @@ "id": "a0f207bb", "metadata": {}, "source": [ - "#### Figure 1b - Packages including their sub-packages" + "#### Figure 5b - Packages including their sub-packages" ] }, { @@ -738,25 +631,6 @@ "if not instabilityPerAbstractnessIncludingSubpackages.empty:\n", " plot_instability_per_abstractness_with_main_sequence(instabilityPerAbstractnessIncludingSubpackages, 'Packages including their sub-packages')" ] - }, - { - "cell_type": "markdown", - "id": "c7499522", - "metadata": {}, - "source": [ - "#### Figure 1c - Typescript Modules" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "02763566", - "metadata": {}, - "outputs": [], - "source": [ - "if not 
instabilityPerAbstractnessTypescript.empty:\n", - " plot_instability_per_abstractness_with_main_sequence(instabilityPerAbstractnessTypescript, 'Typescript modules')" - ] } ], "metadata": { @@ -765,6 +639,7 @@ "name": "JohT" } ], + "code_graph_analysis_pipeline_data_validation": "ValidateJavaPackageDependencies", "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", diff --git a/jupyter/ObjectOrientedDesignMetricsTypescript.ipynb b/jupyter/ObjectOrientedDesignMetricsTypescript.ipynb new file mode 100644 index 000000000..e6d93acb5 --- /dev/null +++ b/jupyter/ObjectOrientedDesignMetricsTypescript.ipynb @@ -0,0 +1,499 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "2f0eabc4", + "metadata": {}, + "source": [ + "# Object Oriented Design Quality Metrics\n", + "
\n", + "\n", + "### References\n", + "- [Analyze java package metrics in a graph database](https://joht.github.io/johtizen/data/2023/04/21/java-package-metrics-analysis.html)\n", + "- [Calculate metrics](https://101.jqassistant.org/calculate-metrics/index.html)\n", + "- [jqassistant](https://jqassistant.org)\n", + "- [notebook walks through examples for integrating various packages with Neo4j](https://nicolewhite.github.io/neo4j-jupyter/hello-world.html)\n", + "- [OO Design Quality Metrics](https://api.semanticscholar.org/CorpusID:18246616)\n", + "- [A Validation of Martin's Metric](https://www.researchgate.net/publication/31598248_A_Validation_of_Martin's_Metric)\n", + "- [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver/current)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4191f259", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plot\n", + "from neo4j import GraphDatabase" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c5dab37", + "metadata": {}, + "outputs": [], + "source": [ + "# Please set the environment variable \"NEO4J_INITIAL_PASSWORD\" in your shell \n", + "# before starting jupyter notebook to provide the password for the user \"neo4j\". 
\n", + "# It is not recommended to hardcode the password into jupyter notebook for security reasons.\n", + "\n", + "driver = GraphDatabase.driver(uri=\"bolt://localhost:7687\", auth=(\"neo4j\", os.environ.get(\"NEO4J_INITIAL_PASSWORD\")))\n", + "driver.verify_connectivity()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1db254b", + "metadata": {}, + "outputs": [], + "source": [ + "def get_cypher_query_from_file(cypher_file_name: str):\n", + " with open(cypher_file_name) as file:\n", + " return ' '.join(file.readlines())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59310f6f", + "metadata": {}, + "outputs": [], + "source": [ + "def query_cypher_to_data_frame(filename : str, limit: int = 10_000):\n", + " cypher_query_template = \"{query}\\nLIMIT {row_limit}\"\n", + " cypher_query = get_cypher_query_from_file(filename)\n", + " cypher_query = cypher_query_template.format(query = cypher_query, row_limit = limit)\n", + " records, summary, keys = driver.execute_query(cypher_query)\n", + " return pd.DataFrame([r.values() for r in records], columns=keys)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "013395f1", + "metadata": {}, + "outputs": [], + "source": [ + "def query_first_non_empty_cypher_to_data_frame(*filenames : str, limit: int = 10_000):\n", + " \"\"\"\n", + " Executes the Cypher queries of the given files and returns the first result that is not empty.\n", + " If all given file names result in empty results, the last (empty) result will be returned.\n", + " By additionally specifying \"limit=\" the \"LIMIT\" keyword will appended to query so that only the first results get returned.\n", + " \"\"\" \n", + " result=pd.DataFrame()\n", + " for filename in filenames:\n", + " result=query_cypher_to_data_frame(filename, limit)\n", + " if not result.empty:\n", + " return result\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da9e8edb", + 
"metadata": {}, + "outputs": [], + "source": [ + "#The following cell uses the built-in %html \"magic\" to override the CSS style for tables to a much smaller size.\n", + "#This is especially needed for PDF export of tables with multiple columns." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9deaabce", + "metadata": {}, + "outputs": [], + "source": [ + "%%html\n", + "" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "bbb51c78", + "metadata": {}, + "source": [ + "## Incoming Dependencies\n", + "\n", + "Incoming dependencies are also denoted as \"Fan-in\", \"Afferent Coupling\" or \"in-degree\".\n", + "These are the ones that use the listed package. \n", + " \n", + "If these packages get changed, the incoming dependencies might be affected by the change. The more incoming dependencies, the harder it gets to change the code without the need to adapt the dependent code (“rigid code”). Even worse, it might affect the behavior of the dependent code in an unwanted way (“fragile code”).\n", + "\n", + "Since Java Packages are organized hierarchically, incoming dependencies can be counted for every package in isolation or by including all of its sub-packages. The latter one is done without top level packages like for example \"org\" or \"org.company\" by assuring that only packages are considered that have other packages or types in the same hierarchy level (\"siblings\")." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98974b46", + "metadata": {}, + "outputs": [], + "source": [ + "query_first_non_empty_cypher_to_data_frame(\"../cypher/Metrics/Get_Incoming_Typescript_Module_Dependencies.cypher\",\n", + " \"../cypher/Metrics/Set_Incoming_Typescript_Module_Dependencies.cypher\", limit=20)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "82e9f74c", + "metadata": {}, + "source": [ + "## Outgoing Dependencies\n", + "\n", + "Outgoing dependencies are also denoted as \"Fan-out\", \"Efferent Coupling\" or \"out-degree\".\n", + "These are the ones that are used by the listed package. \n", + "\n", + "Code from other packages and libraries you’re depending on (outgoing) might change over time. The more outgoing changes, the more likely and frequently code changes are needed. This involves time and effort which can be reduced by automation of tests and version updates. Automated tests are crucial to reveal updates, that change the behavior of the code unexpectedly (“fragile code”). As soon as more effort is required, keeping up becomes difficult (“rigid code”). Not being able to use a newer version might not only restrict features, it can get problematic if there are security issues. This might force you to take “fast but ugly” solutions into account which further increases technical debt.\n", + "\n", + "Since Java Packages are organized hierarchically, outgoing dependencies can be counted for every package in isolation or by including all of its sub-packages. The latter one is done without top level packages like for example \"org\" or \"org.company\" by assuring that only packages are considered that have other packages or types in the same hierarchy level (\"siblings\")." 
+ ] + }, + { + "cell_type": "markdown", + "id": "747cb31d", + "metadata": {}, + "source": [ + "#### Table 2a\n", + "\n", + "- Show the top 20 Typescript modules with the most outgoing dependencies\n", + "- Set the \"outgoingDependencies\" properties on Module nodes if not already done" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1be12a2e", + "metadata": {}, + "outputs": [], + "source": [ + "query_first_non_empty_cypher_to_data_frame(\"../cypher/Metrics/Get_Outgoing_Typescript_Module_Dependencies.cypher\",\n", + " \"../cypher/Metrics/Set_Outgoing_Typescript_Module_Dependencies.cypher\", limit=20)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "22f8959e", + "metadata": {}, + "source": [ + "## Instability\n", + "\n", + "$$ Instability = \\frac{Outgoing\\:Dependencies}{Outgoing\\:Dependencies + Incoming\\:Dependencies} $$\n", + "\n", + "*Instability* is expressed as the ratio of the number of outgoing dependencies of a module (i.e., the number of packages that depend on it) to the total number of dependencies (i.e., the sum of incoming and outgoing dependencies).\n", + "\n", + "Small values near zero indicate low *Instability*. With no outgoing but some incoming dependencies the Instability is zero which is denoted as maximally stable. Such code units are more rigid and difficult to change without impacting other parts of the system. If they are changed less because of that, they are considered stable.\n", + "\n", + "Conversely, high values approaching one indicate high *Instability*. With some outgoing dependencies but no incoming ones the *Instability* is denoted as maximally unstable. Such code units are easier to change without affecting other modules, making them more flexible and less prone to cascading changes throughout the system. 
If they are changed more often because of that, they are considered unstable.\n", + "\n", + "Since Java Packages are organized hierarchically, *Instability* can be calculated for every package in isolation or by including all of its sub-packages. " + ] + }, + { + "cell_type": "markdown", + "id": "17c081d0", + "metadata": {}, + "source": [ + "#### Table 3a\n", + "\n", + "- Show the top 20 Typescript modules with the lowest *Instability*\n", + "- Set the property \"instability\" on Module nodes if not already done" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77862c9e", + "metadata": {}, + "outputs": [], + "source": [ + "query_first_non_empty_cypher_to_data_frame(\"../cypher/Metrics/Get_Instability_for_Typescript.cypher\",\n", + " \"../cypher/Metrics/Calculate_and_set_Instability_for_Typescript.cypher\", limit=20)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "f9eb4c6f", + "metadata": {}, + "source": [ + "## Abstractness\n", + "\n", + "$$ Abstractness = \\frac{abstract\\:classes\\:in\\:category}{total\\:number\\:of\\:classes\\:in\\:category} $$\n", + "\n", + "Package *Abstractness* is expressed as the ratio of the number of abstract classes and interfaces to the total number of classes of a package.\n", + "\n", + "Zero *Abstractness* means that there are no abstract types or interfaces in the package. On the other hand, a value of one means that there are only abstract types.\n", + "\n", + "Since Java Packages are organized hierarchically, *Abstractness* can be calculated for every package in isolation or by including all of its sub-packages. " + ] + }, + { + "cell_type": "markdown", + "id": "6f95770e", + "metadata": {}, + "source": [ + "#### Table 4a\n", + "\n", + "- Show the top 30 Typescript modules with the lowest *Abstractness*\n", + "- Set the property \"abstractness\" on Module nodes if not already done." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65c36080", + "metadata": {}, + "outputs": [], + "source": [ + "query_first_non_empty_cypher_to_data_frame(\"../cypher/Metrics/Get_Abstractness_for_Typescript.cypher\",\n", + " \"../cypher/Metrics/Calculate_and_set_Abstractness_for_Typescript.cypher\", limit=20)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "e10154bb", + "metadata": {}, + "source": [ + "## Distance from the main sequence\n", + "\n", + "The *main sequence* is an imaginary line that represents a good compromise between *Abstractness* and *Instability*. A high distance to this line may indicate problems. For example, very *stable* (rigid) code with low abstractness is hard to change.\n", + "\n", + "Read more details on that in [OO Design Quality Metrics](https://api.semanticscholar.org/CorpusID:18246616) and [Calculate metrics](https://101.jqassistant.org/calculate-metrics/index.html)." + ] + }, + { + "cell_type": "markdown", + "id": "c4fdbb1d", + "metadata": {}, + "source": [ + "#### Table 5a\n", + "\n", + "- Show the top 30 Typescript modules with the highest distance from the \"main sequence\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a1d2d24", + "metadata": {}, + "outputs": [], + "source": [ + "instabilityPerAbstractnessTypescript = query_cypher_to_data_frame(\"../cypher/Metrics/Calculate_distance_between_abstractness_and_instability_for_Typescript.cypher\")\n", + "instabilityPerAbstractnessTypescript.head(30)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "149fc732", + "metadata": {}, + "source": [ + "### *Abstractness* vs. *Instability* Plot with \"Main Sequence\" line as reference\n", + "\n", + "- Plot *Abstractness* vs. 
*Instability* of all packages\n", + "- Draw the \"main sequence\" as dashed green diagonal line \n", + "- Scale the packages by the number of types they contain\n", + "- Color the packages by their distance to the \"main sequence\" (blue=near, red=far)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1fbbad97", + "metadata": {}, + "outputs": [], + "source": [ + "# Function that returns the number of past (index smaller than given index) rows \n", + "# with the same value in columnName1 and columnName2\n", + "# If there was a row with the same columnName1 and columnName2 values\n", + "# def countPastEntriesWithSameValues(dataFrame, index, columnName1, columnName2):\n", + "# columnValue1 = dataFrame[columnName1][index]\n", + "# columnValue2 = dataFrame[columnName2][index]\n", + "# return len(dataFrame[\n", + "# (dataFrame.index.isin(range(0, index + 1))) & \n", + "# (dataFrame[columnName1]==columnValue1) & \n", + "# (dataFrame[columnName2]==columnValue2)\n", + "# ]) - 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "48edf0ac", + "metadata": {}, + "outputs": [], + "source": [ + "def annotate_plot(data_frame: pd.DataFrame, index: int):\n", + " \"\"\"\n", + " Annotates the data points identified by the \"index\" in the plot of the \"data_frame\" \n", + " \"\"\"\n", + " x_position = data_frame.abstractness[index].item()\n", + " y_position = data_frame.instability[index].item()\n", + " artifact_name = data_frame.artifactName[index].item()\n", + " package_name = data_frame.name[index].item()\n", + "\n", + " label_box=dict(boxstyle=\"round4,pad=0.5\", fc=\"w\", alpha=0.8)\n", + " plot.annotate(artifact_name + '\\n' + package_name\n", + " ,xy=(x_position, y_position)\n", + " ,xycoords='data'\n", + " ,xytext=(20, 0)\n", + " ,textcoords='offset points'\n", + " ,size=6\n", + " ,bbox=label_box\n", + " ,arrowprops=dict(arrowstyle=\"-|>\", mutation_scale=10, color=\"black\")\n", + " )\n", + "\n", + "def index_of_sorted(data_frame: 
pd.DataFrame, highest: list[str] = []):\n", + " \"\"\"\n", + " Sorts the \"data_frame\" by columns 'abstractness','instability','elementsCount', 'artifactName'\n", + " and returns the index of the first row.\n", + " Columns that are contained in the list of strings parameter \"highest\" will be sorted descending.\n", + " \"\"\"\n", + " by = ['abstractness','instability','elementsCount','artifactName']\n", + " ascending = [('abstractness' not in highest), ('instability' not in highest), False, True]\n", + " return data_frame.sort_values(by=by, ascending=ascending).head(1).index\n", + "\n", + "def index_of_highest_property(data_frame: pd.DataFrame, highest: str):\n", + " \"\"\"\n", + " Sorts the \"data_frame\" by the property given in input parameter \"highest\" descending\n", + " and returns the index of the first row.\n", + " \"\"\"\n", + " by = [highest,'artifactName']\n", + " ascending = [False, True]\n", + " return data_frame.sort_values(by=by, ascending=ascending).head(1).index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd8489ca", + "metadata": {}, + "outputs": [], + "source": [ + "from matplotlib.colors import LinearSegmentedColormap\n", + "\n", + "def plot_instability_per_abstractness_with_main_sequence(data_frame: pd.DataFrame, subtitle: str = ''):\n", + " \"\"\"\n", + " Prints the so called \"Main Sequence\" as green dotted diagonal line as a reference.\n", + " On top of that every Java Package is plotted as a data point:\n", + " - The X-axis is the \"Abstractness\" where 0 = all classes to 1 = all interfaces and abstract classes\n", + " - The Y-axis is the \"Instability\" where 0 = stable (many incoming dependencies) and 1 instable (no incoming dependencies)\n", + " - The size reflects the number of contained Java Types\n", + " - The color reflects the distance to the \"Main Sequence\"\n", + " \"\"\"\n", + " \n", + " colormap=LinearSegmentedColormap.from_list('rg',[\"green\", \"gold\", \"orangered\", \"red\"], N=256) \n", + " 
marker_scales_bounded=data_frame.elementsCount.clip(lower=2, upper=300) * 0.7\n", + "\n", + " plot.scatter(\n", + " data_frame.abstractness, # x axis shows abstractness\n", + " data_frame.instability, # y axis shows instability\n", + " s=marker_scales_bounded, # scale depends on number of contained types\n", + " c=data_frame.distance, # color depends on distance to the main sequence\n", + " cmap=colormap, #'bwr' # colormap that prints high values in red and lower values in blue\n", + " alpha=0.5,\n", + " )\n", + " # green \"main sequence\" line\n", + " plot.plot([0,1], [1,0], c='lightgreen', linestyle='dashed') \n", + "\n", + " # Annotate largest package\n", + " annotate_plot(data_frame, index_of_highest_property(data_frame, highest='elementsCount'))\n", + " # Annotate largest package with the highest abstractness and instability\n", + " annotate_plot(data_frame, index_of_sorted(data_frame, highest=['abstractness','instability']))\n", + " # Annotate largest package with the lowest abstractness and highest instability\n", + " annotate_plot(data_frame, index_of_sorted(data_frame, highest=['instability']))\n", + " # Annotate largest package with the lowest abstractness and lowest instability\n", + " annotate_plot(data_frame, index_of_sorted(data_frame, highest=[]))\n", + " # Annotate largest package with the highest abstractness and lowest instability\n", + " annotate_plot(data_frame, index_of_sorted(data_frame, highest=['abstractness']))\n", + " # Annotate largest packages with the highest abstractness and instability near 0.5% \n", + " annotation_index = index_of_sorted(highest=['abstractness', 'instability'], data_frame=data_frame.query('abstractness <= 0.4 & instability <= 0.4'))\n", + " annotate_plot(data_frame, annotation_index)\n", + "\n", + " plot.title('Abstractness vs. 
Instability (\"Main Sequence\")\\n' + subtitle)\n", + " plot.xlabel('Abstractness')\n", + " plot.ylabel('Instability')\n", + " plot.show()\n" + ] + }, + { + "cell_type": "markdown", + "id": "c7499522", + "metadata": {}, + "source": [ + "#### Figure 5a - Typescript Modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02763566", + "metadata": {}, + "outputs": [], + "source": [ + "if not instabilityPerAbstractnessTypescript.empty:\n", + " plot_instability_per_abstractness_with_main_sequence(instabilityPerAbstractnessTypescript, 'Typescript modules')" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "JohT" + } + ], + "code_graph_analysis_pipeline_data_validation": "ValidateTypescriptModuleDependencies", + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + }, + "title": "Object Oriented Design Quality Metrics for Java with Neo4j" + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/jupyter/Overview.ipynb b/jupyter/OverviewJava.ipynb similarity index 100% rename from jupyter/Overview.ipynb rename to jupyter/OverviewJava.ipynb diff --git a/jupyter/VisibilityMetrics.ipynb b/jupyter/VisibilityMetricsJava.ipynb similarity index 99% rename from jupyter/VisibilityMetrics.ipynb rename to jupyter/VisibilityMetricsJava.ipynb index d9a1e717d..9a1d1235d 100644 --- a/jupyter/VisibilityMetrics.ipynb +++ b/jupyter/VisibilityMetricsJava.ipynb @@ -418,6 +418,7 @@ "name": "JohT" } ], + "code_graph_analysis_pipeline_data_validation": "ValidateJavaTypes", "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", diff --git a/markdown-lint-check-config.json b/markdown-lint-check-config.json new file mode 100644 index 
000000000..e56bbcac3 --- /dev/null +++ b/markdown-lint-check-config.json @@ -0,0 +1,8 @@ +{ + "ignorePatterns": [ + { + "pattern": "^http://localhost" + } + ], + "aliveStatusCodes": [200, 202, 206] +} \ No newline at end of file diff --git a/scripts/analysis/analyze.sh b/scripts/analysis/analyze.sh index c0d1d5935..972975d7c 100755 --- a/scripts/analysis/analyze.sh +++ b/scripts/analysis/analyze.sh @@ -121,7 +121,7 @@ if [ ! -f "${SETTINGS_PROFILE_SCRIPT}" ] ; then exit 1 fi -# Execute the settings profile script that sets all the neccessary settings variables (overrideable by environment variables). +# Execute the settings profile script that sets all the necessary settings variables (overrideable by environment variables). echo "analyze: Using analysis settings profile script ${SETTINGS_PROFILE_SCRIPT}" source "${SETTINGS_PROFILE_SCRIPT}" diff --git a/scripts/executeJupyterNotebook.sh b/scripts/executeJupyterNotebook.sh index 953274b73..0551b198d 100755 --- a/scripts/executeJupyterNotebook.sh +++ b/scripts/executeJupyterNotebook.sh @@ -94,7 +94,7 @@ jupyter nbconvert --to notebook \ --output "$jupyter_notebook_output_file_name" \ --output-dir="./" \ --ExecutePreprocessor.timeout=480 -echo "executeJupyterNotebook: Sucessfully executed Jupyter Notebook ${jupyter_notebook_output_file_name}." +echo "executeJupyterNotebook: Successfully executed Jupyter Notebook ${jupyter_notebook_output_file_name}." # Convert the Jupyter Notebook to Markdown jupyter nbconvert --to markdown --no-input "$jupyter_notebook_output_file" @@ -104,10 +104,10 @@ jupyter nbconvert --to markdown --no-input "$jupyter_notebook_output_file" # Therefore the temporary file ".nostyle" is created and then moved to overwrite the original markdown file. 
sed -E '//,/<\/style>/d' "${jupyter_notebook_markdown_file}" > "${jupyter_notebook_markdown_file}.nostyle" mv -f "${jupyter_notebook_markdown_file}.nostyle" "${jupyter_notebook_markdown_file}" -echo "executeJupyterNotebook: Sucessfully created Markdown ${jupyter_notebook_markdown_file}.." +echo "executeJupyterNotebook: Successfully created Markdown ${jupyter_notebook_markdown_file}.." # Convert the Jupyter Notebook to PDF if [ -n "${ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION}" ]; then jupyter nbconvert --to webpdf --no-input --allow-chromium-download --disable-chromium-sandbox "$jupyter_notebook_output_file" - echo "executeJupyterNotebook: Sucessfully created PDF ${jupyter_notebook_output_file}." + echo "executeJupyterNotebook: Successfully created PDF ${jupyter_notebook_output_file}." fi \ No newline at end of file diff --git a/scripts/executeJupyterNotebookReport.sh b/scripts/executeJupyterNotebookReport.sh new file mode 100755 index 000000000..3e4e4722a --- /dev/null +++ b/scripts/executeJupyterNotebookReport.sh @@ -0,0 +1,147 @@ +#!/usr/bin/env bash + +# Executes the given Jupyter Notebook and puts all resulting files (ipynb, md, pdf) into an accordingly named directory within the "results" directory. 
+# +# Command line options: +# --jupyterNotebook: Name of the Jupyter Notebook file including its file extension relative to the "jupyter" directory (required) +# --reportName: nameOfTheReportsDirectory (optional, default = kebab cased name of the Jupyter Notebook file) + +# Requires executeQueryFunctions.sh, executeJupyterNotebook.sh, cleanupAfterReportGeneration.sh + +# Override-able constants (defaults also defined in sub scripts) +REPORTS_DIRECTORY=${REPORTS_DIRECTORY:-"reports"} + +# Fail on any error ("-e" = exit on first error, "-o pipefail" exits on errors within piped commands) +set -o errexit -o pipefail + +# Function to display script usage +usage() { + echo "Usage: $0 --jupyterNotebook nameOfTheJupyterNotebook [--reportName nameOfTheReportsDirectory]" + echo "Example: $0 --jupyterNotebook ArtifactDependencies.ipynb" + exit 1 +} + +# Converts the given camel case file name (basename) to kebab case (with dashes in between) +# Parameters: +# - File name in camel case +camel_to_kebab_case_file_name() { + basename "${1%.*}" | sed -r 's/([a-z0-9])([A-Z])/\1-\2/g' | tr '[:upper:]' '[:lower:]' +} + +# Returns the value of the Jupyter Notebook custom metadata property "code_graph_analysis_pipeline_data_validation" +# or an empty string if it doesn't exist. +# Parameters: +# - Jupyter Notebook file name, e.g. ${JUPYTER_NOTEBOOK_DIRECTORY}/${jupyterNotebook} +get_data_validation_from_jupyter_metadata() { + grep -m1 -o '"code_graph_analysis_pipeline_data_validation":\s*"[^"]*"' "${1}" | cut -d '"' -f 4 || true +} + +# Uses "get_data_validation_from_jupyter_metadata" to extract the name of the +# data validation Cypher query out of the Jupyter Notebook file given as first parameter. +# The equally named Cypher query file is then loaded from the Cypher directory given as second parameter +# and the "Validation" directory in it. +# This Cypher query is then executed. If there is at least one result, then the validation is considered successful. 
+# +# Parameters +# - Jupyter Notebook file name, e.g. ${JUPYTER_NOTEBOOK_DIRECTORY}/${jupyterNotebook} +# - Cypher query directory, e.g. ${CYPHER_DIR} +validate_data_available() { + local jupyterNotebookFile="${1}" + local cypherDirectory="${2}" + + dataValidation=$(get_data_validation_from_jupyter_metadata "${jupyterNotebookFile}") + if [ -z "${dataValidation}" ] ; then + echo "executeJupyterNotebookReport: Skipping data validation. Jupyter Notebook ${jupyterNotebookFile} has no 'code_graph_analysis_pipeline_data_validation' metadata property." + return 0 + fi + echo "executeJupyterNotebookReport: dataValidation=${dataValidation}" + + local dataValidationCypherQuery="${cypherDirectory}/Validation/${dataValidation}.cypher" + if [ ! -f "${dataValidationCypherQuery}" ] ; then + echo "executeJupyterNotebookReport: Error: Validation Cypher Query file ${dataValidationCypherQuery} doesn't exist." + exit 1 + fi + + echo "executeJupyterNotebookReport: Validating data using Cypher query ${dataValidationCypherQuery} ..." + local dataValidationResult + dataValidationResult=$( execute_cypher_http_number_of_lines_in_result "${dataValidationCypherQuery}" ) + if [[ "${dataValidationResult}" -ge 1 ]]; then + echo "executeJupyterNotebookReport: Validation succeeded." + true; + else + echo "executeJupyterNotebookReport: Validation failed. No data from query ${dataValidationCypherQuery}." 
+ false; + fi +} + +# Default values +reportName="" +jupyterNotebook="" + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + commandLineOption="${1}" + case ${commandLineOption} in + --jupyterNotebook) + jupyterNotebook="${2}" + shift + ;; + --reportName) + reportName="${2}" + shift + ;; + + *) + echo "executeJupyterNotebookReport: Error: Unknown option: ${commandLineOption}" + usage + ;; + esac + shift +done + +if [[ -z ${jupyterNotebook} ]]; then + echo "${USAGE}" + exit 1 +fi + +if [[ -z ${reportName} ]]; then + reportName=$(camel_to_kebab_case_file_name "${jupyterNotebook}") + echo "executeJupyterNotebookReport: reportName defaults to ${reportName}" +fi + +## Get this "scripts" directory if not already set +# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution. +# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes. +# This way non-standard tools like readlink aren't needed. +SCRIPTS_DIR=${SCRIPTS_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} # Repository directory containing the shell scripts +echo "executeJupyterNotebookReport: SCRIPTS_DIR=${SCRIPTS_DIR}" + +# Get the "scripts" directory by taking the path of this script and going one directory up. +REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR:-"${SCRIPTS_DIR}/reports"} # Repository directory containing the report scripts +echo "executeJupyterNotebookReport: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}" + +# Get the "jupyter" directory by taking the path of this script and going two directory up and then to "jupyter". +JUPYTER_NOTEBOOK_DIRECTORY=${JUPYTER_NOTEBOOK_DIRECTORY:-"${SCRIPTS_DIR}/../jupyter"} # Repository directory containing the Jupyter Notebooks +echo "executeJupyterNotebookReport: JUPYTER_NOTEBOOK_DIRECTORY=${JUPYTER_NOTEBOOK_DIRECTORY}" + +# Get the "cypher" directory by taking the path of this script and going two directory up and then to "cypher". 
+CYPHER_DIR=${CYPHER_DIR:-"${SCRIPTS_DIR}/../cypher"} +echo "executeJupyterNotebookReport CYPHER_DIR=${CYPHER_DIR}" + +# Define functions to execute cypher queries from within a given file, like e.g. "get_data_validation_from_jupyter_metadata" +source "${SCRIPTS_DIR}/executeQueryFunctions.sh" + +# Create report directory +FULL_REPORT_DIRECTORY="${REPORTS_DIRECTORY}/${reportName}" +mkdir -p "${FULL_REPORT_DIRECTORY}" + +if validate_data_available "${JUPYTER_NOTEBOOK_DIRECTORY}/${jupyterNotebook}" "${CYPHER_DIR}"; then + # Execute and convert the given Jupyter Notebook within the given reports directory + (cd "${FULL_REPORT_DIRECTORY}" && exec "${SCRIPTS_DIR}/executeJupyterNotebook.sh" "${JUPYTER_NOTEBOOK_DIRECTORY}/${jupyterNotebook}") +else + echo "executeJupyterNotebookReport: Skipping Jupyter Notebook ${jupyterNotebook} because of missing data." +fi + + +# Clean-up after report generation. Empty reports will be deleted. +source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${FULL_REPORT_DIRECTORY}" \ No newline at end of file diff --git a/scripts/executeJupyterNotebookReports.sh b/scripts/executeJupyterNotebookReports.sh deleted file mode 100755 index fd09d9ea9..000000000 --- a/scripts/executeJupyterNotebookReports.sh +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env bash - -# Executes the Jupyter Notebook given with the command line option --jupyterNotebook and creates a report directory for the results (ipynb, md, pdf).. 
- -# Requires executeJupyterNotebook.sh, cleanupAfterReportGeneration.sh - -# Overrideable Constants (defaults also defined in sub scripts) -REPORTS_DIRECTORY=${REPORTS_DIRECTORY:-"reports"} - -# Fail on any error ("-e" = exit on first error, "-o pipefail" exist on errors within piped commands) -set -o errexit -o pipefail - -# Function to display script usage -usage() { - echo "Usage: $0 --jupyterNotebook nameOfTheJupyterNotebook [--reportName nameOfTheReportsDirectory]" - echo "Example: $0 --jupyterNotebook ArtifactDependencies.ipynb" - exit 1 -} - -camel_to_kebab_case_file_name() { - basename "${1%.*}" | sed -r 's/([a-z0-9])([A-Z])/\1-\2/g' | tr '[:upper:]' '[:lower:]' -} - -# Default values -reportName="" -jupyterNotebook="" - -# Parse command line arguments -while [[ $# -gt 0 ]]; do - commandLineOption="${1}" - case ${commandLineOption} in - --jupyterNotebook) - jupyterNotebook="${2}" - shift - ;; - --reportName) - reportName="${2}" - shift - ;; - - *) - echo "executeJupyterNotebookReports: Error: Unknown option: ${commandLineOption}" - usage - ;; - esac - shift -done - -if [[ -z ${jupyterNotebook} ]]; then - echo "${USAGE}" - exit 1 -fi - -if [[ -z ${reportName} ]]; then - reportName=$(camel_to_kebab_case_file_name "${jupyterNotebook}") - echo "executeJupyterNotebookReports: reportName defaults to ${reportName}" -fi - -## Get this "scripts" directory if not already set -# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution. -# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes. -# This way non-standard tools like readlink aren't needed. -SCRIPTS_DIR=${SCRIPTS_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} # Repository directory containing the shell scripts -echo "executeJupyterNotebookReports: SCRIPTS_DIR=${SCRIPTS_DIR}" - -# Get the "scripts" directory by taking the path of this script and going one directory up. 
-REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR:-"${SCRIPTS_DIR}/reports"} # Repository directory containing the report scripts -echo "executeJupyterNotebookReports: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}" - -# Get the "jupyter" directory by taking the path of this script and going two directory up and then to "jupyter". -JUPYTER_NOTEBOOK_DIRECTORY=${JUPYTER_NOTEBOOK_DIRECTORY:-"${SCRIPTS_DIR}/../jupyter"} # Repository directory containing the Jupyter Notebooks -echo "executeJupyterNotebookReports: JUPYTER_NOTEBOOK_DIRECTORY=${JUPYTER_NOTEBOOK_DIRECTORY}" - -# Create report directory -FULL_REPORT_DIRECTORY="${REPORTS_DIRECTORY}/${reportName}" -mkdir -p "${FULL_REPORT_DIRECTORY}" - -# Execute and convert the given Jupyter Notebook within the given reports directory -(cd "${FULL_REPORT_DIRECTORY}" && exec "${SCRIPTS_DIR}/executeJupyterNotebook.sh" "${JUPYTER_NOTEBOOK_DIRECTORY}/${jupyterNotebook}") - -# Clean-up after report generation. Empty reports will be deleted. -source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${FULL_REPORT_DIRECTORY}" \ No newline at end of file diff --git a/scripts/reports/compilations/JupyterReports.sh b/scripts/reports/compilations/JupyterReports.sh index 425bf0435..11bc3973b 100755 --- a/scripts/reports/compilations/JupyterReports.sh +++ b/scripts/reports/compilations/JupyterReports.sh @@ -6,7 +6,7 @@ # For PDF generation chromium is required additionally. # Therefore these reports will take longer and require more resources than just plain database queries/procedures. 
-# Requires executeJupyterNotebookReports.sh, jupyter/*.ipynb +# Requires executeJupyterNotebookReport.sh, jupyter/*.ipynb # Fail on any error ("-e" = exit on first error, "-o pipefail" exist on errors within piped commands) set -o errexit -o pipefail @@ -33,5 +33,5 @@ echo "JupyterReports: JUPYTER_NOTEBOOK_DIRECTORY=${JUPYTER_NOTEBOOK_DIRECTORY}" for jupyter_notebook_file in "${JUPYTER_NOTEBOOK_DIRECTORY}"/*.ipynb; do jupyter_notebook_file=$( basename "${jupyter_notebook_file}") echo "JupyterReports: Executing ${jupyter_notebook_file}..."; - source "${SCRIPTS_DIR}/executeJupyterNotebookReports.sh" --jupyterNotebook "${jupyter_notebook_file}" + source "${SCRIPTS_DIR}/executeJupyterNotebookReport.sh" --jupyterNotebook "${jupyter_notebook_file}" done diff --git a/scripts/setupJQAssistant.sh b/scripts/setupJQAssistant.sh index 4e134e0df..c71143e70 100755 --- a/scripts/setupJQAssistant.sh +++ b/scripts/setupJQAssistant.sh @@ -30,7 +30,7 @@ if [ -z "${TOOLS_DIRECTORY}" ]; then exit 1 else # Create tools directory if it doesn't exists - echo "setupJQAssistant: Creating tools directory <${TOOLS_DIRECTORY}> if neccessary" + echo "setupJQAssistant: Creating tools directory <${TOOLS_DIRECTORY}> if necessary" mkdir -p "${TOOLS_DIRECTORY}" fi @@ -40,7 +40,7 @@ if [ -z "${SHARED_DOWNLOADS_DIRECTORY}" ]; then exit 1 else # Create shared downloads directory if it doesn't exists - echo "setupJQAssistant: Creating shared downloads directory <${SHARED_DOWNLOADS_DIRECTORY}> if neccessary" + echo "setupJQAssistant: Creating shared downloads directory <${SHARED_DOWNLOADS_DIRECTORY}> if necessary" mkdir -p "${SHARED_DOWNLOADS_DIRECTORY}" fi @@ -57,7 +57,7 @@ if [ ! 
-d "${JQASSISTANT_INSTALLATION_DIRECTORY}" ] ; then # Unpack the ZIP file (-q option for less verbose output) unzip -q "${SHARED_DOWNLOADS_DIRECTORY}/${jqassistant_cli_fulldownload_file}" -d "${TOOLS_DIRECTORY}" - echo "setupJQAssistant: Installed sucessfully" + echo "setupJQAssistant: Installed successfully" else echo "setupJQAssistant: ${jqassistant_cli_fulldownload_file} already installed" fi \ No newline at end of file diff --git a/scripts/setupNeo4j.sh b/scripts/setupNeo4j.sh index 34cd0d2cf..ef331d7e7 100755 --- a/scripts/setupNeo4j.sh +++ b/scripts/setupNeo4j.sh @@ -48,7 +48,7 @@ if [ -z "${TOOLS_DIRECTORY}" ]; then exit 1 else # Create tools directory if it doesn't exists - echo "setupNeo4j: Creating tools directory <${TOOLS_DIRECTORY}> if neccessary" + echo "setupNeo4j: Creating tools directory <${TOOLS_DIRECTORY}> if necessary" mkdir -p "${TOOLS_DIRECTORY}" fi @@ -58,7 +58,7 @@ if [ -z "${SHARED_DOWNLOADS_DIRECTORY}" ]; then exit 1 else # Create shared downloads directory if it doesn't exists - echo "setupNeo4j: Creating shared downloads directory <${SHARED_DOWNLOADS_DIRECTORY}> if neccessary" + echo "setupNeo4j: Creating shared downloads directory <${SHARED_DOWNLOADS_DIRECTORY}> if necessary" mkdir -p "${SHARED_DOWNLOADS_DIRECTORY}" fi @@ -187,7 +187,7 @@ if [ ! -f "${NEO4J_PLUGINS}/${NEO4J_APOC_PLUGIN_ARTIFACT}" ] ; then echo "apoc.export.file.enabled=true" } >> "${NEO4J_APOC_CONFIG}" - echo "setupNeo4j: Awesome Procedures for Neo4j (APOC) installed sucessfully" + echo "setupNeo4j: Awesome Procedures for Neo4j (APOC) installed successfully" else echo "setupNeo4j: ${NEO4J_APOC_PLUGIN_ARTIFACT} already installed" fi @@ -218,7 +218,7 @@ if [ ! 
-f "${NEO4J_PLUGINS}/${neo4jGraphDataScienceReleaseArtifact}" ] ; then exit 1 fi - echo "setupNeo4j: Graph Data Science (GDS) installed sucessfully" + echo "setupNeo4j: Graph Data Science (GDS) installed successfully" else echo "setupNeo4j: ${neo4jGraphDataScienceReleaseArtifact} already installed" fi \ No newline at end of file