diff --git a/.github/workflows/java-code-analysis.yml b/.github/workflows/java-code-analysis.yml index a4f244c81..2c308728c 100644 --- a/.github/workflows/java-code-analysis.yml +++ b/.github/workflows/java-code-analysis.yml @@ -127,6 +127,7 @@ jobs: env: NEO4J_INITIAL_PASSWORD: ${{ secrets.NEO4J_INITIAL_PASSWORD }} ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION: "true" + IMPORT_GIT_LOG_DATA_IF_SOURCE_IS_PRESENT: "full" # Options: "none", "aggregated", "full" run: | ./../../scripts/analysis/analyze.sh diff --git a/.github/workflows/typescript-code-analysis.yml b/.github/workflows/typescript-code-analysis.yml index 85d48c563..5645a689d 100644 --- a/.github/workflows/typescript-code-analysis.yml +++ b/.github/workflows/typescript-code-analysis.yml @@ -132,6 +132,7 @@ jobs: env: NEO4J_INITIAL_PASSWORD: ${{ secrets.NEO4J_INITIAL_PASSWORD }} ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION: "true" + IMPORT_GIT_LOG_DATA_IF_SOURCE_IS_PRESENT: "full" # Options: "none", "aggregated", "full" run: | ./../../scripts/analysis/analyze.sh diff --git a/COMMANDS.md b/COMMANDS.md index 62938d59d..a1f01bd33 100644 --- a/COMMANDS.md +++ b/COMMANDS.md @@ -9,6 +9,7 @@ - [Start an analysis with CSV reports only](#start-an-analysis-with-csv-reports-only) - [Start an analysis with Jupyter reports only](#start-an-analysis-with-jupyter-reports-only) - [Start an analysis with PDF generation](#start-an-analysis-with-pdf-generation) + - [Start an analysis without importing git log data](#start-an-analysis-without-importing-git-log-data) - [Only run setup and explore the Graph manually](#only-run-setup-and-explore-the-graph-manually) - [Generate Markdown References](#generate-markdown-references) - [Generate Cypher Reference](#generate-cypher-reference) @@ -24,6 +25,10 @@ - [Setup jQAssistant Java Code Analyzer](#setup-jqassistant-java-code-analyzer) - [Download Maven Artifacts to analyze](#download-maven-artifacts-to-analyze) - [Reset the database and scan the java artifacts](#reset-the-database-and-scan-the-java-artifacts) + - [Import git log](#import-git-log) + - [Parameters](#parameters) + - [Resolving git files to code files](#resolving-git-files-to-code-files) + - [Import aggregated git log](#import-aggregated-git-log) - [Database Queries](#database-queries) - [Cypher Shell](#cypher-shell) - [HTTP API](#http-api) @@ -100,6 +105,14 @@ Note: Generating a PDF from a Jupyter notebook using [nbconvert](https://nbconve ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION=true ./../../scripts/analysis/analyze.sh ``` +#### Start an analysis without importing git log data + +To speed up the analysis and reduce the data footprint, you can switch off the git log data import for the "source" directory (if present) with `IMPORT_GIT_LOG_DATA_IF_SOURCE_IS_PRESENT="none"` as shown below, or choose `IMPORT_GIT_LOG_DATA_IF_SOURCE_IS_PRESENT="aggregated"` to reduce the data size by importing only monthly grouped changes instead of every single commit. + +```shell +IMPORT_GIT_LOG_DATA_IF_SOURCE_IS_PRESENT="none" ./../../scripts/analysis/analyze.sh +``` + #### Only run setup and explore the Graph manually To prepare everything for analysis including installation, configuration and preparation queries to explore the graph manually @@ -214,6 +227,35 @@ enhance the data further with relationships between artifacts and packages. Be aware that this script deletes all previous relationships and nodes in the local Neo4j Graph database. +### Import git log + +Use [importGitLog.sh](./scripts/importGitLog.sh) to import git log data into the Graph.
+It uses `git log` to extract commits, their authors and the names of the files changed with them. These are stored in an intermediate CSV file and are then imported into Neo4j with the following schema: + +```Cypher +(Git:Log:Author)-[:AUTHORED]->(Git:Log:Commit)-[:CONTAINS]->(Git:Log:File) +``` + +👉**Note:** Commit messages containing `[bot]` are filtered out to ignore changes made by bots. + +#### Parameters + +The optional parameter `--repository directory-path-to-a-git-repository` can be used to select a different directory for the repository. By default, the `source` directory within the analysis workspace directory is used. This script only needs the git history to be present, so a `git clone --bare` is sufficient. If the `source` directory is also used for the analysis itself (as it is for Typescript), a full git clone is needed. + +#### Resolving git files to code files + +After git log data has been imported successfully, [Add_RESOLVES_TO_relationships_to_git_files_for_Java.cypher](./cypher/GitLog/Add_RESOLVES_TO_relationships_to_git_files_for_Java.cypher) is used to resolve the imported git file names to code files. This first attempt covers most cases, but not all of them. With this approach it is, for example, not possible to distinguish identically named files in different Java jars that were all built from the git source files of one mono repo. + +You can use [List_unresolved_git_files.cypher](./cypher/GitLog/List_unresolved_git_files.cypher) to find code files that couldn't be matched to git file names and [List_ambiguous_git_files.cypher](./cypher/GitLog/List_ambiguous_git_files.cypher) to find ambiguously resolved git files. If you have an idea on how to improve this, feel free to [open an issue](https://github.com/JohT/code-graph-analysis-pipeline/issues/new). + +### Import aggregated git log + +Use [importAggregatedGitLog.sh](./scripts/importAggregatedGitLog.sh) to import git log data in an aggregated form into the Graph. It works similarly to the [full git log version above](#import-git-log). The only difference is that not every single commit is imported. Instead, changes are grouped per month including their commit count. This is in many cases sufficient and reduces data size and processing time significantly. Here is the resulting schema: + +```Cypher +(Git:Log:Author)-[:AUTHORED]->(Git:Log:ChangeSpan)-[:CONTAINS]->(Git:Log:File) +``` + ## Database Queries ### Cypher Shell diff --git a/GETTING_STARTED.md b/GETTING_STARTED.md index b363f7bb4..26a04ede9 100644 --- a/GETTING_STARTED.md +++ b/GETTING_STARTED.md @@ -24,7 +24,7 @@ Please read through the [Prerequisites](./README.md#hammer_and_wrench-prerequisi cd MyFirstAnalysis ``` -1. Choose an initial password for Neo4j +1. Choose an initial password for Neo4j if not already done ```shell export NEO4J_INITIAL_PASSWORD=theinitialpasswordthatihavechosenforneo4j ``` @@ -36,9 +36,11 @@ Please read through the [Prerequisites](./README.md#hammer_and_wrench-prerequisi mkdir artifacts ``` -1. Move the artifacts you want to analyze into the `artifacts` directory +1. Move the artifacts (Java jars or Typescript analysis JSON files) you want to analyze into the `artifacts` directory -1. Optionally run a predefined script to download artifacts +1. Optionally, create a `source` directory and clone the corresponding source code into it to also gather git log data. + +1.
As an alternative to the steps above, run one of the predefined download scripts ```shell ./../../scripts/downloader/downloadAxonFramework.sh ``` @@ -48,31 +50,31 @@ Please read through the [Prerequisites](./README.md#hammer_and_wrench-prerequisi 1. Start the analysis - - Without any additional dependencies: + - Without any additional dependencies: ```shell ./../../scripts/analysis/analyze.sh --report Csv ``` - - Jupyter notebook reports when Python and Conda are installed: + - Jupyter notebook reports when Python and Conda are installed: ```shell ./../../scripts/analysis/analyze.sh --report Jupyter ``` - - Graph visualizations when Node.js and npm are installed: + - Graph visualizations when Node.js and npm are installed: ```shell ./../../scripts/analysis/analyze.sh --report Jupyter ``` - - All reports with Python, Conda, Node.js and npm installed: + - All reports with Python, Conda, Node.js and npm installed: ```shell ./../../scripts/analysis/analyze.sh ``` - - To explore the database yourself without any automatically generated reports and no additional requirements: + - To explore the database yourself without any automatically generated reports and no additional requirements: ```shell ./../../scripts/analysis/analyze.sh --explore diff --git a/README.md b/README.md index 3df79e209..dac7a07bc 100644 --- a/README.md +++ b/README.md @@ -91,7 +91,9 @@ This could be as simple as running the following command in your Typescript proj npx --yes @jqassistant/ts-lce ``` -- Copy the resulting json file (e.g. `.reports/jqa/ts-output.json`) into the "artifacts" directory for your analysis work directory. Custom subdirectories within "artifacts" are also supported. +- It is recommended to put the cloned source code repository into a directory called `source` within the analysis workspace so that it will also be picked up for the git log data import. + +- Copy the resulting json file (e.g. `.reports/jqa/ts-output.json`) into the `artifacts` directory for your analysis work directory. Custom subdirectories within `artifacts` are also supported. ## :rocket: Getting Started @@ -105,7 +107,7 @@ The [Code Structure Analysis Pipeline](./.github/workflows/java-code-analysis.ym - [Checkout GIT Repository](https://github.com/actions/checkout) - [Setup Java](https://github.com/actions/setup-java) - [Setup Python with Conda](https://github.com/conda-incubator/setup-miniconda) package manager [Mambaforge](https://github.com/conda-forge/miniforge#mambaforge) -- Download artifacts that contain the code to be analyzed [scripts/artifacts](./scripts/downloader/) +- Download artifacts and optionally source code that contain the code to be analyzed ([scripts/downloader](./scripts/downloader)) - Setup [Neo4j](https://neo4j.com) Graph Database ([analysis.sh](./scripts/analysis/analyze.sh)) - Setup [jQAssistant](https://jqassistant.github.io/jqassistant/doc) for Java and [Typescript](https://github.com/jqassistant-plugin/jqassistant-typescript-plugin) analysis ([analysis.sh](./scripts/analysis/analyze.sh)) - Start [Neo4j](https://neo4j.com) Graph Database ([analysis.sh](./scripts/analysis/analyze.sh)) @@ -176,7 +178,7 @@ The [Code Structure Analysis Pipeline](./.github/workflows/java-code-analysis.ym 👉 The script will automatically be included because of the directory and its name ending with "Jupyter.sh". - How can i add another code basis to be analyzed automatically? - 👉 Create a new artifacts download script in the [scripts/downloader](./scripts/downloader/) directory.
Take for example [downloadAxonFramework.sh](./scripts/downloader/downloadAxonFramework.sh) as a reference. + 👉 Create a new download script in the [scripts/downloader](./scripts/downloader/) directory. Take for example [downloadAxonFramework.sh](./scripts/downloader/downloadAxonFramework.sh) as a reference. 👉 Run the script separately before executing [analyze.sh](./scripts/analysis/analyze.sh) also in the [pipeline](./.github/workflows/java-code-analysis.yml). - How can i trigger a full re-scan of all artifacts? @@ -195,6 +197,25 @@ The [Code Structure Analysis Pipeline](./.github/workflows/java-code-analysis.ym ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION=true ./../../scripts/analysis/analyze.sh ``` +- How can I disable the git log data import? + 👉 Set the environment variable `IMPORT_GIT_LOG_DATA_IF_SOURCE_IS_PRESENT` to `none`. Example: + + ```shell + export IMPORT_GIT_LOG_DATA_IF_SOURCE_IS_PRESENT="none" + ``` + + 👉 Alternatively, prepend your command with `IMPORT_GIT_LOG_DATA_IF_SOURCE_IS_PRESENT="none"`: + + ```shell + IMPORT_GIT_LOG_DATA_IF_SOURCE_IS_PRESENT="none" ./../../scripts/analysis/analyze.sh + ``` + + 👉 An in-between option is to import only monthly aggregated changes using `IMPORT_GIT_LOG_DATA_IF_SOURCE_IS_PRESENT="aggregated"`: + + ```shell + IMPORT_GIT_LOG_DATA_IF_SOURCE_IS_PRESENT="aggregated" ./../../scripts/analysis/analyze.sh + ``` + - Why are some Jupyter Notebook reports skipped? 👉 The custom Jupyter Notebook metadata property `code_graph_analysis_pipeline_data_validation` can be set to choose a query from [cypher/Validation](./cypher/Validation) that will be executed preliminary to the notebook. If the query leads to at least one result, the validation succeeds and the notebook will be run. If the query leads to no result, the notebook will be skipped. For more details see [Data Availability Validation](./COMMANDS.md#data-availability-validation).
diff --git a/cypher/Create_a_DEPENDS_ON_relationship_for_every_DEPENDS_ON_ARTIFACT.cypher b/cypher/Create_a_DEPENDS_ON_relationship_for_every_DEPENDS_ON_ARTIFACT.cypher deleted file mode 100644 index 10dc6344e..000000000 --- a/cypher/Create_a_DEPENDS_ON_relationship_for_every_DEPENDS_ON_ARTIFACT.cypher +++ /dev/null @@ -1,8 +0,0 @@ -// Create a DEPENDS_ON relationship for every DEPENDS_ON_ARTIFACT - -MATCH (a:Artifact)-[existing:DEPENDS_ON_ARTIFACT]->(b:Artifact) -MERGE (a)-[created:DEPENDS_ON]->(b) - SET created.weight = existing.weight - WITH count(existing) as numberOfExistingRelations - ,count(created) as numberOfCreatedRelations -RETURN numberOfExistingRelations, numberOfCreatedRelations \ No newline at end of file diff --git a/cypher/Create_a_DEPENDS_ON_relationship_for_every_DEPENDS_ON_PACKAGE.cypher b/cypher/Create_a_DEPENDS_ON_relationship_for_every_DEPENDS_ON_PACKAGE.cypher deleted file mode 100644 index d051f10dd..000000000 --- a/cypher/Create_a_DEPENDS_ON_relationship_for_every_DEPENDS_ON_PACKAGE.cypher +++ /dev/null @@ -1,7 +0,0 @@ -// Create a DEPENDS_ON relationship for every DEPENDS_ON_PACKAGE - -MATCH (a:Package)-[existing:DEPENDS_ON_PACKAGE]->(b:Package) -MERGE (a)-[created:DEPENDS_ON]->(b) - WITH count(existing) as numberOfExistingRelations - ,count(created) as numberOfCreatedRelations -RETURN numberOfExistingRelations, numberOfCreatedRelations \ No newline at end of file diff --git a/cypher/External_Dependencies/List_external_Java_types_used.cypher b/cypher/External_Dependencies/List_external_Java_types_used.cypher index 52168e9f6..2800aec5a 100644 --- a/cypher/External_Dependencies/List_external_Java_types_used.cypher +++ b/cypher/External_Dependencies/List_external_Java_types_used.cypher @@ -1,3 +1,4 @@ // List external Java types used -MATCH (external:Java:ExternalType) RETURN external.fqn \ No newline at end of file +MATCH (external:Java:ExternalType) +RETURN labels(external), count(DISTINCT external.fqn) as numberOfExternalTypes \ No newline at end of file diff --git a/cypher/GitLog/Add_RESOLVES_TO_relationships_to_git_files_for_Java.cypher b/cypher/GitLog/Add_RESOLVES_TO_relationships_to_git_files_for_Java.cypher new file mode 100644 index 000000000..ab5bfe087 --- /dev/null +++ b/cypher/GitLog/Add_RESOLVES_TO_relationships_to_git_files_for_Java.cypher @@ -0,0 +1,15 @@ +// Connect git files to code files with a RESOLVES_TO relationship if their names match +// Note: Even if it is tempting to combine this file with the Typescript variant, they are intentionally separated. +// The differences are subtle but need to be thought through and tested carefully. +// Having separate files makes it obvious that there needs to be one for every new source code language.
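+// Example (hypothetical file names, for illustration only): a git file named "messaging/src/main/java/org/example/Event.java" +// would be connected to a scanned code file with fileName "/org/example/Event.class", +// because ".class" is replaced by ".java" below and the git file name ends with the result.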
+ +MATCH (code_file:File&!Git) +WHERE NOT EXISTS { (code_file)-[:RESOLVES_TO]->(other_file:File&!Git) } // only original nodes, no duplicates + WITH code_file, replace(code_file.fileName, '.class', '.java') AS codeFileName +MATCH (git_file:File&Git) +WHERE git_file.fileName ENDS WITH codeFileName +MERGE (git_file)-[:RESOLVES_TO]->(code_file) + SET git_file.resolved = true +RETURN labels(code_file)[0..4] AS codeFileLabels + ,count(DISTINCT codeFileName) AS numberOfCodeFiles + ,collect(DISTINCT codeFileName + ' <-> ' + git_file.fileName + '\n')[0..4] AS examples \ No newline at end of file diff --git a/cypher/GitLog/Add_RESOLVES_TO_relationships_to_git_files_for_Typescript.cypher b/cypher/GitLog/Add_RESOLVES_TO_relationships_to_git_files_for_Typescript.cypher new file mode 100644 index 000000000..8ff8024bc --- /dev/null +++ b/cypher/GitLog/Add_RESOLVES_TO_relationships_to_git_files_for_Typescript.cypher @@ -0,0 +1,15 @@ +// Connect git files to Typescript files with a RESOLVES_TO relationship if their names match +// Note: Even if it is tempting to combine this file with the Java variant, they are intentionally separated. +// The differences are subtle but need to be thought through and tested carefully. +// Having separate files makes it obvious that there needs to be one for every new source code language. + +MATCH (code_file:File&!Git) +WHERE NOT EXISTS { (code_file)-[:RESOLVES_TO]->(other_file:File&!Git) } // only original nodes, no duplicates + WITH code_file, code_file.absoluteFileName AS codeFileName +MATCH (git_file:File&Git) +WHERE codeFileName ENDS WITH git_file.fileName +MERGE (git_file)-[:RESOLVES_TO]->(code_file) + SET git_file.resolved = true +RETURN labels(code_file)[0..4] AS codeFileLabels + ,count(DISTINCT codeFileName) AS numberOfCodeFiles + ,collect(DISTINCT codeFileName + ' <-> ' + git_file.fileName + '\n')[0..4] AS examples \ No newline at end of file diff --git a/cypher/GitLog/Delete_git_log_data.cypher b/cypher/GitLog/Delete_git_log_data.cypher new file mode 100644 index 000000000..21c815602 --- /dev/null +++ b/cypher/GitLog/Delete_git_log_data.cypher @@ -0,0 +1,7 @@ +// Delete all Git log data in the Graph + +MATCH (n:Git) +CALL { WITH n +DETACH DELETE n +} IN TRANSACTIONS OF 1000 ROWS +RETURN count(n) as numberOfDeletedRows \ No newline at end of file diff --git a/cypher/GitLog/Import_aggregated_git_log_csv_data.cypher b/cypher/GitLog/Import_aggregated_git_log_csv_data.cypher new file mode 100644 index 000000000..d74f59fe7 --- /dev/null +++ b/cypher/GitLog/Import_aggregated_git_log_csv_data.cypher @@ -0,0 +1,17 @@ +// Import aggregated git log CSV data with the following schema: (Git:Log:Author)-[:AUTHORED]->(Git:Log:ChangeSpan)-[:CONTAINS]->(Git:Log:File) + +LOAD CSV WITH HEADERS FROM "file:///aggregatedGitLog.csv" AS row +CALL { WITH row + MERGE (git_author:Git:Log:Author {name: row.author, email: row.email}) + MERGE (git_change_span:Git:Log:ChangeSpan { + year: toInteger(row.year), + month: toInteger(row.month), + commits: toInteger(row.commits) + }) + MERGE (git_file:Git:Log:File {fileName: row.filename}) + MERGE (git_author)-[:AUTHORED]->(git_change_span) + MERGE (git_change_span)-[:CONTAINS]->(git_file) +} IN TRANSACTIONS OF 1000 ROWS +RETURN count(DISTINCT row.author) AS numberOfAuthors + ,count(DISTINCT row.filename) AS numberOfFiles + ,sum(toInteger(row.commits)) AS numberOfCommits \ No newline at end of file diff --git a/cypher/GitLog/Import_git_log_csv_data.cypher b/cypher/GitLog/Import_git_log_csv_data.cypher new file mode 100644 index 000000000..113ed8bca
--- /dev/null +++ b/cypher/GitLog/Import_git_log_csv_data.cypher @@ -0,0 +1,18 @@ +// Import git log CSV data with the following schema: (Git:Log:Author)-[:AUTHORED]->(Git:Log:Commit)-[:CONTAINS]->(Git:Log:File) + +LOAD CSV WITH HEADERS FROM "file:///gitLog.csv" AS row +CALL { WITH row + MERGE (git_author:Git:Log:Author {name: row.author, email: row.email}) + MERGE (git_commit:Git:Log:Commit { + hash: row.hash, + message: row.message, + timestamp: datetime(row.timestamp), + timestamp_unix: toInteger(row.timestamp_unix) + }) + MERGE (git_file:Git:Log:File {fileName: row.filename}) + MERGE (git_author)-[:AUTHORED]->(git_commit) + MERGE (git_commit)-[:CONTAINS]->(git_file) +} IN TRANSACTIONS OF 1000 ROWS +RETURN count(DISTINCT row.author) AS numberOfAuthors + ,count(DISTINCT row.filename) AS numberOfFiles + ,count(DISTINCT row.hash) AS numberOfCommits \ No newline at end of file diff --git a/cypher/GitLog/Index_author_name.cypher b/cypher/GitLog/Index_author_name.cypher new file mode 100644 index 000000000..9b551f49a --- /dev/null +++ b/cypher/GitLog/Index_author_name.cypher @@ -0,0 +1,3 @@ +// Create index for author name (git data) + +CREATE INDEX INDEX_AUTHOR_NAME IF NOT EXISTS FOR (n:Author) ON (n.name) \ No newline at end of file diff --git a/cypher/GitLog/Index_change_span_year.cypher b/cypher/GitLog/Index_change_span_year.cypher new file mode 100644 index 000000000..beb3b7ce6 --- /dev/null +++ b/cypher/GitLog/Index_change_span_year.cypher @@ -0,0 +1,3 @@ +// Create index for change span year (aggregated git data) + +CREATE INDEX INDEX_CHANGE_SPAN_YEAR IF NOT EXISTS FOR (n:ChangeSpan) ON (n.year) \ No newline at end of file diff --git a/cypher/GitLog/Index_commit_hash.cypher b/cypher/GitLog/Index_commit_hash.cypher new file mode 100644 index 000000000..7365ba40a --- /dev/null +++ b/cypher/GitLog/Index_commit_hash.cypher @@ -0,0 +1,3 @@ +// Create index for commit hash (git data) + +CREATE INDEX INDEX_COMMIT_HASH IF NOT EXISTS FOR (n:Commit) ON (n.hash) \ No newline at end of file diff --git a/cypher/GitLog/Index_file_name.cypher b/cypher/GitLog/Index_file_name.cypher new file mode 100644 index 000000000..d078c0dd9 --- /dev/null +++ b/cypher/GitLog/Index_file_name.cypher @@ -0,0 +1,3 @@ +// Create index for the file name + +CREATE INDEX INDEX_FILE_NAME IF NOT EXISTS FOR (t:File) ON (t.fileName) \ No newline at end of file diff --git a/cypher/GitLog/List_ambiguous_git_files.cypher b/cypher/GitLog/List_ambiguous_git_files.cypher new file mode 100644 index 000000000..53f65e206 --- /dev/null +++ b/cypher/GitLog/List_ambiguous_git_files.cypher @@ -0,0 +1,20 @@ +// List ambiguously resolved git files where more than one git file is attached to a single code file for troubleshooting/testing.
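+// Such ambiguities typically stem from identical file names that occur more than once in the repository, +// e.g. the same class name in different jars built from one mono repo (see COMMANDS.md, "Resolving git files to code files").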
+ +MATCH (file:File&!Git)<-[:RESOLVES_TO]-(git_file:File&Git) +OPTIONAL MATCH (artifact:Artifact:Archive)-[:CONTAINS]->(file) + WITH file.fileName AS fileName + ,reverse(split(reverse(file.fileName),'.')[0]) AS fileExtension + ,count(DISTINCT git_file.fileName) AS gitFilesCount + ,collect(DISTINCT split(git_file.fileName,'/')[0])[0..6] AS gitFileFirstPathExamples + ,collect(DISTINCT git_file.fileName)[0..6] AS gitFileExamples + ,collect(DISTINCT artifact.fileName) AS artifacts +WHERE gitFilesCount > 1 +RETURN fileName + ,fileExtension + ,gitFilesCount + ,count(*) AS numberOfCases + ,artifacts + ,gitFileFirstPathExamples + ,gitFileExamples +ORDER BY gitFilesCount DESC, fileName ASC +LIMIT 50 \ No newline at end of file diff --git a/cypher/GitLog/List_unresolved_git_files.cypher b/cypher/GitLog/List_unresolved_git_files.cypher new file mode 100644 index 000000000..4faee0ebb --- /dev/null +++ b/cypher/GitLog/List_unresolved_git_files.cypher @@ -0,0 +1,9 @@ +// List code files not covered by imported git data for troubleshooting/testing. + + MATCH (code_file:File&!Git&!Directory) + WHERE NOT EXISTS { (code_file)<-[:RESOLVES_TO]-(git_file:File&Git) } +RETURN reverse(split(reverse(code_file.fileName),'.')[0]) AS codeFileExtension + ,labels(code_file)[0..3] AS firstThreeCodeFileLabels + ,count(DISTINCT code_file.fileName) AS codeFileCount + ,collect(DISTINCT code_file.fileName)[0..6] AS codeFileExamples +LIMIT 50 \ No newline at end of file diff --git a/cypher/GitLog/Set_number_of_aggregated_git_commits.cypher b/cypher/GitLog/Set_number_of_aggregated_git_commits.cypher new file mode 100644 index 000000000..13b5060f5 --- /dev/null +++ b/cypher/GitLog/Set_number_of_aggregated_git_commits.cypher @@ -0,0 +1,7 @@ +// Set numberOfGitCommits property on code File nodes when aggregated change spans with grouped commits are present. + +MATCH (code_file:File&!Git)<-[:RESOLVES_TO]-(git_file:File&Git) +MATCH (git_file)<-[:CONTAINS]-(git_changespan:Git:ChangeSpan) + WITH code_file, sum(git_changespan.commits) AS numberOfGitCommits + SET code_file.numberOfGitCommits = numberOfGitCommits +RETURN count(DISTINCT coalesce(code_file.absoluteFileName, code_file.fileName)) AS changedCodeFiles \ No newline at end of file diff --git a/cypher/GitLog/Set_number_of_git_commits.cypher b/cypher/GitLog/Set_number_of_git_commits.cypher new file mode 100644 index 000000000..1ecb27241 --- /dev/null +++ b/cypher/GitLog/Set_number_of_git_commits.cypher @@ -0,0 +1,7 @@ +// Set numberOfGitCommits property on code File nodes when git commits are present + +MATCH (code_file:File&!Git)<-[:RESOLVES_TO]-(git_file:File&Git) +MATCH (git_file)<-[:CONTAINS]-(git_commit:Git:Commit) + WITH code_file, count(DISTINCT git_commit.hash) AS numberOfGitCommits + SET code_file.numberOfGitCommits = numberOfGitCommits +RETURN count(DISTINCT coalesce(code_file.absoluteFileName, code_file.fileName)) AS changedCodeFiles \ No newline at end of file diff --git a/scripts/analysis/analyze.sh b/scripts/analysis/analyze.sh index 972975d7c..975031905 100755 --- a/scripts/analysis/analyze.sh +++ b/scripts/analysis/analyze.sh @@ -2,11 +2,11 @@ # Coordinates the end-to-end analysis process, encompassing tool installation, graph generation, and report generation.
# - Download and setup Neo4j and JQAssistant -# - Scan and analyze the contents of the artifacts directory to create the graph +# - Scan and analyze the contents of the artifacts and source directory to create the graph # - Trigger all requested reports # Note: Everything is done in the current (=working) directory and one directory above (shared downloads). -# It is recommended to create an empty directory (preferrable "temp") and +# It is recommended to create an empty directory (preferable "temp") and # within that another one for the analysis (e.g. "MyCodebaseName-Version") # and change into it prior to starting this script. diff --git a/scripts/downloader/downloadAxonFramework.sh b/scripts/downloader/downloadAxonFramework.sh index b7c5dcc51..8c6142a94 100755 --- a/scripts/downloader/downloadAxonFramework.sh +++ b/scripts/downloader/downloadAxonFramework.sh @@ -15,6 +15,7 @@ SCRIPT_FILE_NAME="$(basename -- "${BASH_SOURCE[0]}")" SCRIPT_FILE_NAME_WITHOUT_EXTENSION="${SCRIPT_FILE_NAME%%.*}" SCRIPT_FILE_NAME_WITHOUT_PREFIX_AND_EXTENSION="${SCRIPT_FILE_NAME_WITHOUT_EXTENSION##download}" ANALYSIS_NAME="${SCRIPT_FILE_NAME_WITHOUT_PREFIX_AND_EXTENSION}" +SOURCE_DIRECTORY=${SOURCE_DIRECTORY:-"source"} # Get the source repository directory (defaults to "source") echo "download${ANALYSIS_NAME}: SCRIPT_FILE_NAME=${SCRIPT_FILE_NAME}" echo "download${ANALYSIS_NAME}: SCRIPT_FILE_NAME_WITHOUT_EXTENSION=${SCRIPT_FILE_NAME_WITHOUT_EXTENSION}" @@ -22,7 +23,7 @@ echo "download${ANALYSIS_NAME}: ANALYSIS_NAME=${ANALYSIS_NAME}" # Read the first input argument containing the version(s) of the artifact(s) if [ "$#" -ne 1 ]; then - echo "Error (download${ANALYSIS_NAME}): Usage: $0 " >&2 + echo "Error (download${ANALYSIS_NAME}): Usage: $0 <version> (e.g. 4.9.3)" >&2 exit 1 fi ARTIFACTS_VERSION=$1 @@ -41,7 +42,6 @@ echo "download${ANALYSIS_NAME}: SCRIPTS_DIR=${SCRIPTS_DIR}" ################################################################ # Download Artifacts that will be analyzed -################################################################ ARTIFACTS_GROUP="org.axonframework" source "${SCRIPTS_DIR}/downloadMavenArtifact.sh" -g ${ARTIFACTS_GROUP} -a axon-configuration -v ${ARTIFACTS_VERSION} || exit 2 source "${SCRIPTS_DIR}/downloadMavenArtifact.sh" -g ${ARTIFACTS_GROUP} -a axon-disruptor -v ${ARTIFACTS_VERSION} || exit 2 @@ -49,4 +49,8 @@ source "${SCRIPTS_DIR}/downloadMavenArtifact.sh" -g ${ARTIFACTS_GROUP} -a axon-e source "${SCRIPTS_DIR}/downloadMavenArtifact.sh" -g ${ARTIFACTS_GROUP} -a axon-messaging -v ${ARTIFACTS_VERSION} || exit 2 source "${SCRIPTS_DIR}/downloadMavenArtifact.sh" -g ${ARTIFACTS_GROUP} -a axon-modelling -v ${ARTIFACTS_VERSION} || exit 2 source "${SCRIPTS_DIR}/downloadMavenArtifact.sh" -g ${ARTIFACTS_GROUP} -a axon-test -v ${ARTIFACTS_VERSION} || exit 2 + +# Download the git history (bare clone without working tree) into the "source" folder.
+# This makes it possible to additionally import the git log into the graph. +git clone --bare https://github.com/AxonFramework/AxonFramework.git --branch "axon-${ARTIFACTS_VERSION}" "${SOURCE_DIRECTORY}/.git" ################################################################ \ No newline at end of file diff --git a/scripts/downloader/downloadReactRouter.sh b/scripts/downloader/downloadReactRouter.sh index 941d52d4d..f7f90c8a7 100755 --- a/scripts/downloader/downloadReactRouter.sh +++ b/scripts/downloader/downloadReactRouter.sh @@ -11,8 +11,6 @@ # Note: react-router uses pnpm as package manager which needs to be installed -# Requires downloadMavenArtifact.sh - # Fail on any error (errexit = exit on first error, errtrace = error inherited from sub-shell ,pipefail exist on errors within piped commands) set -o errexit -o errtrace -o pipefail @@ -21,6 +19,7 @@ SCRIPT_FILE_NAME="$(basename -- "${BASH_SOURCE[0]}")" SCRIPT_FILE_NAME_WITHOUT_EXTENSION="${SCRIPT_FILE_NAME%%.*}" SCRIPT_FILE_NAME_WITHOUT_PREFIX_AND_EXTENSION="${SCRIPT_FILE_NAME_WITHOUT_EXTENSION##download}" ANALYSIS_NAME="${SCRIPT_FILE_NAME_WITHOUT_PREFIX_AND_EXTENSION}" +SOURCE_DIRECTORY=${SOURCE_DIRECTORY:-"source"} # Get the source repository directory (defaults to "source") echo "download${ANALYSIS_NAME}: SCRIPT_FILE_NAME=${SCRIPT_FILE_NAME}" echo "download${ANALYSIS_NAME}: SCRIPT_FILE_NAME_WITHOUT_EXTENSION=${SCRIPT_FILE_NAME_WITHOUT_EXTENSION}" @@ -40,13 +39,13 @@ mkdir -p ./runtime/logs ################################################################ # Download react-router source files to be analyzed ################################################################ -git clone https://github.com/remix-run/react-router.git source +git clone https://github.com/remix-run/react-router.git "${SOURCE_DIRECTORY}" ( - cd source || exit + cd "${SOURCE_DIRECTORY}" || exit git checkout "react-router@${PROJECT_VERSION}" || exit pnpm install --frozen-lockfile || exit npx --yes @jqassistant/ts-lce >./../runtime/logs/jqassostant-typescript-scan.log 2>&1 || exit ) mkdir -p artifacts -mv -nv "source/.reports/jqa/ts-output.json" "artifacts/ts-react-router-${PROJECT_VERSION}.json" +mv -nv "${SOURCE_DIRECTORY}/.reports/jqa/ts-output.json" "artifacts/ts-react-router-${PROJECT_VERSION}.json" ################################################################ \ No newline at end of file diff --git a/scripts/importAggregatedGitLog.sh b/scripts/importAggregatedGitLog.sh new file mode 100755 index 000000000..e4f44c437 --- /dev/null +++ b/scripts/importAggregatedGitLog.sh @@ -0,0 +1,145 @@ +#!/usr/bin/env bash + +# Uses git log to create a comma-separated values (CSV) file containing aggregated changes, their author name and email address, year and month for all the files that were changed. The CSV is then imported into Neo4j. + +# Note: This script needs the path to a git repository directory. It defaults to SOURCE_DIRECTORY ("source"). +# Note: Import will be skipped without an error if the directory is not a git repository. +# Note: This script needs git to be installed.
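+# Example call (assuming the usual analysis workspace layout where this repository is two directories up): +# ./../../scripts/importAggregatedGitLog.sh --repository source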
+ +# Fail on any error ("-e" = exit on first error, "-o pipefail" exits on errors within piped commands) +set -o errexit -o pipefail + +# Overrideable Defaults +NEO4J_EDITION=${NEO4J_EDITION:-"community"} # Choose "community" or "enterprise" +NEO4J_VERSION=${NEO4J_VERSION:-"5.16.0"} +TOOLS_DIRECTORY=${TOOLS_DIRECTORY:-"tools"} # Get the tools directory (defaults to "tools") +SOURCE_DIRECTORY=${SOURCE_DIRECTORY:-"source"} # Get the source repository directory (defaults to "source") + +# Default and initial values for command line options +repository="${SOURCE_DIRECTORY}" + +# Read command line options +USAGE="importAggregatedGitLog: Usage: $0 [--repository <directory-path-to-a-git-repository> (default=source)]" +while [[ $# -gt 0 ]]; do + key="$1" + case $key in + --repository) + repository="$2" + # Check if the explicitly given repository is a valid directory + if [ ! -d "${repository}" ] ; then + echo "importAggregatedGitLog: Error: The given repository <${repository}> is not a directory" >&2 + echo "${USAGE}" >&2 + exit 1 + fi + shift + ;; + *) + echo "importAggregatedGitLog: Error: Unknown option: ${key}" + echo "${USAGE}" >&2 + exit 1 + esac + shift +done + +# Check if the repository is actually a git repository +if ! (cd "${repository}" || exit; git rev-parse --git-dir 2> /dev/null || exit); then + echo "importAggregatedGitLog: Import skipped. ${repository} is not a git repository." + exit 0 +fi + +echo "importAggregatedGitLog: repository=${repository}" + +## Get this "scripts" directory if not already set +# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution. +# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes. +# This way non-standard tools like readlink aren't needed. +SCRIPTS_DIR=${SCRIPTS_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} # Repository directory containing the shell scripts +echo "importAggregatedGitLog: SCRIPTS_DIR=$SCRIPTS_DIR" + +# Get the "cypher" directory by taking the path of this script and going one directory up and then into "cypher". +CYPHER_DIR=${CYPHER_DIR:-"${SCRIPTS_DIR}/../cypher"} +echo "importAggregatedGitLog: CYPHER_DIR=${CYPHER_DIR}" + +# Define functions (like execute_cypher and execute_cypher_summarized) to execute cypher queries from within a given file +source "${SCRIPTS_DIR}/executeQueryFunctions.sh" + +# Internal constants +IMPORTS_CYPHER_DIR="${CYPHER_DIR}/Imports" +NEO4J_INSTALLATION_NAME="neo4j-${NEO4J_EDITION}-${NEO4J_VERSION}" +NEO4J_INSTALLATION_DIRECTORY="${TOOLS_DIRECTORY}/${NEO4J_INSTALLATION_NAME}" +NEO4J_FULL_IMPORT_DIRECTORY=$(cd "${NEO4J_INSTALLATION_DIRECTORY}/import"; pwd) +OUTPUT_CSV_FILENAME="${NEO4J_FULL_IMPORT_DIRECTORY}/aggregatedGitLog.csv" + +# ----- Create a CSV file with aggregated git log data: one row per changed file, month and author +echo "importAggregatedGitLog: Creating ${OUTPUT_CSV_FILENAME} from git log..." + +( + # Git log needs to be executed in the directory of the repository. + # This is done in a sub shell to automatically return to the previous directory. + cd "${repository}" || exit + + # Prints the header line of the CSV file with the names of the columns. + echo "filename,year,month,author,email,commits" > "${OUTPUT_CSV_FILENAME}" + + # Prints the aggregated git log in CSV format starting with the changed file, year-month, author, author email and number of commits. + # Includes quoted strings, double quote escaping and supports commas in strings.
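+ # Example result line (hypothetical values): "README.md",2024,06,"Jane Doe","jane@example.org",5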
+ git log --no-merges --pretty=format:' %ad,,,%an,,,%ae' --date=format:'%Y,%m' --name-only | \ + awk 'BEGIN { COMMA=",";QUOTE="\"" } /^ / { split($0, a, ",,,"); gsub(/^ /, "", a[1]); gsub(/"/, "\"\"", a[2]); gsub(/"/, "\"\"", a[3]); commit=a[1] COMMA QUOTE a[2] QUOTE COMMA QUOTE a[3] QUOTE } NF && !/^\ / { print "\""$0"\"," commit }' | + grep -v -F '[bot]' | \ + sort | uniq -c | \ + sed -E 's/^ *([0-9]+) (.+)/\2,\1/g' \ + >> "${OUTPUT_CSV_FILENAME}" + # Explanation: + # + # - --no-merges: Excludes merge commits from the log. + # - %ad: Author date (formatted as specified later) + # - %an: Author name + # - %ae: Author email + # - --date=format:'%Y,%m': Takes the year and the month of the date separated by a comma, for example 2024,06 + # - --name-only: Lists the files affected by each commit. + # - --pretty=format starts with a space that is needed to distinguish commit lines from file name lines. + # - The chosen delimiters ,,, are used to separate these fields to make parsing easier. + # It is very unlikely that they appear in the contents and will be used as an intermediate step before escaping. + # + # - BEGIN { COMMA=","; QUOTE="\"" }: Initializes the variables COMMA and QUOTE to hold a comma and a double-quote character respectively. + # - /^ / { ... }: Processes lines that start with a space (these are the commit info lines, since the pretty format starts with a space). + # - gsub(/^ /, "", a[1]): Removes the leading space from the first field (the date) that was used to indicate a new commit. + # - gsub(/"/, "\"\"", a[2]) escapes double quotes with two double quotes (CSV standard). + # a[2] is the commit author. Double quote escaping is done for every string column. + # - commit=...: Constructs the commit information in CSV format, including the year-month of the change, quoted author name, and email. + # - NF && !/^\ / { print "\""$0"\"," commit }: For non-empty lines that do not start with a space (these are the changed file names), + # it prints the quoted file name followed by the commit information. + # + # - grep -v -F '[bot]': Filters out lines containing [bot], which typically appears in the author name here. + # Used to ignore commits made by automated systems or bots. + # + # - sort | uniq -c: Sorts the lines by their content (order of columns essential for that), removes duplicate lines and adds the number of duplicates at the beginning of each line + # - sed -E 's/^ *([0-9]+) (.+)/\2,\1/g': Reformats each line so that the commit count is the last column, delimited by a comma. +) + +csv_file_size=$(wc -c "${OUTPUT_CSV_FILENAME}" | awk '{print $1}') +csv_lines=$(wc -l "${OUTPUT_CSV_FILENAME}" | awk '{print $1}') +echo "importAggregatedGitLog: File ${OUTPUT_CSV_FILENAME} with ${csv_file_size} bytes and ${csv_lines} lines created." +# --------- + +# ----- Import aggregated git log data CSV +GIT_LOG_CYPHER_DIR="${CYPHER_DIR}/GitLog" + +echo "importAggregatedGitLog: Prepare import by creating indexes..." +execute_cypher "${GIT_LOG_CYPHER_DIR}/Index_author_name.cypher" +execute_cypher "${GIT_LOG_CYPHER_DIR}/Index_change_span_year.cypher" +execute_cypher "${GIT_LOG_CYPHER_DIR}/Index_file_name.cypher" + +echo "importAggregatedGitLog: Deleting all existing git data in the Graph..." +execute_cypher "${GIT_LOG_CYPHER_DIR}/Delete_git_log_data.cypher" + +echo "importAggregatedGitLog: Importing aggregated git log data into the Graph..."
+time execute_cypher "${GIT_LOG_CYPHER_DIR}/Import_aggregated_git_log_csv_data.cypher" + +echo "importAggregatedGitLog: Creating connections to nodes with matching file names..." +execute_cypher "${GIT_LOG_CYPHER_DIR}/Add_RESOLVES_TO_relationships_to_git_files_for_Java.cypher" +execute_cypher "${GIT_LOG_CYPHER_DIR}/Add_RESOLVES_TO_relationships_to_git_files_for_Typescript.cypher" +execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_number_of_aggregated_git_commits.cypher" +# --------- \ No newline at end of file diff --git a/scripts/importGitLog.sh b/scripts/importGitLog.sh new file mode 100755 index 000000000..93add3ef7 --- /dev/null +++ b/scripts/importGitLog.sh @@ -0,0 +1,140 @@ +#!/usr/bin/env bash + +# Uses git log to create a comma-separated values (CSV) file containing all commits, their author, email address, date and all the file names that were changed with them. The CSV is then imported into Neo4j. + +# Note: This script needs the path to a git repository directory. It defaults to SOURCE_DIRECTORY ("source"). +# Note: Import will be skipped without an error if the directory is not a git repository. +# Note: This script needs git to be installed. + +# Fail on any error ("-e" = exit on first error, "-o pipefail" exits on errors within piped commands) +set -o errexit -o pipefail + +# Overrideable Defaults +NEO4J_EDITION=${NEO4J_EDITION:-"community"} # Choose "community" or "enterprise" +NEO4J_VERSION=${NEO4J_VERSION:-"5.16.0"} +TOOLS_DIRECTORY=${TOOLS_DIRECTORY:-"tools"} # Get the tools directory (defaults to "tools") +SOURCE_DIRECTORY=${SOURCE_DIRECTORY:-"source"} # Get the source repository directory (defaults to "source") + +# Default and initial values for command line options +repository="${SOURCE_DIRECTORY}" + +# Read command line options +USAGE="importGitLog: Usage: $0 [--repository <directory-path-to-a-git-repository> (default=source)]" +while [[ $# -gt 0 ]]; do + key="$1" + case $key in + --repository) + repository="$2" + # Check if the explicitly given repository is a valid directory + if [ ! -d "${repository}" ] ; then + echo "importGitLog: Error: The given repository <${repository}> is not a directory" >&2 + echo "${USAGE}" >&2 + exit 1 + fi + shift + ;; + *) + echo "importGitLog: Error: Unknown option: ${key}" + echo "${USAGE}" >&2 + exit 1 + esac + shift +done + +# Check if the repository is actually a git repository +if ! (cd "${repository}" || exit; git rev-parse --git-dir 2> /dev/null || exit); then + echo "importGitLog: Import skipped. ${repository} is not a git repository." + exit 0 +fi + +echo "importGitLog: repository=${repository}" + +## Get this "scripts" directory if not already set +# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution. +# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes. +# This way non-standard tools like readlink aren't needed. +SCRIPTS_DIR=${SCRIPTS_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} # Repository directory containing the shell scripts +echo "importGitLog: SCRIPTS_DIR=$SCRIPTS_DIR" + +# Get the "cypher" directory by taking the path of this script and going one directory up and then into "cypher".
+CYPHER_DIR=${CYPHER_DIR:-"${SCRIPTS_DIR}/../cypher"} +echo "importGitLog: CYPHER_DIR=${CYPHER_DIR}" + +# Define functions (like execute_cypher and execute_cypher_summarized) to execute cypher queries from within a given file +source "${SCRIPTS_DIR}/executeQueryFunctions.sh" + +# Internal constants +NEO4J_INSTALLATION_NAME="neo4j-${NEO4J_EDITION}-${NEO4J_VERSION}" +NEO4J_INSTALLATION_DIRECTORY="${TOOLS_DIRECTORY}/${NEO4J_INSTALLATION_NAME}" +NEO4J_FULL_IMPORT_DIRECTORY=$(cd "${NEO4J_INSTALLATION_DIRECTORY}/import"; pwd) +OUTPUT_CSV_FILENAME="${NEO4J_FULL_IMPORT_DIRECTORY}/gitLog.csv" + +# ----- Create a CSV file with git log data containing all commits and their changed files +echo "importGitLog: Creating ${OUTPUT_CSV_FILENAME} from git log..." + +( + # Git log needs to be executed in the directory of the repository. + # This is done in a sub shell to automatically return to the previous directory. + cd "${repository}" || exit + + # Prints the header line of the CSV file with the names of the columns. + echo "hash,author,email,timestamp,timestamp_unix,message,filename" > "${OUTPUT_CSV_FILENAME}" + + # Prints the git log in CSV format including the changed files. + # Includes quoted strings, double quote escaping and supports commas in strings. + git log --no-merges --pretty=format:' %h,,,%an,,,%ae,,,%aI,,,%ct,,,%s' --name-only | \ + awk 'BEGIN { COMMA=",";QUOTE="\"" } /^ / { split($0, a, ",,,"); gsub(/^ /, "", a[1]); gsub(/"/, "\"\"", a[2]); gsub(/"/, "\"\"", a[3]); gsub(/"/, "\"\"", a[6]); gsub(/\\/, " ", a[6]); commit=a[1] COMMA QUOTE a[2] QUOTE COMMA QUOTE a[3] QUOTE COMMA a[4] COMMA a[5] COMMA QUOTE a[6] QUOTE } NF && !/^\ / { print commit ",\""$0"\"" }' | \ + grep -v -F '[bot]' >> "${OUTPUT_CSV_FILENAME}" + # Explanation: + # + # - --no-merges: Excludes merge commits from the log. + # - %h: Abbreviated commit hash + # - %an: Author name + # - %ae: Author email + # - %aI: Author date, ISO 8601 format + # - %ct: Commit date, Unix timestamp + # - %s: Subject of the commit + # - --name-only: Lists the files affected by each commit. + # - --pretty=format starts with a space that is needed to distinguish commit lines from file name lines. + # - The chosen delimiters ,,, are used to separate these fields to make parsing easier. + # It is very unlikely that they appear in the contents and will be used as an intermediate step before escaping. + # + # - BEGIN { COMMA=","; QUOTE="\"" }: Initializes the variables COMMA and QUOTE to hold a comma and a double-quote character respectively. + # - /^ / { ... }: Processes lines that start with a space (these are the commit info lines, since the pretty format starts with a space). + # - gsub(/^ /, "", a[1]): Removes the leading space from the first field (the commit hash) that was used to indicate a new commit. + # - gsub(/"/, "\"\"", a[6]) escapes double quotes with two double quotes (CSV standard). + # a[6] is the commit message column. Double quote escaping is done for every string column. + # - gsub(/\\/, " ", a[6]): Replaces backslashes in the commit message with spaces. + # Otherwise, \" would lead to an error since it would be seen as a non-escaped double quote. + # - commit=...: Constructs the commit information in CSV format, including the quoted author name, author email, and commit message except for the file name. + # - NF && !/^\ / { print commit ",\""$0"\"" }: For non-empty lines that do not start with a space (these are the changed file names), + # it prints the commit information followed by the file name, enclosed in quotes. +
# + # - grep -v -F '[bot]': Filters out commits where the commit message includes [bot] + # Used to identify commits made by automated systems or bots. +) + +csv_file_size=$(wc -c "${OUTPUT_CSV_FILENAME}" | awk '{print $1}') +csv_lines=$(wc -l "${OUTPUT_CSV_FILENAME}" | awk '{print $1}') +echo "importGitLog: File ${OUTPUT_CSV_FILENAME} with ${csv_file_size} bytes and ${csv_lines} lines created." +# --------- + +# ----- Import git log data CSV +GIT_LOG_CYPHER_DIR="${CYPHER_DIR}/GitLog" + +echo "importGitLog: Prepare import by creating indexes..." +execute_cypher "${GIT_LOG_CYPHER_DIR}/Index_author_name.cypher" +execute_cypher "${GIT_LOG_CYPHER_DIR}/Index_commit_hash.cypher" +execute_cypher "${GIT_LOG_CYPHER_DIR}/Index_file_name.cypher" + +echo "importGitLog: Deleting all existing git data in the Graph..." +execute_cypher "${GIT_LOG_CYPHER_DIR}/Delete_git_log_data.cypher" + +echo "importGitLog: Importing git log data into the Graph..." +time execute_cypher "${GIT_LOG_CYPHER_DIR}/Import_git_log_csv_data.cypher" + +echo "importGitLog: Creating connections to nodes with matching file names..." +execute_cypher "${GIT_LOG_CYPHER_DIR}/Add_RESOLVES_TO_relationships_to_git_files_for_Java.cypher" +execute_cypher "${GIT_LOG_CYPHER_DIR}/Add_RESOLVES_TO_relationships_to_git_files_for_Typescript.cypher" +execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_number_of_git_commits.cypher" +# --------- \ No newline at end of file diff --git a/scripts/prepareAnalysis.sh b/scripts/prepareAnalysis.sh index 70adeb12c..242c37f17 100644 --- a/scripts/prepareAnalysis.sh +++ b/scripts/prepareAnalysis.sh @@ -2,11 +2,13 @@ # Prepares and validates the graph database before analysis -# Requires executeQueryFunctions.sh, parseCsvFunctions.sh +# Requires executeQueryFunctions.sh, parseCsvFunctions.sh, importGitLog.sh, importAggregatedGitLog.sh # Fail on any error ("-e" = exit on first error, "-o pipefail" exist on errors within piped commands) set -o errexit -o pipefail +IMPORT_GIT_LOG_DATA_IF_SOURCE_IS_PRESENT=${IMPORT_GIT_LOG_DATA_IF_SOURCE_IS_PRESENT:-"full"} # Select how to import git log data. Options: "none", "aggregated", "full". Default="full". + ## Get this "scripts" directory if not already set # Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution. # CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes. @@ -24,10 +26,10 @@ fi CYPHER_DIR=${CYPHER_DIR:-"${SCRIPTS_DIR}/../cypher"} # Repository directory containing the cypher queries echo "prepareAnalysis: CYPHER_DIR=${CYPHER_DIR}" -# Define functions to execute a cypher query from within the given file (first and only argument) +# Define functions (like execute_cypher) to execute a cypher query from within the given file (first and only argument) source "${SCRIPTS_DIR}/executeQueryFunctions.sh" -# Define function(s) (e.g. is_csv_column_greater_zero) to parse CSV format strings from Cypher query results. +# Define functions (like is_csv_column_greater_zero) to parse CSV format strings from Cypher query results.
source "${SCRIPTS_DIR}/parseCsvFunctions.sh" # Local Constants @@ -38,22 +40,26 @@ ARTIFACT_DEPENDENCIES_CYPHER_DIR="$CYPHER_DIR/Artifact_Dependencies" TYPES_CYPHER_DIR="$CYPHER_DIR/Types" TYPESCRIPT_CYPHER_DIR="$CYPHER_DIR/Typescript_Enrichment" -# Preparation - Data verification: DEPENDS_ON releationships +# Preparation - Data verification: DEPENDS_ON relationships dataVerificationResult=$( execute_cypher "${CYPHER_DIR}/Data_verification_DEPENDS_ON_relationships.cypher" "${@}") if ! is_csv_column_greater_zero "${dataVerificationResult}" "sourceNodeCount"; then echo "prepareAnalysis: Error: Data verification failed. At least one DEPENDS_ON relationship required. Check if the artifacts directory is empty or if the scan failed." exit 1 fi +# Preparation - Import git log if source or history is available +if [[ ! ${IMPORT_GIT_LOG_DATA_IF_SOURCE_IS_PRESENT} == "none" ]]; then + if [[ ${IMPORT_GIT_LOG_DATA_IF_SOURCE_IS_PRESENT} == "aggregated" ]]; then + source "${SCRIPTS_DIR}/importAggregatedGitLog.sh" + else + source "${SCRIPTS_DIR}/importGitLog.sh" + fi +fi + # Preparation - Create indices execute_cypher "${CYPHER_DIR}/Create_Java_Type_index_for_full_qualified_name.cypher" execute_cypher "${CYPHER_DIR}/Create_Typescript_index_for_full_qualified_name.cypher" -# Preparation - Create DEPENDS_ON for every DEPENDS_ON_* relationship -# Workaround for https://github.com/jQAssistant/jqa-java-plugin/issues/44 -# execute_cypher "${CYPHER_DIR}/Create_a_DEPENDS_ON_relationship_for_every_DEPENDS_ON_PACKAGE.cypher" -# execute_cypher "${CYPHER_DIR}/Create_a_DEPENDS_ON_relationship_for_every_DEPENDS_ON_ARTIFACT.cypher" - # Preparation - Enrich Graph for Typescript by adding "module" and "name" properties execute_cypher "${TYPESCRIPT_CYPHER_DIR}/Add_name_and_module_properties.cypher" @@ -63,22 +69,22 @@ execute_cypher "${TYPESCRIPT_CYPHER_DIR}/Add_RESOLVES_TO_relationship_for_matchi execute_cypher "${TYPESCRIPT_CYPHER_DIR}/Add_DEPENDS_ON_relationship_to_resolved_modules.cypher" # Preparation - Add weights to Java Package DEPENDS_ON relationships -execute_cypher "${DEPENDS_ON_CYPHER_DIR}/Add_weight_property_for_Java_Interface_Dependencies_to_Package_DEPENDS_ON_Relationship.cypher" -execute_cypher "${DEPENDS_ON_CYPHER_DIR}/Add_weight_property_to_Java_Package_DEPENDS_ON_Relationship.cypher" -execute_cypher "${DEPENDS_ON_CYPHER_DIR}/Add_weight25PercentInterfaces_to_Java_Package_DEPENDS_ON_relationships.cypher" -execute_cypher "${DEPENDS_ON_CYPHER_DIR}/Add_weight10PercentInterfaces_to_Java_Package_DEPENDS_ON_relationships.cypher" +execute_cypher_summarized "${DEPENDS_ON_CYPHER_DIR}/Add_weight_property_for_Java_Interface_Dependencies_to_Package_DEPENDS_ON_Relationship.cypher" +execute_cypher_summarized "${DEPENDS_ON_CYPHER_DIR}/Add_weight_property_to_Java_Package_DEPENDS_ON_Relationship.cypher" +execute_cypher_summarized "${DEPENDS_ON_CYPHER_DIR}/Add_weight25PercentInterfaces_to_Java_Package_DEPENDS_ON_relationships.cypher" +execute_cypher_summarized "${DEPENDS_ON_CYPHER_DIR}/Add_weight10PercentInterfaces_to_Java_Package_DEPENDS_ON_relationships.cypher" # Preparation - Add weights to Typescript Module DEPENDS_ON relationships -execute_cypher "${DEPENDS_ON_CYPHER_DIR}/Add_fine_grained_weights_for_Typescript_external_module_dependencies.cypher" -execute_cypher "${DEPENDS_ON_CYPHER_DIR}/Add_fine_grained_weights_for_Typescript_internal_module_dependencies.cypher" +execute_cypher_summarized "${DEPENDS_ON_CYPHER_DIR}/Add_fine_grained_weights_for_Typescript_external_module_dependencies.cypher" +execute_cypher_summarized 
"${DEPENDS_ON_CYPHER_DIR}/Add_fine_grained_weights_for_Typescript_internal_module_dependencies.cypher" # Preparation - Add Typescript Module node properties "incomingDependencies" and "outgoingDependencies" -execute_cypher "${METRICS_CYPHER_DIR}/Set_Incoming_Typescript_Module_Dependencies.cypher" -execute_cypher "${METRICS_CYPHER_DIR}/Set_Outgoing_Typescript_Module_Dependencies.cypher" +execute_cypher_summarized "${METRICS_CYPHER_DIR}/Set_Incoming_Typescript_Module_Dependencies.cypher" +execute_cypher_summarized "${METRICS_CYPHER_DIR}/Set_Outgoing_Typescript_Module_Dependencies.cypher" # Preparation - Add Java Package node properties "incomingDependencies" and "outgoingDependencies" -execute_cypher "${METRICS_CYPHER_DIR}/Set_Incoming_Java_Package_Dependencies.cypher" -execute_cypher "${METRICS_CYPHER_DIR}/Set_Outgoing_Java_Package_Dependencies.cypher" +execute_cypher_summarized "${METRICS_CYPHER_DIR}/Set_Incoming_Java_Package_Dependencies.cypher" +execute_cypher_summarized "${METRICS_CYPHER_DIR}/Set_Outgoing_Java_Package_Dependencies.cypher" # Preparation - Label external types and annotations # "external" means that there is no byte code available, not a primitive type and not a java type @@ -92,11 +98,11 @@ execute_cypher "${EXTERNAL_DEPENDENCIES_CYPHER_DIR}/Remove_external_type_and_ann execute_cypher "${EXTERNAL_DEPENDENCIES_CYPHER_DIR}/Label_external_types_and_annotations.cypher" # Preparation - Add Java Artifact node properties "incomingDependencies" and "outgoingDependencies" -execute_cypher "${ARTIFACT_DEPENDENCIES_CYPHER_DIR}/Incoming_Java_Artifact_Dependencies.cypher" -execute_cypher "${ARTIFACT_DEPENDENCIES_CYPHER_DIR}/Outgoing_Java_Artifact_Dependencies.cypher" +execute_cypher_summarized "${ARTIFACT_DEPENDENCIES_CYPHER_DIR}/Incoming_Java_Artifact_Dependencies.cypher" +execute_cypher_summarized "${ARTIFACT_DEPENDENCIES_CYPHER_DIR}/Outgoing_Java_Artifact_Dependencies.cypher" # Preparation - Add Java Type node properties "incomingDependencies" and "outgoingDependencies" -execute_cypher "${METRICS_CYPHER_DIR}/Set_Incoming_Java_Type_Dependencies.cypher" -execute_cypher "${METRICS_CYPHER_DIR}/Set_Outgoing_Java_Type_Dependencies.cypher" +execute_cypher_summarized "${METRICS_CYPHER_DIR}/Set_Incoming_Java_Type_Dependencies.cypher" +execute_cypher_summarized "${METRICS_CYPHER_DIR}/Set_Outgoing_Java_Type_Dependencies.cypher" echo "prepareAnalysis: Preparation successful" \ No newline at end of file