From e19553836732a49218ad94450df063e65978dc58 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Thu, 7 Aug 2025 21:41:24 +0200 Subject: [PATCH 1/5] Calculate association rule metrics for co-changing files --- README.md | 1 + ...HER_WITH_relationships_to_git_files.cypher | 105 ++++++++++++++---- ...it_files_that_were_changed_together.cypher | 2 +- ..._changed_together_with_another_file.cypher | 7 +- ...ise_changed_files_with_dependencies.cypher | 35 ++---- ...number_of_git_plugin_update_commits.cypher | 11 ++ ...ng_CHANGED_TOGETHER_WITH_properties.cypher | 6 + jupyter/GitHistoryGeneral.ipynb | 38 ++++++- scripts/importGit.sh | 17 +-- 9 files changed, 161 insertions(+), 61 deletions(-) create mode 100644 cypher/GitLog/Set_number_of_git_plugin_update_commits.cypher create mode 100644 cypher/GitLog/Verify_git_missing_CHANGED_TOGETHER_WITH_properties.cypher diff --git a/README.md b/README.md index f002c6147..b2f3562d7 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,7 @@ Contained within this repository is a comprehensive and automated code graph ana ### :newspaper: News +- August 2025: Association rule learning for co-changing files in git history - August 2025: Anomaly detection powered by unsupervised machine learning and explainable AI - May 2025: Migrated to [Neo4j 2025.x](https://neo4j.com/docs/upgrade-migration-guide/current/version-2025/upgrade) and Java 21. 
diff --git a/cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher b/cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher index 362766b55..6d7474a62 100644 --- a/cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher +++ b/cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher @@ -1,45 +1,104 @@ // Connect git files that where changed together frequently with "CHANGED_TOGETHER_WITH" -MATCH (global_git_commit:Git:Commit) - WITH count(global_git_commit) AS globalCommitCount +// Determine global file count, global file count threshold (filter out refactoring commits) and global update commits +MATCH (git_commit_global:Git:Commit)-[:CONTAINS_CHANGE]->(:Git:Change)-[:UPDATES]->(git_file_global:Git:File) +WHERE git_file_global.deletedAt IS NULL + WITH git_commit_global, count(DISTINCT git_file_global) AS commitFileCount + WITH percentileDisc(commitFileCount, 0.95) AS globalFileCountThreshold + ,count(git_commit_global) AS globalUpdateCommitCount +// Main section MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change)-[:UPDATES]->(git_file:Git:File) MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file) WHERE git_file.deletedAt IS NULL // Order files to assure, that pairs of distinct files are grouped together (fileA, fileB) without (fileB, fileA) ORDER BY git_commit.sha, git_file.relativePath - WITH globalCommitCount + WITH globalFileCountThreshold + ,globalUpdateCommitCount ,git_commit.sha AS commitHash ,collect(DISTINCT git_file) AS filesInCommit // Limit the file count to min. 2 (changed together) and // max. 
50 (reduce permutations, improve performance, filter out large refactorings that usually affect many files) WHERE size(filesInCommit) >= 2 - AND size(filesInCommit) <= 50 + AND size(filesInCommit) <= globalFileCountThreshold // Collect distinct pairwise (..., 2, 2) combinations of all files in the list - WITH globalCommitCount + WITH globalFileCountThreshold + ,globalUpdateCommitCount ,commitHash ,apoc.coll.combinations(filesInCommit, 2, 2) AS fileCombinations UNWIND fileCombinations AS fileCombination - WITH globalCommitCount + WITH globalFileCountThreshold + ,globalUpdateCommitCount ,fileCombination - ,count(DISTINCT commitHash) AS commitCount - ,collect(DISTINCT commitHash) AS commitHashes + ,count(DISTINCT commitHash) AS updateCommitCount + ,collect(DISTINCT commitHash) AS updateCommitHashes +// Deactivated: // Filter out file pairs that where changed not very often together // In detail: More than 0.1 per mille compared to overall commit count -WHERE commitCount > globalCommitCount * 0.001 - WITH fileCombination[0] AS firstFile +// WHERE updateCommitCount > globalUpdateCommitCount * 0.001 + WITH * + ,fileCombination[0] AS firstFile ,fileCombination[1] AS secondFile - ,commitCount - ,commitHashes -// Create the new relationship "CHANGED_TOGETHER_WITH" and set the property "commitCount" on it - CALL (firstFile, secondFile, commitCount, commitHashes) { + WITH * + // Get the lowest number of git update commits of both files (file pair) + ,CASE WHEN firstFile.updateCommitCount < secondFile.updateCommitCount + THEN firstFile.updateCommitCount + ELSE secondFile.updateCommitCount + END AS minUpdateCommitCount + // Calculate update commit support by dividing the update commit count by the overall commit count for both files + ,toFloat(firstFile.updateCommitCount) / globalUpdateCommitCount AS firstFileUpdateSupport + ,toFloat(secondFile.updateCommitCount) / globalUpdateCommitCount AS secondFileUpdateSupport + WITH * + // Expected likelihood that the first and the second 
file change together given complete randomness + ,firstFileUpdateSupport * secondFileUpdateSupport AS expectedCoUpdateSupport + WITH firstFile + ,secondFile + ,updateCommitHashes + ,updateCommitCount + // Out of all the times the less frequently changed file was touched, how often did it co-occur with the other file? + ,toFloat(updateCommitCount) / minUpdateCommitCount AS updateCommitMinConfidence + // Compared to all commits in general, how high is the percentage of the commits where both files changed together? + ,toFloat(updateCommitCount) / globalUpdateCommitCount AS updateCommitSupport + // Lift + ,toFloat(updateCommitCount) / (globalUpdateCommitCount * expectedCoUpdateSupport) AS updateCommitLift + // Jaccard Similarity: Of all commits involving either file, how many involved both? + ,toFloat(updateCommitCount) / (firstFile.updateCommitCount + secondFile.updateCommitCount - updateCommitCount) AS updateCommitJaccardSimilarity +// Create the new relationship "CHANGED_TOGETHER_WITH" and set the property "updateCommitCount" on it + CALL (firstFile, secondFile, updateCommitCount, updateCommitHashes, updateCommitMinConfidence, updateCommitSupport, updateCommitLift, updateCommitJaccardSimilarity) { MERGE (firstFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile) - SET pairwiseChange.commitCount = commitCount - ,pairwiseChange.commitHashes = commitHashes - } IN TRANSACTIONS + SET pairwiseChange.updateCommitCount = updateCommitCount + ,pairwiseChange.updateCommitHashes = updateCommitHashes + ,pairwiseChange.updateCommitMinConfidence = updateCommitMinConfidence + ,pairwiseChange.updateCommitSupport = updateCommitSupport + ,pairwiseChange.updateCommitLift = updateCommitLift + ,pairwiseChange.updateCommitJaccardSimilarity = updateCommitJaccardSimilarity + } IN TRANSACTIONS OF 500 ROWS // Return one row with some statistics about the found pairs and their commit counts -RETURN max(commitCount) AS maxCommitCount - ,avg(commitCount) AS avgCommitCount - 
,percentileDisc(commitCount, 0.5) AS percentile50CommitCount - ,percentileDisc(commitCount, 0.9) AS percentile90CommitCount - ,percentileDisc(commitCount, 0.95) AS percentile95CommitCount - ,count(*) AS pairCount \ No newline at end of file +RETURN count(*) AS pairCount + + ,min(updateCommitCount) AS minCommitCount + ,max(updateCommitCount) AS maxCommitCount + ,avg(updateCommitCount) AS avgCommitCount + ,percentileDisc(updateCommitCount, 0.5) AS percentile50CommitCount + ,percentileDisc(updateCommitCount, 0.9) AS percentile90CommitCount + ,percentileDisc(updateCommitCount, 0.95) AS percentile95CommitCount + + ,min(updateCommitMinConfidence) AS minMinConfidence + ,max(updateCommitMinConfidence) AS maxMinConfidence + ,avg(updateCommitMinConfidence) AS avgMinConfidence + ,percentileDisc(updateCommitMinConfidence, 0.5) AS percentile50MinConfidence + ,percentileDisc(updateCommitMinConfidence, 0.9) AS percentile90MinConfidence + ,percentileDisc(updateCommitMinConfidence, 0.95) AS percentile95MinConfidence + + ,min(updateCommitLift) AS minLift + ,max(updateCommitLift) AS maxLift + ,avg(updateCommitLift) AS avgLift + ,percentileDisc(updateCommitLift, 0.5) AS percentile50Lift + ,percentileDisc(updateCommitLift, 0.9) AS percentile90Lift + ,percentileDisc(updateCommitLift, 0.95) AS percentile95Lift + + ,min(updateCommitJaccardSimilarity) AS minJaccardSimilarity + ,max(updateCommitJaccardSimilarity) AS maxJaccardSimilarity + ,avg(updateCommitJaccardSimilarity) AS avgJaccardSimilarity + ,percentileDisc(updateCommitJaccardSimilarity, 0.5) AS percentile50JaccardSimilarity + ,percentileDisc(updateCommitJaccardSimilarity, 0.9) AS percentile90JaccardSimilarity + ,percentileDisc(updateCommitJaccardSimilarity, 0.95) AS percentile95JaccardSimilarity \ No newline at end of file diff --git a/cypher/GitLog/List_git_files_that_were_changed_together.cypher b/cypher/GitLog/List_git_files_that_were_changed_together.cypher index f19cf4efb..b82dc30d1 100644 --- 
a/cypher/GitLog/List_git_files_that_were_changed_together.cypher +++ b/cypher/GitLog/List_git_files_that_were_changed_together.cypher @@ -6,5 +6,5 @@ MATCH (gitRepository:Git&Repository)-[:HAS_FILE]->(firstGitFile) MATCH (gitRepository:Git&Repository)-[:HAS_FILE]->(secondGitFile) RETURN gitRepository.name + '/' + firstGitFile.relativePath AS firstFile ,gitRepository.name + '/' + secondGitFile.relativePath AS secondFile - ,gitChange.commitCount AS commitCount + ,gitChange.updateCommitCount AS commitCount ORDER BY commitCount DESC diff --git a/cypher/GitLog/List_git_files_that_were_changed_together_with_another_file.cypher b/cypher/GitLog/List_git_files_that_were_changed_together_with_another_file.cypher index 16f8458dd..13f643649 100644 --- a/cypher/GitLog/List_git_files_that_were_changed_together_with_another_file.cypher +++ b/cypher/GitLog/List_git_files_that_were_changed_together_with_another_file.cypher @@ -3,6 +3,11 @@ MATCH (firstGitFile:Git&File&!Repository)-[gitChange:CHANGED_TOGETHER_WITH]-(secondGitFile:Git&File&!Repository) MATCH (gitRepository:Git&Repository)-[:HAS_FILE]->(firstGitFile) UNWIND gitChange.commitHashes AS commitHash -RETURN gitRepository.name + '/' + firstGitFile.relativePath AS filePath + WITH gitRepository.name + '/' + firstGitFile.relativePath AS filePath ,count(DISTINCT commitHash) AS commitCount + ,sum(firstGitFile.updateCommitCount) AS fileUpdateCount + WITH * + // Out of all the times the file was touched, how often did it co-occur with other files? 
+ ,CASE WHEN fileUpdateCount > 0 THEN toFloat(commitCount) / fileUpdateCount ELSE 0.0 END AS coChangeRate +RETURN filePath, commitCount, coChangeRate ORDER BY commitCount DESC \ No newline at end of file diff --git a/cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher b/cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher index b4c66604e..b780fcb91 100644 --- a/cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher +++ b/cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher @@ -2,38 +2,17 @@ MATCH (firstCodeFile:File)-[dependency:DEPENDS_ON]->(secondCodeFile:File) MATCH (firstCodeFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondCodeFile) -//De-duplicating the pairs of files isn't necessary, because the dependency relation is directed. -//WHERE elementId(firstCodeFile) < elementId(secondCodeFile) - WITH firstCodeFile.fileName AS firstFileName - ,secondCodeFile.fileName AS secondFileName +WHERE elementId(firstCodeFile) < elementId(secondCodeFile) + WITH firstCodeFile.fileName AS firstFileName + ,secondCodeFile.fileName AS secondFileName ,coalesce(dependency.weight, dependency.cardinality) AS dependencyWeight - ,pairwiseChange.commitCount AS commitCount + ,pairwiseChange.updateCommitCount AS commitCount + ,pairwiseChange.updateCommitMinConfidence AS updateCommitMinConfidence ,dependency.fileDistanceAsFewestChangeDirectoryCommands AS fileDistanceAsFewestChangeDirectoryCommands RETURN dependencyWeight ,commitCount + ,updateCommitMinConfidence ,fileDistanceAsFewestChangeDirectoryCommands // ,count(*) AS occurrences // ,collect(firstFileName + ' -> ' + secondFileName)[0..3] AS examples -ORDER BY dependencyWeight, commitCount - -// MATCH (firstCodeFile:File)-[dependency:DEPENDS_ON]->(secondCodeFile:File) -// MATCH (firstCodeFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondCodeFile) -// WHERE elementId(firstCodeFile) < elementId(secondCodeFile) -// RETURN firstCodeFile.fileName AS firstFileName -// 
,secondCodeFile.fileName AS secondFileName -// ,dependency.weight AS dependencyWeight -// ,pairwiseChange.commitCount AS commitCount -// ORDER BY dependencyWeight, commitCount - -// MATCH (g1:!Git&File)-[relation:CHANGED_TOGETHER_WITH|DEPENDS_ON]-(g2:!Git&File) -// WITH count(DISTINCT relation) AS relatedFilesCount -// ,collect(DISTINCT relation) AS relations -// UNWIND relations AS relation -// WITH relatedFilesCount -// ,coalesce(relation.commitCount, 0) AS commitCount -// ,coalesce(relation.weight, 0) AS dependencyWeight -// ,coalesce(relation.fileDistanceAsFewestChangeDirectoryCommands, 0) AS fileDistanceAsFewestChangeDirectoryCommands -// RETURN dependencyWeight -// ,commitCount -// ,fileDistanceAsFewestChangeDirectoryCommands -// ORDER BY dependencyWeight, commitCount +ORDER BY dependencyWeight, commitCount \ No newline at end of file diff --git a/cypher/GitLog/Set_number_of_git_plugin_update_commits.cypher b/cypher/GitLog/Set_number_of_git_plugin_update_commits.cypher new file mode 100644 index 000000000..68a10a416 --- /dev/null +++ b/cypher/GitLog/Set_number_of_git_plugin_update_commits.cypher @@ -0,0 +1,11 @@ +// Set updateCommitCount property on Git File nodes when git commits with Update modifier (detected by the plugin) are present + +MATCH (git_file:File&Git)<-[:UPDATES]-(:Git&Change)<-[:CONTAINS_CHANGE]-(git_commit:Git&Commit) +WHERE git_file.deletedAt IS NULL + WITH git_file, count(DISTINCT git_commit.sha) AS updateCommitCount + SET git_file.updateCommitCount = updateCommitCount + WITH git_file, updateCommitCount +MATCH (code_file:File&!Git)<-[:RESOLVES_TO]-(git_file) + SET code_file.updateCommitCount = updateCommitCount +RETURN count(DISTINCT code_file) AS codeFileUpdates + ,collect(DISTINCT code_file.name)[0..4] AS codeFileExample \ No newline at end of file diff --git a/cypher/GitLog/Verify_git_missing_CHANGED_TOGETHER_WITH_properties.cypher b/cypher/GitLog/Verify_git_missing_CHANGED_TOGETHER_WITH_properties.cypher new file mode 100644 index 
000000000..31e9e3e1b --- /dev/null +++ b/cypher/GitLog/Verify_git_missing_CHANGED_TOGETHER_WITH_properties.cypher @@ -0,0 +1,6 @@ +// Verify if CHANGED_TOGETHER_WITH properties from git are missing + +MATCH (firstCodeFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondCodeFile) +RETURN (pairwiseChange.updateCommitCount IS NULL) AS updateCommitCountMissing + ,(pairwiseChange.updateCommitMinConfidence IS NULL) AS updateCommitMinConfidenceMissing + ,count(*) \ No newline at end of file diff --git a/jupyter/GitHistoryGeneral.ipynb b/jupyter/GitHistoryGeneral.ipynb index b8b6aa112..8a2ea19bb 100644 --- a/jupyter/GitHistoryGeneral.ipynb +++ b/jupyter/GitHistoryGeneral.ipynb @@ -1198,6 +1198,7 @@ "pairwise_changed_git_files = pairwise_changed_git_files.groupby(['directoryPath']).aggregate(\n", " pairwiseChangeCommitCount=pd.NamedAgg(column=\"commitCount\", aggfunc=\"sum\"),\n", " pairwiseChangeFileCount=pd.NamedAgg(column=\"filePath\", aggfunc=\"count\"),\n", + " pairwiseChangeAverageRate=pd.NamedAgg(column=\"coChangeRate\", aggfunc=\"mean\"),\n", ")\n", "pairwise_changed_git_files.reset_index(inplace=True)\n", "\n", @@ -1220,6 +1221,7 @@ "\n", "pairwise_changed_git_files['pairwiseChangeCommitCount'] = pairwise_changed_git_files['pairwiseChangeCommitCount'].fillna(0).astype(int)\n", "pairwise_changed_git_files['pairwiseChangeFileCount'] = pairwise_changed_git_files['pairwiseChangeFileCount'].fillna(0).astype(int)\n", + "pairwise_changed_git_files['pairwiseChangeAverageRate'] = pairwise_changed_git_files['pairwiseChangeAverageRate'].fillna(0).astype(float)\n", "pairwise_changed_git_files.reset_index(inplace=True)\n", "\n", "# Debug\n", @@ -1399,7 +1401,13 @@ " display(pearsonr(pairwise_changed_git_files_with_dependencies['commitCount'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))\n", "\n", " display(\"Spearman Correlation with p-value for commitCount and dependencyWeight\")\n", - " 
display(spearmanr(pairwise_changed_git_files_with_dependencies['commitCount'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))" + " display(spearmanr(pairwise_changed_git_files_with_dependencies['commitCount'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))\n", + "\n", + " display(\"Pearson Correlation with p-value for updateCommitMinConfidence and dependencyWeight\")\n", + " display(pearsonr(pairwise_changed_git_files_with_dependencies['updateCommitMinConfidence'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))\n", + "\n", + " display(\"Spearman Correlation with p-value for updateCommitMinConfidence and dependencyWeight\")\n", + " display(spearmanr(pairwise_changed_git_files_with_dependencies['updateCommitMinConfidence'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))" ] }, { @@ -1431,6 +1439,34 @@ " figure.write_image(**get_plotly_figure_write_image_settings(\"PairwiseChangedFilesVsDependencyWeight\"))" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "75264b82", + "metadata": {}, + "outputs": [], + "source": [ + "# Scatter plot of all pairs of files with their min confidence (normalized update commit count) on the x axis and dependency weight on the y axis\n", + "\n", + "if pairwise_changed_git_files_with_dependencies.empty:\n", + " print(\"No data to plot\")\n", + "else:\n", + " figure = plotly_graph_objects.Figure(plotly_graph_objects.Scatter(\n", + " x=pairwise_changed_git_files_with_dependencies['updateCommitMinConfidence'], \n", + " y=pairwise_changed_git_files_with_dependencies['dependencyWeight'],\n", + " mode='markers',\n", + " ))\n", + " figure.update_layout(\n", + " **plotly_bar_layout_base_settings,\n", + " title='Pairwise changed files: Min confidence co-change rate vs. 
dependency weight',\n", + " xaxis_title='co-change rate (min confidence, normalized update commit count)',\n", + " yaxis_title='dependency weight',\n", + " )\n", + " figure.show(**plotly_treemap_figure_show_settings)\n", + " if is_command_line_execution():\n", + " figure.write_image(**get_plotly_figure_write_image_settings(\"PairwiseChangedFilesVsDependencyWeight\"))" + ] + }, { "cell_type": "markdown", "id": "14e87aff", diff --git a/scripts/importGit.sh b/scripts/importGit.sh index 7ed281310..708a5b496 100755 --- a/scripts/importGit.sh +++ b/scripts/importGit.sh @@ -7,6 +7,7 @@ # Note: This script needs the path to source directory that contains one or more git repositories. It defaults to SOURCE_DIRECTORY ("source"). # Note: Import will be skipped without an error if the source directory doesn't any git repositories. # Note: This script needs git to be installed. +# Note: IMPORT_GIT_LOG_DATA_IF_SOURCE_IS_PRESENT="plugin" is default and recommended. The other options "aggregated" and "full" are not actively maintained anymore. # Fail on any error ("-e" = exit on first error, "-o pipefail" exist on errors within piped commands) set -o errexit -o pipefail @@ -134,13 +135,14 @@ commonPostGitImport() { echo "importGit: Running verification queries for troubleshooting (non failing)..." execute_cypher "${GIT_LOG_CYPHER_DIR}/Verify_git_to_code_file_unambiguous.cypher" execute_cypher "${GIT_LOG_CYPHER_DIR}/Verify_code_to_git_file_unambiguous.cypher" + execute_cypher "${GIT_LOG_CYPHER_DIR}/Verify_git_missing_CHANGED_TOGETHER_WITH_properties.cypher" } postGitLogImport() { - commonPostGitImport - echo "importGit: Add numberOfGitCommits property to nodes with matching file names..." 
execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_number_of_git_log_commits.cypher" + + commonPostGitImport } postGitPluginImport() { @@ -149,24 +151,25 @@ postGitPluginImport() { # TODO: The deletion of all plain files in the "/.git" directory is needed # until there is a way to exclude all files inside a directory # while still being able to get them analyzed by the git plugin. - # This would most likely be solved with https://github.com/jQAssistant/jqassistant/issues/410 execute_cypher "${GIT_LOG_CYPHER_DIR}/Delete_plain_git_directory_file_nodes.cypher" execute_cypher "${GIT_LOG_CYPHER_DIR}/Index_commit_sha.cypher" execute_cypher "${GIT_LOG_CYPHER_DIR}/Index_file_name.cypher" execute_cypher "${GIT_LOG_CYPHER_DIR}/Index_file_relative_path.cypher" execute_cypher "${GIT_LOG_CYPHER_DIR}/Index_absolute_file_name.cypher" - commonPostGitImport - echo "importGit: Add numberOfGitCommits property to nodes with matching file names..." execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_number_of_git_plugin_commits.cypher" -} + echo "importGit: Add updateCommitCount property to file nodes and code nodes with matching file names..." + execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_number_of_git_plugin_update_commits.cypher" -postAggregatedGitLogImport() { commonPostGitImport +} +postAggregatedGitLogImport() { echo "importGit: Add numberOfGitCommits property to nodes with matching file names..." execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_number_of_aggregated_git_commits.cypher" + + commonPostGitImport } # Create import directory in case it doesn't exist. 
From e992d85937161ed7db2f44c94be392dec916da16 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Wed, 13 Aug 2025 08:33:34 +0200 Subject: [PATCH 2/5] Split experimental git history reports from general ones --- ...ise_changed_files_with_dependencies.cypher | 14 +- jupyter/GitHistoryExploration.ipynb | 427 ++++++++++++++++++ jupyter/GitHistoryGeneral.ipynb | 142 ------ 3 files changed, 437 insertions(+), 146 deletions(-) create mode 100644 jupyter/GitHistoryExploration.ipynb diff --git a/cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher b/cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher index b780fcb91..83562bb58 100644 --- a/cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher +++ b/cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher @@ -6,13 +6,19 @@ WHERE elementId(firstCodeFile) < elementId(secondCodeFile) WITH firstCodeFile.fileName AS firstFileName ,secondCodeFile.fileName AS secondFileName ,coalesce(dependency.weight, dependency.cardinality) AS dependencyWeight - ,pairwiseChange.updateCommitCount AS commitCount - ,pairwiseChange.updateCommitMinConfidence AS updateCommitMinConfidence - ,dependency.fileDistanceAsFewestChangeDirectoryCommands AS fileDistanceAsFewestChangeDirectoryCommands + ,dependency.fileDistanceAsFewestChangeDirectoryCommands AS fileDistance + ,pairwiseChange.updateCommitCount AS commitCount + ,pairwiseChange.updateCommitMinConfidence AS updateCommitMinConfidence + ,pairwiseChange.updateCommitSupport AS updateCommitSupport + ,pairwiseChange.updateCommitLift AS updateCommitLift + ,pairwiseChange.updateCommitJaccardSimilarity AS updateCommitJaccardSimilarity RETURN dependencyWeight + ,fileDistance ,commitCount ,updateCommitMinConfidence - ,fileDistanceAsFewestChangeDirectoryCommands + ,updateCommitSupport + ,updateCommitLift + ,updateCommitJaccardSimilarity // ,count(*) AS occurrences // ,collect(firstFileName + ' -> ' + secondFileName)[0..3] AS 
examples ORDER BY dependencyWeight, commitCount \ No newline at end of file diff --git a/jupyter/GitHistoryExploration.ipynb b/jupyter/GitHistoryExploration.ipynb new file mode 100644 index 000000000..b57b96949 --- /dev/null +++ b/jupyter/GitHistoryExploration.ipynb @@ -0,0 +1,427 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "2f0eabc4", + "metadata": {}, + "source": [ + "# git log/history\n", + "
\n", + "\n", + "### References\n", + "- [Visualizing Code: Polyglot Notebooks Repository (YouTube)](https://youtu.be/ipOpToPS-PY?si=3doePt2cp-LgEUmt)\n", + "- [gitstractor (GitHub)](https://github.com/IntegerMan/gitstractor)\n", + "- [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver/current)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4191f259", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import numpy as np\n", + "import pandas as pd\n", + "from scipy.stats import pearsonr, spearmanr\n", + "import matplotlib.pyplot as plot\n", + "from matplotlib.colors import ListedColormap\n", + "from neo4j import GraphDatabase" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c5dab37", + "metadata": {}, + "outputs": [], + "source": [ + "# Please set the environment variable \"NEO4J_INITIAL_PASSWORD\" in your shell \n", + "# before starting jupyter notebook to provide the password for the user \"neo4j\". \n", + "# It is not recommended to hardcode the password into jupyter notebook for security reasons.\n", + "\n", + "driver = GraphDatabase.driver(uri=\"bolt://localhost:7687\", auth=(\"neo4j\", os.environ.get(\"NEO4J_INITIAL_PASSWORD\")))\n", + "driver.verify_connectivity()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1db254b", + "metadata": {}, + "outputs": [], + "source": [ + "def get_cypher_query_from_file(cypher_file_name : str):\n", + " with open(cypher_file_name) as file:\n", + " return ' '.join(file.readlines())\n", + "\n", + "\n", + "def query_cypher_to_data_frame(filename : str, limit: int = -1):\n", + " \"\"\"\n", + " Execute the Cypher query of the given file and returns the result.\n", + " filename : str : The name of the file containing the Cypher query\n", + " limit : int : The optional limit of rows to optimize the query. 
Default = -1 = no limit\n", + " \"\"\"\n", + " cypher_query = get_cypher_query_from_file(filename)\n", + " if limit > 0:\n", + " cypher_query = \"{query}\\nLIMIT {row_limit}\".format(query = cypher_query, row_limit = limit)\n", + " records, summary, keys = driver.execute_query(cypher_query)\n", + " return pd.DataFrame([r.values() for r in records], columns=keys)\n", + "\n", + "\n", + "def query_first_non_empty_cypher_to_data_frame(*filenames : str, limit: int = -1):\n", + " \"\"\"\n", + " Executes the Cypher queries of the given files and returns the first result that is not empty.\n", + " If all given file names result in empty results, the last (empty) result will be returned.\n", + " By additionally specifying \"limit=\" the \"LIMIT\" keyword will be appended to the query so that only the first results get returned.\n", + " \"\"\" \n", + " result=pd.DataFrame()\n", + " for filename in filenames:\n", + " result=query_cypher_to_data_frame(filename, limit)\n", + " if not result.empty:\n", + " return result\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a56670c9", + "metadata": {}, + "outputs": [], + "source": [ + "#The following cell uses the built-in %html \"magic\" to override the CSS style for tables to a much smaller size.\n", + "#This is especially needed for PDF export of tables with multiple columns." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "006b9dc8", + "metadata": {}, + "outputs": [], + "source": [ + "%%html\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d565f2ed", + "metadata": {}, + "outputs": [], + "source": [ + "# Main Colormap\n", + "# main_color_map = 'nipy_spectral'\n", + "main_color_map = 'viridis'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6323e85e", + "metadata": {}, + "outputs": [], + "source": [ + "# Pandas DataFrame Display Configuration\n", + "pd.set_option('display.max_colwidth', 500)" + ] + }, + { + "cell_type": "markdown", + "id": "c15669ef", + "metadata": {}, + "source": [ + "## Pairwise Changed Files vs. Dependency Weight\n", + "\n", + "This section explores the correlation between how often pairs of files are changed together (common commit count) and their dependency weight. Note that these results should be interpreted cautiously, as comparing pairwise changes and dependencies is inherently challenging.\n", + "\n", + "### Considerations\n", + "- **Historical vs. Current State**: Pairwise changes reflect the entire git history, while dependency weight represents the current state of the codebase.\n", + "- **Commit Granularity**: Developers may use different commit strategies, such as squashing changes into a single commit or creating fine-grained commits. Ideally, each commit should represent a single semantic change for accurate analysis.\n", + "- **Dependency Representation**: Some file types (e.g., Java files with import statements) clearly define dependencies, while others (e.g., shell scripts, XML, YAML) lack explicit dependency relationships.\n", + "- **Repository Characteristics**: Repositories with generated code may have many large commits, while stabilized repositories may only update configuration files for dependency changes." 
+ ] + }, + { + "cell_type": "markdown", + "id": "98a2feea", + "metadata": {}, + "source": [ + "#### Data Preview" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a067f8e6", + "metadata": {}, + "outputs": [], + "source": [ + "pairwise_changed_git_files_with_dependencies = query_cypher_to_data_frame(\"../cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher\")\n", + "pairwise_changed_git_files_with_dependencies.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "01db2db9", + "metadata": {}, + "source": [ + "#### Data Statistics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9fe48db8", + "metadata": {}, + "outputs": [], + "source": [ + "display(\"Pairwise changed git files compared to dependency weights - Overall statistics\")\n", + "display(pairwise_changed_git_files_with_dependencies.describe())\n", + "\n", + "# The correlation matrix plot can be found further below\n", + "# display(\"Pairwise changed git files compared to dependency weights - Pearson Correlation\")\n", + "# display(pairwise_changed_git_files_with_dependencies.corr(method='pearson'))\n", + "\n", + "# display(\"Pairwise changed git files compared to dependency weights - Spearman Correlation\")\n", + "# display(pairwise_changed_git_files_with_dependencies.corr(method='spearman'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a4ae651", + "metadata": {}, + "outputs": [], + "source": [ + "if pairwise_changed_git_files_with_dependencies.shape[0] < 5:\n", + " print(\"Less than 5 samples are not enough to calculate p-values\")\n", + "else:\n", + " display(\"Pearson Correlation with p-value for commitCount and dependencyWeight\")\n", + " display(pearsonr(pairwise_changed_git_files_with_dependencies['commitCount'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))\n", + "\n", + " display(\"Spearman Correlation with p-value for commitCount and dependencyWeight\")\n", + " 
display(spearmanr(pairwise_changed_git_files_with_dependencies['commitCount'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))\n", + "\n", + " display(\"Pearson Correlation with p-value for updateCommitMinConfidence and dependencyWeight\")\n", + " display(pearsonr(pairwise_changed_git_files_with_dependencies['updateCommitMinConfidence'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))\n", + "\n", + " display(\"Spearman Correlation with p-value for updateCommitMinConfidence and dependencyWeight\")\n", + " display(spearmanr(pairwise_changed_git_files_with_dependencies['updateCommitMinConfidence'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32125058", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_correlation_matrix(correlation_matrix: pd.DataFrame, title_suffix: str = \"\") -> None:\n", + " \"\"\"\n", + " Plots the given correlation matrix as a heat map annotated with the correlation values.\n", + " \n", + " :param correlation_matrix: DataFrame containing the pairwise correlation coefficients to plot.\n", + " :param title_suffix: Optional text appended to the plot title, e.g. the correlation method.\n", + " \"\"\"\n", + " figure, axis = plot.subplots(figsize=(6, 6))\n", + " color_axis = axis.matshow(correlation_matrix, cmap=\"coolwarm\")\n", + " figure.colorbar(color_axis)\n", + " axis.set_xticks(range(len(correlation_matrix.columns)))\n", + " axis.set_yticks(range(len(correlation_matrix.index)))\n", + " axis.set_xticklabels(correlation_matrix.columns, rotation=90, fontsize=8)\n", + " axis.set_yticklabels(correlation_matrix.index, fontsize=8)\n", + " for (i, j), correlation_value in np.ndenumerate(correlation_matrix.values):\n", + " axis.text(j, i, f\"{correlation_value:.2f}\", ha='center', va='center', color='black', fontsize=8, bbox=dict(facecolor='white', alpha=0.2, edgecolor='none'))\n", + " plot.title(f\"Correlation Matrix 
{title_suffix}\", fontsize=10)\n", + " plot.tight_layout()\n", + " plot.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a375279", + "metadata": {}, + "outputs": [], + "source": [ + "plot_correlation_matrix(pairwise_changed_git_files_with_dependencies.corr(method=\"pearson\"), \"(Pearson)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9efa7d61", + "metadata": {}, + "outputs": [], + "source": [ + "plot_correlation_matrix(pairwise_changed_git_files_with_dependencies.corr(method=\"spearman\"), \"(Spearman)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f17b39b4", + "metadata": {}, + "outputs": [], + "source": [ + "def pvalue_matrix(data: pd.DataFrame) -> pd.DataFrame:\n", + " columns = data.columns\n", + " # Fill the diagonal with value 1.0. \n", + " # A p-value for 2 identical values would lead to warnings and not reveal any valuable insights.\n", + " p_values = pd.DataFrame(np.ones((len(columns), len(columns))), columns=columns, index=columns)\n", + "\n", + " for i in range(len(columns)):\n", + " for j in range(i+1, len(columns)):\n", + " _, p_value = pearsonr(data[columns[i]], data[columns[j]])\n", + " if np.isnan(p_value): # replace nan with 1.0 = no significance\n", + " p_value = 1.0\n", + " if np.isclose(p_value, 0.0, rtol=1e-15, atol=1e-15): # replace values too close to zero by 1 = no significance\n", + " p_value = 1.0\n", + " p_values.iloc[i, j] = p_value\n", + " p_values.iloc[j, i] = p_value\n", + "\n", + " return p_values\n", + "\n", + "def plot_p_value_matrix(p_value_matrix: pd.DataFrame):\n", + "\n", + " # Map values to 0 (green) and 1 (white)\n", + " data_for_plot = np.where(p_value_matrix < 0.05, 0, 1)\n", + "\n", + " # Make a colormap: green for low p-values, white for others\n", + " color_map = ListedColormap([\"limegreen\", \"white\"])\n", + "\n", + " # Plot heatmap\n", + " figure, axis = plot.subplots(figsize=(6, 5))\n", + " image = axis.imshow(data_for_plot, 
cmap=color_map, vmin=0, vmax=1)\n", + "\n", + " # Add colorbar\n", + " # color_bar = plot.colorbar(image, ax=axis)\n", + " # color_bar.set_label(\"p-value\")\n", + "\n", + " # Show all ticks\n", + " axis.set_xticks(np.arange(len(p_value_matrix.columns)))\n", + " axis.set_yticks(np.arange(len(p_value_matrix.index)))\n", + " axis.set_xticklabels(p_value_matrix.columns, fontsize=8)\n", + " axis.set_yticklabels(p_value_matrix.index, fontsize=8)\n", + "\n", + " # Rotate tick labels\n", + " plot.setp(axis.get_xticklabels(), rotation=45, ha=\"right\", rotation_mode=\"anchor\", fontsize=8)\n", + "\n", + " # Annotate with values\n", + " for i in range(len(p_value_matrix.columns)):\n", + " for j in range(len(p_value_matrix.index)):\n", + " cell_value = p_value_matrix.iloc[i, j]\n", + " if cell_value < 0.001:\n", + " cell_text = f\"{cell_value:.1e}\" # scientific notation\n", + " else:\n", + " cell_text = f\"{cell_value:.4f}\" # normal 4-decimal format\n", + " axis.text(j, i, cell_text, ha=\"center\", va=\"center\", color=\"black\", fontsize=6)\n", + "\n", + " plot.title(\"p-value Matrix (< 0.05 in Green)\", pad=20, fontsize=10)\n", + " plot.tight_layout()\n", + " plot.show()\n", + "\n", + "# Plot p-values showing statistical significance\n", + "plot_p_value_matrix(pvalue_matrix(pairwise_changed_git_files_with_dependencies))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c799b77", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_git_changes_vs_dependencies(pairwise_changes: pd.DataFrame, title: str, x_column: str, y_column: str):\n", + " if pairwise_changes.empty:\n", + " print(\"No projected data to plot.\")\n", + " return\n", + "\n", + " plot.scatter(\n", + " x=pairwise_changes[x_column],\n", + " y=pairwise_changes[y_column],\n", + " s=3,\n", + " )\n", + " plot.xlabel(x_column)\n", + " plot.ylabel(y_column)\n", + " plot.title(title, pad=20)\n", + " plot.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"747f9590", + "metadata": {}, + "outputs": [], + "source": [ + "# Scatter plot of all pairs of files with their commit count on the x axis and dependency weight on the y axis\n", + "\n", + "plot_git_changes_vs_dependencies(\n", + " pairwise_changed_git_files_with_dependencies,\n", + " 'Pairwise changed files: Number of changes (commitCount) vs. dependency weight',\n", + " 'commitCount',\n", + " 'dependencyWeight'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75264b82", + "metadata": {}, + "outputs": [], + "source": [ + "# Scatter plot of all pairs of files with their min confidence (normalized update commit count) on the x axis and dependency weight on the y axis\n", + "\n", + "plot_git_changes_vs_dependencies(\n", + " pairwise_changed_git_files_with_dependencies,\n", + " 'Pairwise changed files: Min confidence co-change rate vs. dependency weight',\n", + " 'updateCommitMinConfidence',\n", + " 'dependencyWeight'\n", + ")" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "JohT" + } + ], + "code_graph_analysis_pipeline_data_validation": "ValidateAlwaysFalse", + "kernelspec": { + "display_name": "codegraph", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + }, + "title": "Git History Charts with Neo4j (Additional Manual Exploration)" + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/jupyter/GitHistoryGeneral.ipynb b/jupyter/GitHistoryGeneral.ipynb index 8a2ea19bb..65b2c2d1d 100644 --- a/jupyter/GitHistoryGeneral.ipynb +++ b/jupyter/GitHistoryGeneral.ipynb @@ -1325,148 +1325,6 @@ " figure.write_image(**get_plotly_figure_write_image_settings(\"ChangedFilesPerCommit\"))" ] }, - { - "cell_type": "markdown", - "id": "c15669ef", - "metadata": {}, - "source": [ - "## 
Pairwise Changed Files vs. Dependency Weight\n", - "\n", - "This section explores the correlation between how often pairs of files are changed together (common commit count) and their dependency weight. Note that these results should be interpreted cautiously, as comparing pairwise changes and dependencies is inherently challenging.\n", - "\n", - "### Considerations\n", - "- **Historical vs. Current State**: Pairwise changes reflect the entire git history, while dependency weight represents the current state of the codebase.\n", - "- **Commit Granularity**: Developers may use different commit strategies, such as squashing changes into a single commit or creating fine-grained commits. Ideally, each commit should represent a single semantic change for accurate analysis.\n", - "- **Dependency Representation**: Some file types (e.g., Java files with import statements) clearly define dependencies, while others (e.g., shell scripts, XML, YAML) lack explicit dependency relationships.\n", - "- **Repository Characteristics**: Repositories with generated code may have many large commits, while stabilized repositories may only update configuration files for dependency changes." 
- ] - }, - { - "cell_type": "markdown", - "id": "98a2feea", - "metadata": {}, - "source": [ - "#### Data Preview" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a067f8e6", - "metadata": {}, - "outputs": [], - "source": [ - "pairwise_changed_git_files_with_dependencies = query_cypher_to_data_frame(\"../cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher\")\n", - "pairwise_changed_git_files_with_dependencies.head(20)" - ] - }, - { - "cell_type": "markdown", - "id": "01db2db9", - "metadata": {}, - "source": [ - "#### Data Statistics" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9fe48db8", - "metadata": {}, - "outputs": [], - "source": [ - "display(\"Pairwise changed git files compared to dependency weights - Overall statistics\")\n", - "display(pairwise_changed_git_files_with_dependencies.describe())\n", - "\n", - "display(\"Pairwise changed git files compared to dependency weights - Pearson Correlation\")\n", - "display(pairwise_changed_git_files_with_dependencies.corr(method='pearson'))\n", - "\n", - "display(\"Pairwise changed git files compared to dependency weights - Spearman Correlation\")\n", - "display(pairwise_changed_git_files_with_dependencies.corr(method='spearman'))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5a4ae651", - "metadata": {}, - "outputs": [], - "source": [ - "if pairwise_changed_git_files_with_dependencies.shape[0] < 5:\n", - " print(\"Less than 5 samples are not enough to calculate p-values\")\n", - "else:\n", - " from scipy.stats import pearsonr, spearmanr\n", - "\n", - " display(\"Pearson Correlation with p-value for commitCount and dependencyWeight\")\n", - " display(pearsonr(pairwise_changed_git_files_with_dependencies['commitCount'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))\n", - "\n", - " display(\"Spearman Correlation with p-value for commitCount and dependencyWeight\")\n", - " 
display(spearmanr(pairwise_changed_git_files_with_dependencies['commitCount'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))\n", - "\n", - " display(\"Pearson Correlation with p-value for updateCommitMinConfidence and dependencyWeight\")\n", - " display(pearsonr(pairwise_changed_git_files_with_dependencies['updateCommitMinConfidence'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))\n", - "\n", - " display(\"Spearman Correlation with p-value for updateCommitMinConfidence and dependencyWeight\")\n", - " display(spearmanr(pairwise_changed_git_files_with_dependencies['updateCommitMinConfidence'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "747f9590", - "metadata": {}, - "outputs": [], - "source": [ - "# Scatter plot of all pairs of files with their commit count on the x axis and dependency weight on the y axis\n", - "\n", - "if pairwise_changed_git_files_with_dependencies.empty:\n", - " print(\"No data to plot\")\n", - "else:\n", - " figure = plotly_graph_objects.Figure(plotly_graph_objects.Scatter(\n", - " x=pairwise_changed_git_files_with_dependencies['commitCount'], \n", - " y=pairwise_changed_git_files_with_dependencies['dependencyWeight'],\n", - " mode='markers',\n", - " # marker=dict(size=pairwise_changed_git_files_with_dependencies['occurrences'] + 8)\n", - " ))\n", - " figure.update_layout(\n", - " **plotly_bar_layout_base_settings,\n", - " title='Pairwise changed files: Number of changes (commitCount) vs. 
dependency weight',\n", - " xaxis_title='commit count',\n", - " yaxis_title='dependency weight',\n", - " )\n", - " figure.show(**plotly_treemap_figure_show_settings)\n", - " if is_command_line_execution():\n", - " figure.write_image(**get_plotly_figure_write_image_settings(\"PairwiseChangedFilesVsDependencyWeight\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "75264b82", - "metadata": {}, - "outputs": [], - "source": [ - "# Scatter plot of all pairs of files with their min confidence (normalized update commit count) on the x axis and dependency weight on the y axis\n", - "\n", - "if pairwise_changed_git_files_with_dependencies.empty:\n", - " print(\"No data to plot\")\n", - "else:\n", - " figure = plotly_graph_objects.Figure(plotly_graph_objects.Scatter(\n", - " x=pairwise_changed_git_files_with_dependencies['updateCommitMinConfidence'], \n", - " y=pairwise_changed_git_files_with_dependencies['dependencyWeight'],\n", - " mode='markers',\n", - " ))\n", - " figure.update_layout(\n", - " **plotly_bar_layout_base_settings,\n", - " title='Pairwise changed files: Min confidence co-change rate vs. 
dependency weight',\n", - " xaxis_title='co-change rate (min confidence, normalized update commit count)',\n", - " yaxis_title='dependency weight',\n", - " )\n", - " figure.show(**plotly_treemap_figure_show_settings)\n", - " if is_command_line_execution():\n", - " figure.write_image(**get_plotly_figure_write_image_settings(\"PairwiseChangedFilesVsDependencyWeight\"))" - ] - }, { "cell_type": "markdown", "id": "14e87aff", From 5bd3e681f126230cfcdebe9dfb9491799f162d71 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Tue, 19 Aug 2025 08:48:37 +0200 Subject: [PATCH 3/5] Add histogram plots for pairwise changed git files --- .../Add_file_name and_extension.cypher | 6 +- ...HER_WITH_relationships_to_git_files.cypher | 7 +- ..._changed_together_with_another_file.cypher | 4 +- .../GitLog/List_pairwise_changed_files.cypher | 21 ++ ...ise_changed_files_with_dependencies.cypher | 2 +- jupyter/GitHistoryGeneral.ipynb | 315 +++++++++++++++++- scripts/reports/GitHistoryCsv.sh | 5 +- 7 files changed, 334 insertions(+), 26 deletions(-) create mode 100644 cypher/GitLog/List_pairwise_changed_files.cypher diff --git a/cypher/General_Enrichment/Add_file_name and_extension.cypher b/cypher/General_Enrichment/Add_file_name and_extension.cypher index ec1ef1a24..e1499fc20 100644 --- a/cypher/General_Enrichment/Add_file_name and_extension.cypher +++ b/cypher/General_Enrichment/Add_file_name and_extension.cypher @@ -1,10 +1,10 @@ - // Add "name", "extension" and "extensionExtended" properties to File nodes + // Add "name", "extension" and "extensionExtended" properties to File nodes. Supports Git:File nodes with "relativePath" property. 
MATCH (file:File) - WHERE file.fileName IS NOT NULL + WHERE (file.fileName IS NOT NULL OR file.relativePath IS NOT NULL) AND file.name IS NULL // Don't override an already existing "name" property WITH * - ,file.fileName AS fileName + ,coalesce(file.fileName, file.relativePath) AS fileName WITH * ,last(split(fileName, '/')) AS fileNameWithoutPath WITH * diff --git a/cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher b/cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher index 6d7474a62..57f1a5ac4 100644 --- a/cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher +++ b/cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher @@ -32,9 +32,8 @@ UNWIND fileCombinations AS fileCombination ,count(DISTINCT commitHash) AS updateCommitCount ,collect(DISTINCT commitHash) AS updateCommitHashes // Deactivated: -// Filter out file pairs that where changed not very often together -// In detail: More than 0.1 per mille compared to overall commit count -// WHERE updateCommitCount > globalUpdateCommitCount * 0.001 +// Filter out file pairs that weren't changed very often together +WHERE updateCommitCount > 2 WITH * ,fileCombination[0] AS firstFile ,fileCombination[1] AS secondFile @@ -65,7 +64,7 @@ UNWIND fileCombinations AS fileCombination // Create the new relationship "CHANGED_TOGETHER_WITH" and set the property "updateCommitCount" on it CALL (firstFile, secondFile, updateCommitCount, updateCommitHashes, updateCommitMinConfidence, updateCommitSupport, updateCommitLift, updateCommitJaccardSimilarity) { MERGE (firstFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile) - SET pairwiseChange.updateCommitCount = updateCommitCount + SET pairwiseChange.updateCommitCount = toInteger(updateCommitCount) ,pairwiseChange.updateCommitHashes = updateCommitHashes ,pairwiseChange.updateCommitMinConfidence = updateCommitMinConfidence ,pairwiseChange.updateCommitSupport = updateCommitSupport diff --git 
a/cypher/GitLog/List_git_files_that_were_changed_together_with_another_file.cypher b/cypher/GitLog/List_git_files_that_were_changed_together_with_another_file.cypher index 13f643649..478e741e0 100644 --- a/cypher/GitLog/List_git_files_that_were_changed_together_with_another_file.cypher +++ b/cypher/GitLog/List_git_files_that_were_changed_together_with_another_file.cypher @@ -2,10 +2,10 @@ MATCH (firstGitFile:Git&File&!Repository)-[gitChange:CHANGED_TOGETHER_WITH]-(secondGitFile:Git&File&!Repository) MATCH (gitRepository:Git&Repository)-[:HAS_FILE]->(firstGitFile) -UNWIND gitChange.commitHashes AS commitHash +UNWIND gitChange.updateCommitHashes AS commitHash WITH gitRepository.name + '/' + firstGitFile.relativePath AS filePath ,count(DISTINCT commitHash) AS commitCount - ,sum(firstGitFile.updateCommitCount) AS fileUpdateCount + ,sum(firstGitFile.updateCommitCount) AS fileUpdateCount WITH * // Out of all the times the file was touched, how often did it co-occur with other files? ,CASE WHEN fileUpdateCount > 0 THEN toFloat(commitCount) / fileUpdateCount ELSE 0.0 END AS coChangeRate diff --git a/cypher/GitLog/List_pairwise_changed_files.cypher b/cypher/GitLog/List_pairwise_changed_files.cypher new file mode 100644 index 000000000..1041da894 --- /dev/null +++ b/cypher/GitLog/List_pairwise_changed_files.cypher @@ -0,0 +1,21 @@ +// List pairs of files that were changed together. Requires Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher to run first. + +MATCH (firstFile:Git:File)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile:Git:File) +WHERE elementId(firstFile) < elementId(secondFile) + WITH * + ,coalesce(firstFile.relativePath, firstFile.fileName) AS firstFileName + ,coalesce(secondFile.relativePath, secondFile.fileName) AS secondFileName +RETURN firstFileName + ,secondFileName + ,firstFile.name + '
' + secondFile.name AS filePairLineBreak + ,firstFileName + '
' + secondFileName AS filePairWithRelativePathLineBreak + ,firstFile.name + '↔' + secondFile.name AS filePair + ,firstFileName + '↔' + secondFileName AS filePairWithRelativePath + ,firstFile.extension AS firstFileExtension + ,secondFile.extension AS secondFileExtension + ,firstFile.extension + '↔' + secondFile.extension AS fileExtensionPair + ,toInteger(pairwiseChange.updateCommitCount) AS updateCommitCount + ,pairwiseChange.updateCommitMinConfidence AS updateCommitMinConfidence + ,pairwiseChange.updateCommitSupport AS updateCommitSupport + ,pairwiseChange.updateCommitLift AS updateCommitLift + ,pairwiseChange.updateCommitJaccardSimilarity AS updateCommitJaccardSimilarity \ No newline at end of file diff --git a/cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher b/cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher index 83562bb58..331f7c96e 100644 --- a/cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher +++ b/cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher @@ -1,4 +1,4 @@ -// List pair of files that were changed together and that have a declared dependency between each other. +// List pair of files that were changed together and that have a declared dependency between each other. Requires Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher and Add_CHANGED_TOGETHER_WITH_relationships_to_code_files.cypher to run first. 
MATCH (firstCodeFile:File)-[dependency:DEPENDS_ON]->(secondCodeFile:File) MATCH (firstCodeFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondCodeFile) diff --git a/jupyter/GitHistoryGeneral.ipynb b/jupyter/GitHistoryGeneral.ipynb index 65b2c2d1d..cbad27e8c 100644 --- a/jupyter/GitHistoryGeneral.ipynb +++ b/jupyter/GitHistoryGeneral.ipynb @@ -37,7 +37,8 @@ "source": [ "from neo4j import GraphDatabase\n", "from plotly import graph_objects as plotly_graph_objects\n", - "from plotly.express import colors as plotly_colors" + "from plotly.express import colors as plotly_colors\n", + "from plotly.subplots import make_subplots" ] }, { @@ -195,7 +196,7 @@ "# Base settings for Plotly Treemap\n", "\n", "plotly_main_layout_base_settings = dict(\n", - " margin=dict(t=50, l=15, r=15, b=15),\n", + " margin=dict(t=80, l=15, r=15, b=15),\n", ")\n", "plotly_treemap_layout_base_settings = dict(\n", " **plotly_main_layout_base_settings\n", @@ -1181,34 +1182,34 @@ "metadata": {}, "outputs": [], "source": [ - "pairwise_changed_git_files = query_cypher_to_data_frame(\"../cypher/GitLog/List_git_files_that_were_changed_together_with_another_file.cypher\")\n", + "data_to_display = query_cypher_to_data_frame(\"../cypher/GitLog/List_git_files_that_were_changed_together_with_another_file.cypher\")\n", "\n", "# Debug\n", "# display(\"1. pairwise changed files --------------\")\n", "# display(pairwise_changed_git_files)\n", "\n", "# Add multiple rows for each file path containing all its directories paths in the new column 'directoryPath'\n", - "pairwise_changed_git_files = add_directory_column(pairwise_changed_git_files, 'filePath', 'directoryPath')\n", + "data_to_display = add_directory_column(data_to_display, 'filePath', 'directoryPath')\n", "\n", "# Debug\n", "# display(\"2. 
added directories --------------\")\n", "# display(pairwise_changed_git_files)\n", "\n", "# Group the git files by their directory and author and count the number of files of each directory (across all levels).\n", - "pairwise_changed_git_files = pairwise_changed_git_files.groupby(['directoryPath']).aggregate(\n", + "data_to_display = data_to_display.groupby(['directoryPath']).aggregate(\n", " pairwiseChangeCommitCount=pd.NamedAgg(column=\"commitCount\", aggfunc=\"sum\"),\n", " pairwiseChangeFileCount=pd.NamedAgg(column=\"filePath\", aggfunc=\"count\"),\n", " pairwiseChangeAverageRate=pd.NamedAgg(column=\"coChangeRate\", aggfunc=\"mean\"),\n", ")\n", - "pairwise_changed_git_files.reset_index(inplace=True)\n", + "data_to_display.reset_index(inplace=True)\n", "\n", "# Debug\n", "# display(\"3. after grouping --------------\")\n", "# display(pairwise_changed_git_files)\n", "\n", - "pairwise_changed_git_files = pd.merge(\n", + "data_to_display = pd.merge(\n", " git_files_with_commit_statistics, \n", - " pairwise_changed_git_files, \n", + " data_to_display, \n", " left_on='directoryPath', \n", " right_on=\"directoryPath\",\n", " how=\"left\",\n", @@ -1219,10 +1220,10 @@ "# display(\"4. 
after merging --------------\")\n", "# display(pairwise_changed_git_files)\n", "\n", - "pairwise_changed_git_files['pairwiseChangeCommitCount'] = pairwise_changed_git_files['pairwiseChangeCommitCount'].fillna(0).astype(int)\n", - "pairwise_changed_git_files['pairwiseChangeFileCount'] = pairwise_changed_git_files['pairwiseChangeFileCount'].fillna(0).astype(int)\n", - "pairwise_changed_git_files['pairwiseChangeAverageRate'] = pairwise_changed_git_files['pairwiseChangeAverageRate'].fillna(0).astype(float)\n", - "pairwise_changed_git_files.reset_index(inplace=True)\n", + "data_to_display['pairwiseChangeCommitCount'] = data_to_display['pairwiseChangeCommitCount'].fillna(0).astype(int)\n", + "data_to_display['pairwiseChangeFileCount'] = data_to_display['pairwiseChangeFileCount'].fillna(0).astype(int)\n", + "data_to_display['pairwiseChangeAverageRate'] = data_to_display['pairwiseChangeAverageRate'].fillna(0).astype(float)\n", + "data_to_display.reset_index(inplace=True)\n", "\n", "# Debug\n", "# display(\"5. after NaN fill --------------\")\n", @@ -1236,15 +1237,15 @@ "metadata": {}, "outputs": [], "source": [ - "pairwise_changed_git_files = add_quantile_limited_column(pairwise_changed_git_files, \"pairwiseChangeCommitCount\", 0.98)\n", + "data_to_display = add_quantile_limited_column(data_to_display, \"pairwiseChangeCommitCount\", 0.98)\n", "\n", "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", - " create_treemap_commit_statistics_settings(pairwise_changed_git_files),\n", + " create_treemap_commit_statistics_settings(data_to_display),\n", " # Without values, much more squares are shown which gives a much better overview. 
The drawback is that the fileCount isn't visible.\n", " # values = pairwise_changed_git_files['fileCount'],\n", " marker=dict(\n", " **plotly_treemap_marker_base_colorscale,\n", - " colors=pairwise_changed_git_files['pairwiseChangeCommitCount_limited'], \n", + " colors=data_to_display['pairwiseChangeCommitCount_limited'], \n", " colorbar=dict(title=\"Changes\"),\n", " ),\n", "))\n", @@ -1325,6 +1326,290 @@ " figure.write_image(**get_plotly_figure_write_image_settings(\"ChangedFilesPerCommit\"))" ] }, + { + "cell_type": "markdown", + "id": "322d6cf9", + "metadata": {}, + "source": [ + "## Pairwise Changed Files\n", + "\n", + "This section analyzes files that were changed together within the same commit and provides several metrics to quantify the strength of the co-change relationship:\n", + "\n", + "- **Commit Count**: The number of commits in which two files were changed together.\n", + "- **Commit Lift**: A ratio that indicates whether the co-change pattern is stronger than random chance, given how often each file changes.\n", + "- **Jaccard Similarity**: The ratio of commits involving either file that also involved both files.\n", + "\n", + "The following tables show the top pairwise changed files based on these metrics.\n", + "The following charts show how these metrics are distributed across pairs of files that were changed together." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f97aba8f", + "metadata": {}, + "outputs": [], + "source": [ + "pairwise_changed_git_files = query_cypher_to_data_frame(\"../cypher/GitLog/List_pairwise_changed_files.cypher\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f874da0", + "metadata": {}, + "outputs": [], + "source": [ + "def find_top_pairwise_changed_file_extensions(input_data: pd.DataFrame, top_n: int = 10) -> pd.Series:\n", + " \"\"\"\n", + " Finds the top N pairwise changed file extensions based on the number of file pairs.\n", + " input_data : pd.DataFrame : DataFrame containing pairwise changed files with their pair counts and extensions\n", + " top_n : int : The number of top extensions to return\n", + " return : pd.Series : Series with top N file extension pairs sorted by their pair count descending\n", + " \"\"\"\n", + " top_extensions = input_data.groupby('fileExtensionPair').aggregate({'filePairWithRelativePath': 'count'}).reset_index()\n", + " return top_extensions.sort_values(by='filePairWithRelativePath', ascending=False).reset_index(drop=True).head(top_n)['fileExtensionPair']\n", + "\n", + "top_pairwise_changed_file_extensions = find_top_pairwise_changed_file_extensions(pairwise_changed_git_files, top_n=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c34ceea", + "metadata": {}, + "outputs": [], + "source": [ + "def display_table_for_top_pairwise_changed_file_extensions(\n", + " data_to_display: pd.DataFrame, \n", + " top_pairwise_changed_file_extensions: pd.Series,\n", + " sort_column: str,\n", + " top_n: int = 10\n", + " ):\n", + " \"\"\"\n", + " Displays a table for each top pairwise changed file extension.\n", + " data_to_display : pd.DataFrame : DataFrame containing pairwise changed files with their commit counts\n", + " top_pairwise_changed_file_extensions : pd.Series : Series with top N file extension pairs sorted by their pair count descending\n", + " sort_column : str : 
The column to sort the data by (default is \"pairwiseChangeCommitCount\")\n", + " top_n : int : The number of top entries to display for each extension (default is 10)\n", + " \"\"\"\n", + " \n", + " if data_to_display.empty:\n", + " print(\"No data to display\")\n", + " return\n", + " \n", + " if top_pairwise_changed_file_extensions.empty:\n", + " print(\"No top pairwise changed file extensions to display\")\n", + " return\n", + "\n", + " # Display each top pairwise changed file extension with its corresponding data\n", + " selected_columns = [\"fileExtensionPair\", \"filePair\", sort_column, \"filePairWithRelativePath\"]\n", + " data_to_display = data_to_display[selected_columns]\n", + " \n", + " combined_data_for_top_extensions = pd.DataFrame().reindex_like(data_to_display.head(0)) # Create an empty DataFrame with the same columns as data_to_display\n", + " \n", + " for index, extension in enumerate(top_pairwise_changed_file_extensions, start=1):\n", + " filtered_data = data_to_display[data_to_display[\"fileExtensionPair\"] == extension]\n", + " sorted_data = filtered_data.sort_values(by=sort_column, ascending=False).head(top_n).reset_index()\n", + " combined_data_for_top_extensions = pd.concat([combined_data_for_top_extensions, sorted_data[selected_columns]], ignore_index=True)\n", + " \n", + " display(combined_data_for_top_extensions)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5aeca70e", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_histogram_of_pairwise_changed_files(\n", + " data_to_plot: pd.DataFrame,\n", + " top_pairwise_changed_file_extensions: pd.Series,\n", + " x_axis_column: str = \"updateCommitCount\",\n", + " x_axis_label: str = \"Commit Count\",\n", + " output_file_name: str = \"CoChangedFilesByCommitCount\",\n", + " sub_plot_rows: int = 4, \n", + " sub_plot_columns: int = 1,\n", + " ):\n", + " \"\"\"\n", + " Plots a histogram of pairwise changed files based on their commit count.\n", + " 
data_to_plot : pd.DataFrame : DataFrame containing pairwise changed files with their commit counts\n", + " top_pairwise_changed_file_extensions : pd.Series : The file extension pairs to plot (one subplot per pair)\n", + " \"\"\"\n", + "\n", + " if data_to_plot.empty:\n", + " print(\"No data to plot\")\n", + " return\n", + " \n", + " if top_pairwise_changed_file_extensions.size != sub_plot_rows * sub_plot_columns:\n", + " raise ValueError(f\"Number of top pairwise changed file extensions ({top_pairwise_changed_file_extensions.size}) does not match the number of subplots ({sub_plot_rows * sub_plot_columns}).\")\n", + "\n", + " figure = make_subplots(\n", + " rows=sub_plot_rows, \n", + " cols=sub_plot_columns, \n", + " subplot_titles=top_pairwise_changed_file_extensions,\n", + " vertical_spacing=0.04, \n", + " horizontal_spacing=0.04\n", + " )\n", + "\n", + " # Add one subplot per extension pair\n", + " for index, extension in enumerate(top_pairwise_changed_file_extensions, start=1):\n", + " row = (index - 1) // sub_plot_columns + 1\n", + " column = (index - 1) % sub_plot_columns + 1\n", + "\n", + " data_for_subplot = data_to_plot[data_to_plot[\"fileExtensionPair\"] == extension]\n", + "\n", + " figure.add_trace(\n", + " plotly_graph_objects.Histogram(\n", + " x=data_for_subplot[x_axis_column],\n", + " text=data_for_subplot[\"filePairLineBreak\"],\n", + " textposition=\"inside\",\n", + " hovertext=data_for_subplot[\"filePairWithRelativePath\"],\n", + " nbinsx=40,\n", + " textfont=dict(size=12, color=\"white\"),\n", + " name=extension,\n", + " ),\n", + " row=row,\n", + " col=column\n", + " )\n", + " # Make subplot title larger\n", + " figure.layout.annotations[index - 1].update(font=dict(size=18))\n", + " # Label subplot x axis \n", + " figure.update_xaxes(title_text=x_axis_label, row=row, col=column)\n", + " # Label subplot y axis and make it logarithmic\n", + " figure.update_yaxes(title_text=\"File Pair Count (log)\", type=\"log\", row=row, col=column)\n", + "\n", + " 
figure.update_layout(\n", + " margin=dict(t=100, l=10, r=10, b=10),\n", + " title=\"Co-Changed Files by their \" + x_axis_label.lower(),\n", + " title_font_size=20,\n", + " title_y=0.99,\n", + " bargap=0.05,\n", + " height=2000,\n", + " width=1000,\n", + " showlegend=False\n", + " )\n", + "\n", + " figure.show(**plotly_treemap_figure_show_settings)\n", + " if is_command_line_execution():\n", + " figure.write_image(**get_plotly_figure_write_image_settings(output_file_name))" + ] + }, + { + "cell_type": "markdown", + "id": "fff50751", + "metadata": {}, + "source": [ + "### Files changed together by commit count" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "67e2a3c4", + "metadata": {}, + "outputs": [], + "source": [ + "display_table_for_top_pairwise_changed_file_extensions(\n", + " pairwise_changed_git_files,\n", + " top_pairwise_changed_file_extensions,\n", + " sort_column=\"updateCommitCount\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "721c2c8a", + "metadata": {}, + "outputs": [], + "source": [ + "plot_histogram_of_pairwise_changed_files(\n", + " data_to_plot = pairwise_changed_git_files,\n", + " top_pairwise_changed_file_extensions = top_pairwise_changed_file_extensions,\n", + " x_axis_column = \"updateCommitCount\",\n", + " x_axis_label = \"Commit Count\",\n", + " output_file_name = \"CoChangedFilesByCommitCount\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "132fd688", + "metadata": {}, + "source": [ + "### Files changed together by commit lift" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d05088a", + "metadata": {}, + "outputs": [], + "source": [ + "display_table_for_top_pairwise_changed_file_extensions(\n", + " pairwise_changed_git_files,\n", + " top_pairwise_changed_file_extensions,\n", + " sort_column=\"updateCommitLift\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73afeeed", + "metadata": {}, + "outputs": [], + 
"source": [ + "plot_histogram_of_pairwise_changed_files(\n", + " data_to_plot = pairwise_changed_git_files,\n", + " top_pairwise_changed_file_extensions = top_pairwise_changed_file_extensions,\n", + " x_axis_column = \"updateCommitLift\",\n", + " x_axis_label = \"Commit Lift\",\n", + " output_file_name = \"CoChangedFilesByCommitLift\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "2a977fc8", + "metadata": {}, + "source": [ + "### Files changed together by commit Jaccard similarity" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41911a35", + "metadata": {}, + "outputs": [], + "source": [ + "display_table_for_top_pairwise_changed_file_extensions(\n", + " pairwise_changed_git_files,\n", + " top_pairwise_changed_file_extensions,\n", + " sort_column=\"updateCommitJaccardSimilarity\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce034bce", + "metadata": {}, + "outputs": [], + "source": [ + "plot_histogram_of_pairwise_changed_files(\n", + " data_to_plot = pairwise_changed_git_files,\n", + " top_pairwise_changed_file_extensions = top_pairwise_changed_file_extensions,\n", + " x_axis_column = \"updateCommitJaccardSimilarity\",\n", + " x_axis_label = \"Commit Jaccard Similarity\",\n", + " output_file_name = \"CoChangedFilesByCommitJaccardSimilarity\"\n", + ")" + ] + }, { "cell_type": "markdown", "id": "14e87aff", diff --git a/scripts/reports/GitHistoryCsv.sh b/scripts/reports/GitHistoryCsv.sh index 77d357e5b..5fe98cdf5 100755 --- a/scripts/reports/GitHistoryCsv.sh +++ b/scripts/reports/GitHistoryCsv.sh @@ -47,9 +47,12 @@ execute_cypher "${GIT_LOG_CYPHER_DIR}/List_git_file_directories_with_commit_stat # Overall distribution of how many files were changed with one git commit, how many were changed with two, etc. 
execute_cypher "${GIT_LOG_CYPHER_DIR}/List_git_files_per_commit_distribution.cypher" > "${FULL_REPORT_DIRECTORY}/List_git_files_per_commit_distribution.csv" -# Data basis for finding out if there is a correlation between pairwise changed files and their dependencies +# Find pairwise changed files that depend on each other execute_cypher "${GIT_LOG_CYPHER_DIR}/List_pairwise_changed_files_with_dependencies.cypher" > "${FULL_REPORT_DIRECTORY}/List_pairwise_changed_files_with_dependencies.csv" +# List pairwise changed files with various metrics +execute_cypher "${GIT_LOG_CYPHER_DIR}/List_pairwise_changed_files.cypher" > "${FULL_REPORT_DIRECTORY}/List_pairwise_changed_files.csv" + # Clean-up after report generation. Empty reports will be deleted. source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${FULL_REPORT_DIRECTORY}" From 910029b96bd5d309f9bb7912e12140e284b7bba0 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Wed, 20 Aug 2025 08:39:17 +0200 Subject: [PATCH 4/5] Add reports for frequently changed files --- .../GitLog/List_pairwise_changed_files.cypher | 2 +- ...e_changed_files_top_selected_metric.cypher | 41 +++ jupyter/GitHistoryGeneral.ipynb | 301 +++++++++++++++++- scripts/reports/GitHistoryCsv.sh | 5 +- 4 files changed, 331 insertions(+), 18 deletions(-) create mode 100644 cypher/GitLog/List_pairwise_changed_files_top_selected_metric.cypher diff --git a/cypher/GitLog/List_pairwise_changed_files.cypher b/cypher/GitLog/List_pairwise_changed_files.cypher index 1041da894..048c07b7b 100644 --- a/cypher/GitLog/List_pairwise_changed_files.cypher +++ b/cypher/GitLog/List_pairwise_changed_files.cypher @@ -14,7 +14,7 @@ RETURN firstFileName ,firstFile.extension AS firstFileExtension ,secondFile.extension AS secondFileExtension ,firstFile.extension + '↔' + secondFile.extension AS fileExtensionPair - ,toInteger(pairwiseChange.updateCommitCount) AS updateCommitCount + ,pairwiseChange.updateCommitCount AS updateCommitCount 
// Get the top 4 file extension pairs that were changed together most often and list the top 20 file pairs for each of them, ordered by the relationship property given in parameter "selected_pair_metric" (e.g. "updateCommitLift", where >1 means: changes together more often than by random chance). Requires Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher to run first.

// Step 1: Rank the file extension pairs by the number of co-changing file pairs.
// The relationship is matched undirected. To count every pair exactly once and to get one canonical
// extension pair name, the file with the smaller extension comes first (element id as tie breaker).
 MATCH (firstFile:Git:File)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile:Git:File)
 WHERE firstFile.extension < secondFile.extension
    OR (firstFile.extension = secondFile.extension AND elementId(firstFile) < elementId(secondFile))
  WITH firstFile.extension + '↔' + secondFile.extension AS fileExtensionPair
      ,count(DISTINCT pairwiseChange)                   AS pairCount
// The extension pair name as second sort criterion keeps the top 4 selection stable for equal counts.
 ORDER BY pairCount DESC, fileExtensionPair ASC
  WITH collect(fileExtensionPair)[0..4] AS top4FileExtensionPairs
UNWIND top4FileExtensionPairs AS fileExtensionPair
// Step 2: For every top extension pair, list its top 20 file pairs by the selected metric.
  CALL {
      WITH fileExtensionPair
     MATCH (firstFile:Git:File)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile:Git:File)
     // Use exactly the same orientation as in step 1. Orienting by element id alone would build
     // reversed names like 'xml↔java' for some pairs, that never match a canonical 'java↔xml'.
     WHERE (firstFile.extension < secondFile.extension
        OR (firstFile.extension = secondFile.extension AND elementId(firstFile) < elementId(secondFile)))
       AND firstFile.extension + '↔' + secondFile.extension = fileExtensionPair
      WITH *
          ,coalesce(firstFile.relativePath, firstFile.fileName)   AS firstFileName
          ,coalesce(secondFile.relativePath, secondFile.fileName) AS secondFileName
    RETURN firstFile.name                                AS firstFileNameShort
          ,secondFile.name                               AS secondFileNameShort
          ,firstFileName
          ,secondFileName
          // Dynamic property access: the metric to rank by is chosen by the caller.
          ,pairwiseChange[$selected_pair_metric]         AS selectedMetric
          ,pairwiseChange.updateCommitLift               AS updateCommitLift
          ,pairwiseChange.updateCommitCount              AS updateCommitCount
          ,pairwiseChange.updateCommitMinConfidence      AS updateCommitMinConfidence
          ,pairwiseChange.updateCommitSupport            AS updateCommitSupport
          ,pairwiseChange.updateCommitJaccardSimilarity  AS updateCommitJaccardSimilarity
     ORDER BY selectedMetric DESC, firstFileName ASC, secondFileName ASC
     LIMIT 20
}
RETURN fileExtensionPair
      ,firstFileNameShort
      ,secondFileNameShort
      ,updateCommitCount
      ,updateCommitMinConfidence
      ,updateCommitLift
      ,updateCommitJaccardSimilarity
      ,updateCommitSupport
      ,firstFileName
      ,secondFileName
def add_file_extension_rank_column(data_frame: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """
    Adds a dense rank column named '<column_name>ExtensionRank' that ranks the values of the given column
    within each 'fileExtensionPair' group (rank 1 = highest value).
    data_frame : pd.DataFrame : The input DataFrame. It needs the columns 'fileExtensionPair' and column_name. It is not modified.
    column_name : str : The name of the column to rank
    return : pd.DataFrame : A DataFrame with the added rank column (the unchanged input if the rank column already exists)
    """
    rank_column_name = f"{column_name}ExtensionRank"

    # The guard needs to check the same column name that is written below, otherwise it never triggers.
    if rank_column_name in data_frame.columns:
        return data_frame  # Column already exists

    # Missing metric values get the worst rank of their group instead of NaN,
    # so that the conversion to integer ranks can't fail.
    ranks = (
        data_frame
        .groupby('fileExtensionPair')[column_name]
        .rank(ascending=False, method='dense', na_option='bottom')
        .astype(int)
    )

    # "assign" returns a new DataFrame. Writing into the input would modify the callers data
    # and triggers chained assignment warnings when the input is a filtered slice of another DataFrame.
    return data_frame.assign(**{rank_column_name: ranks})


def display_table_for_top_pairwise_changed_file_extensions(
        data_to_display: pd.DataFrame,
        metric_column: str,
        top_n: int = 10
    ):
    """
    Returns a table containing the top N ranked pairwise changed files per file extension pair based on the specified metric column.
    data_to_display : pd.DataFrame : DataFrame containing pairwise changed files including the column '<metric_column>ExtensionRank'
    metric_column : str : The metric column to select and sort the data by
    top_n : int : The number of top ranks to include for each file extension pair (default is 10)
    return : pd.DataFrame : One row per file extension pair and metric value, with the rank column renamed to 'GroupRank'
    """
    rank_column = metric_column + "ExtensionRank"
    filtered_data = data_to_display[data_to_display[rank_column] <= top_n]

    # Group by the file extensions and the metric and its rank.
    # Since some entries might have the same metric value, only the first file pair of each group is kept.
    # This way there is at most one entry per rank for each file extension pair.
    grouping_columns = ["fileExtensionPair", metric_column, rank_column]
    grouped_data = filtered_data.groupby(grouping_columns).aggregate(
        filePair=pd.NamedAgg(column="filePair", aggfunc="first"),
        filePairWithRelativePath=pd.NamedAgg(column="filePairWithRelativePath", aggfunc="first"),
    ).reset_index()

    # Extension pairs alphabetically, best metric value first within each of them.
    sorted_data = grouped_data.sort_values(by=grouping_columns, ascending=[True, False, True])
    return sorted_data.reset_index(drop=True).rename(columns={rank_column: "GroupRank"})
"execution_count": null, - "id": "67e2a3c4", + "id": "1cd03b3f", "metadata": {}, "outputs": [], "source": [ "display_table_for_top_pairwise_changed_file_extensions(\n", " pairwise_changed_git_files,\n", - " top_pairwise_changed_file_extensions,\n", - " sort_column=\"updateCommitCount\"\n", + " \"updateCommitCount\",\n", ")" ] }, @@ -1534,6 +1622,46 @@ ")" ] }, + { + "cell_type": "markdown", + "id": "55be3351", + "metadata": {}, + "source": [ + "### Files changed together by commit min confidence\n", + "\n", + "The commit min confidence is the commit count where both files were changed divided by the commit count of the file with the least commits.\n", + "This metric is useful to identify pairs of files that are frequently changed together and is not biased by single files that are changed very often." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1c9df18", + "metadata": {}, + "outputs": [], + "source": [ + "display_table_for_top_pairwise_changed_file_extensions(\n", + " pairwise_changed_git_files,\n", + " \"updateCommitMinConfidence\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a54edcd", + "metadata": {}, + "outputs": [], + "source": [ + "plot_histogram_of_pairwise_changed_files(\n", + " data_to_plot = pairwise_changed_git_files,\n", + " top_pairwise_changed_file_extensions = top_pairwise_changed_file_extensions,\n", + " x_axis_column = \"updateCommitMinConfidence\",\n", + " x_axis_label = \"Commit Min Confidence\",\n", + " output_file_name = \"CoChangedFilesByCommitMinConfidence\"\n", + ")" + ] + }, { "cell_type": "markdown", "id": "132fd688", @@ -1551,8 +1679,7 @@ "source": [ "display_table_for_top_pairwise_changed_file_extensions(\n", " pairwise_changed_git_files,\n", - " top_pairwise_changed_file_extensions,\n", - " sort_column=\"updateCommitLift\"\n", + " \"updateCommitLift\"\n", ")" ] }, @@ -1589,8 +1716,7 @@ "source": [ "display_table_for_top_pairwise_changed_file_extensions(\n", " 
pairwise_changed_git_files,\n", - " top_pairwise_changed_file_extensions,\n", - " sort_column=\"updateCommitJaccardSimilarity\"\n", + " \"updateCommitJaccardSimilarity\"\n", ")" ] }, @@ -1610,6 +1736,149 @@ ")" ] }, + { + "cell_type": "markdown", + "id": "727772c7", + "metadata": {}, + "source": [ + "### Find pairwise changed files with many highly ranked metrics\n", + "\n", + "Find those pairwise changed files that have a high rank in many metrics by calculating a combined (weighted) score based on the ranks of each metric.\n", + "This is useful to identify pairs of files that score high in most metrics, which indicates a strong co-change relationship." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "330cd50d", + "metadata": {}, + "outputs": [], + "source": [ + "metric_rank_columns = [\n", + " 'updateCommitCountExtensionRank',\n", + " 'updateCommitMinConfidenceExtensionRank',\n", + " 'updateCommitJaccardSimilarityExtensionRank',\n", + " 'updateCommitLiftExtensionRank'\n", + "]\n", + "\n", + "metric_columns = [\n", + " 'updateCommitCount',\n", + " 'updateCommitMinConfidence',\n", + " 'updateCommitJaccardSimilarity',\n", + " 'updateCommitLift'\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9de55c0b", + "metadata": {}, + "outputs": [], + "source": [ + "pairwise_changed_git_files['combinedMetricsScore'] = (\n", + " pairwise_changed_git_files['updateCommitCountExtensionRank'] +\n", + " pairwise_changed_git_files['updateCommitMinConfidenceExtensionRank'] +\n", + " pairwise_changed_git_files['updateCommitJaccardSimilarityExtensionRank'] +\n", + " pairwise_changed_git_files['updateCommitLiftExtensionRank']\n", + ")\n", + "\n", + "columns_to_show = [\"fileExtensionPair\", \"filePair\", \"combinedMetricsScore\"] + metric_rank_columns + metric_columns + [\"filePairWithRelativePath\"]\n", + "\n", + "pairwise_changed_git_files_top_10_ranks = pairwise_changed_git_files.\\\n", + " sort_values(by=[\"fileExtensionPair\", 
def pareto_frontier(input_data, metrics, maximize=True):
    """
    Extracts the Pareto frontier (skyline) from a DataFrame.
    A row belongs to the frontier if no other row is at least as good in all metrics
    and strictly better in at least one of them. Rows with identical metric values
    don't dominate each other, so duplicates of a frontier row are all kept.

    input_data : pd.DataFrame : The rows to filter
    metrics : list : Names of the numeric columns to compare
    maximize : bool : True if higher is better for all metrics (e.g. lift, confidence),
                      False if lower is better for all metrics (e.g. ranks)
    return : pd.DataFrame : The non-dominated rows in their original order with a new index
    """
    data = input_data[metrics].to_numpy()
    if not maximize:
        data = -data  # flip sign so that "higher is better" holds for the comparison below

    # Keep track of which rows are dominated (start with none)
    is_dominated = np.zeros(len(data), dtype=bool)
    for i, point in enumerate(data):
        # A dominated row can be skipped: everything it dominates is also dominated
        # by the row that dominates it (dominance is transitive).
        if is_dominated[i]:
            continue
        # Rows that are worse or equal in all metrics and strictly worse in at least one
        # are dominated by the current row. The current row itself never matches
        # since it isn't strictly worse than itself.
        dominated_by_point = np.all(data <= point, axis=1) & np.any(data < point, axis=1)
        is_dominated |= dominated_by_point

    # Keep only non-dominated rows = Pareto frontier
    return input_data[~is_dominated].reset_index(drop=True)
metrics - not considering file extensions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56dc0360", + "metadata": {}, + "outputs": [], + "source": [ + "columns_to_show_for_pareto_frontier = [\"filePair\", \"combinedMetricsScore\"] + metric_columns + metric_rank_columns + [\"filePairWithRelativePath\"]\n", + "display(pareto_frontier(pairwise_changed_git_files, metric_columns, maximize=False)[columns_to_show_for_pareto_frontier].head(40))" + ] + }, + { + "cell_type": "markdown", + "id": "dabccd77", + "metadata": {}, + "source": [ + "#### Pairwise changed files with pareto-optimal metrics - using ranks grouped by file extensions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43de84bc", + "metadata": {}, + "outputs": [], + "source": [ + "columns_to_show_for_pareto_frontier_with_extensions = [\"fileExtensionPair\", \"filePair\", \"combinedMetricsScore\"] + metric_columns + metric_rank_columns + [\"filePairWithRelativePath\"]\n", + "display(pareto_frontier(pairwise_changed_git_files, metric_rank_columns, maximize=False)[columns_to_show_for_pareto_frontier_with_extensions].head(40))" + ] + }, { "cell_type": "markdown", "id": "14e87aff", diff --git a/scripts/reports/GitHistoryCsv.sh b/scripts/reports/GitHistoryCsv.sh index 5fe98cdf5..9f588cdd1 100755 --- a/scripts/reports/GitHistoryCsv.sh +++ b/scripts/reports/GitHistoryCsv.sh @@ -51,7 +51,10 @@ execute_cypher "${GIT_LOG_CYPHER_DIR}/List_git_files_per_commit_distribution.cyp execute_cypher "${GIT_LOG_CYPHER_DIR}/List_pairwise_changed_files_with_dependencies.cypher" > "${FULL_REPORT_DIRECTORY}/List_pairwise_changed_files_with_dependencies.csv" # List pairwise changed files with various metrics -execute_cypher "${GIT_LOG_CYPHER_DIR}/List_pairwise_changed_files.cypher" > "${FULL_REPORT_DIRECTORY}/List_pairwise_changed_files.csv" +execute_cypher "${GIT_LOG_CYPHER_DIR}/List_pairwise_changed_files_top_selected_metric.cypher" "selected_pair_metric=updateCommitCount" > 
"${FULL_REPORT_DIRECTORY}/List_pairwise_changed_files_top_count.csv" +execute_cypher "${GIT_LOG_CYPHER_DIR}/List_pairwise_changed_files_top_selected_metric.cypher" "selected_pair_metric=updateCommitMinConfidence" > "${FULL_REPORT_DIRECTORY}/List_pairwise_changed_files_top_min_confidence.csv" +execute_cypher "${GIT_LOG_CYPHER_DIR}/List_pairwise_changed_files_top_selected_metric.cypher" "selected_pair_metric=updateCommitJaccardSimilarity" > "${FULL_REPORT_DIRECTORY}/List_pairwise_changed_files_top_jaccard.csv" +execute_cypher "${GIT_LOG_CYPHER_DIR}/List_pairwise_changed_files_top_selected_metric.cypher" "selected_pair_metric=updateCommitLift" > "${FULL_REPORT_DIRECTORY}/List_pairwise_changed_files_top_lift.csv" # Clean-up after report generation. Empty reports will be deleted. source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${FULL_REPORT_DIRECTORY}" From 2976072d35e0d00b1bd83c76fe23df5b88c00a43 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Fri, 22 Aug 2025 09:26:39 +0200 Subject: [PATCH 5/5] Optimize treemap with files that are often changed with others --- ..._changed_together_with_another_file.cypher | 5 +- jupyter/GitHistoryGeneral.ipynb | 206 +++++++++++------- 2 files changed, 137 insertions(+), 74 deletions(-) diff --git a/cypher/GitLog/List_git_files_that_were_changed_together_with_another_file.cypher b/cypher/GitLog/List_git_files_that_were_changed_together_with_another_file.cypher index 478e741e0..77fee0216 100644 --- a/cypher/GitLog/List_git_files_that_were_changed_together_with_another_file.cypher +++ b/cypher/GitLog/List_git_files_that_were_changed_together_with_another_file.cypher @@ -1,13 +1,16 @@ // List git files that where frequently changed with another file. Requires "Add_CHANGED_TOGETHER_WITH_relationships_to_git_files". 
// The relationship is matched undirected and on purpose in both orientations:
// The result is grouped per file (firstGitFile), so every file needs to see all of its
// CHANGED_TOGETHER_WITH relationships. Keeping only one orientation (e.g. by comparing element ids)
// would attribute each pair to just one of its two files and would omit files
// that are only paired with files that have a lower element id.
 MATCH (firstGitFile:Git&File&!Repository)-[gitChange:CHANGED_TOGETHER_WITH]-(secondGitFile:Git&File&!Repository)
 MATCH (gitRepository:Git&Repository)-[:HAS_FILE]->(firstGitFile)
UNWIND gitChange.updateCommitHashes AS commitHash
  WITH gitRepository.name + '/' + firstGitFile.relativePath AS filePath
      ,count(DISTINCT commitHash)            AS commitCount
      // The file property is repeated on every unwound row. Summing it up would multiply it
      // by the number of rows and deflate the co-change rate, so the plain value is taken with "max".
      ,max(firstGitFile.updateCommitCount)   AS fileUpdateCount
      ,max(gitChange.updateCommitLift)       AS maxLift
      // NOTE(review): Since this is aggregated after UNWIND, every pair contributes its lift once
      // per shared commit. The average is therefore weighted by the number of shared commits.
      // Confirm that this weighting is intended.
      ,avg(gitChange.updateCommitLift)       AS avgLift
  WITH *
       // Out of all the times the file was touched, how often did it co-occur with other files?
      ,CASE WHEN fileUpdateCount > 0 THEN toFloat(commitCount) / fileUpdateCount ELSE 0.0 END AS coChangeRate
RETURN filePath, commitCount, coChangeRate, maxLift, avgLift
ORDER BY commitCount DESC
+ ] + }, + { + "cell_type": "markdown", + "id": "ed53b6e5", + "metadata": {}, + "source": [ + "### Preview data" ] }, { "cell_type": "code", "execution_count": null, - "id": "24055998", + "id": "5526e458", + "metadata": {}, + "outputs": [], + "source": [ + "git_file_count_per_commit = query_cypher_to_data_frame(\"../cypher/GitLog/List_git_files_per_commit_distribution.cypher\")\n", + "\n", + "print(\"Sum of commits that changed more than 30 files (each) = \" + str(git_file_count_per_commit[git_file_count_per_commit['filesPerCommit'] > 30]['commitCount'].sum()))\n", + "print(\"Max changed files with one commit = \" + str(git_file_count_per_commit['filesPerCommit'].max()))\n", + "display(git_file_count_per_commit.describe())\n", + "display(git_file_count_per_commit.head(30))" + ] + }, + { + "cell_type": "markdown", + "id": "dcea826e", + "metadata": {}, + "source": [ + "### Bar chart with the number of files per commit distribution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e9dbc57", + "metadata": {}, + "outputs": [], + "source": [ + "if git_file_count_per_commit.empty:\n", + " print(\"No data to plot\")\n", + "else:\n", + " figure = plotly_graph_objects.Figure(plotly_graph_objects.Bar(\n", + " x=git_file_count_per_commit['filesPerCommit'].head(30), \n", + " y=git_file_count_per_commit['commitCount'].head(30)),\n", + " )\n", + " figure.update_layout(\n", + " **plotly_bar_layout_base_settings,\n", + " title='Changed files per commit',\n", + " xaxis_title='file count',\n", + " yaxis_title='commit count'\n", + " )\n", + " figure.show(**plotly_treemap_figure_show_settings)\n", + " if is_command_line_execution():\n", + " figure.write_image(**get_plotly_figure_write_image_settings(\"ChangedFilesPerCommit\"))" + ] + }, + { + "cell_type": "markdown", + "id": "322d6cf9", + "metadata": {}, + "source": [ + "## Pairwise Changed Files\n", + "\n", + "This section analyzes files that where changed together within the same commit and provides several 
metrics to quantify the strength of the co-change relationship:\n", + "\n", + "- **Commit Count**: The number of commits in which two files were changed together.\n", + "- **Commit Lift**: A ratio that indicates whether the co-change pattern is stronger than random chance, given how often each file changes.\n", + "- **Jaccard Similarity**: The ratio of commits involving either file that also involved both files.\n", + "\n", + "The following tables show the top pairwise changed files based on these metrics.\n", + "The following charts show how these metrics are distributed across pairs of files that were changed together." + ] + }, + { + "cell_type": "markdown", + "id": "4c081f85", + "metadata": {}, + "source": [ + "### Treemap with files changed frequently with others" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30942bd4", "metadata": {}, "outputs": [], "source": [ @@ -1200,6 +1285,8 @@ " pairwiseChangeCommitCount=pd.NamedAgg(column=\"commitCount\", aggfunc=\"sum\"),\n", " pairwiseChangeFileCount=pd.NamedAgg(column=\"filePath\", aggfunc=\"count\"),\n", " pairwiseChangeAverageRate=pd.NamedAgg(column=\"coChangeRate\", aggfunc=\"mean\"),\n", + " pairwiseChangeMaxLift=pd.NamedAgg(column=\"maxLift\", aggfunc=\"max\"),\n", + " pairwiseChangeAverageLift=pd.NamedAgg(column=\"avgLift\", aggfunc=\"mean\"),\n", ")\n", "data_to_display.reset_index(inplace=True)\n", "\n", @@ -1223,6 +1310,8 @@ "data_to_display['pairwiseChangeCommitCount'] = data_to_display['pairwiseChangeCommitCount'].fillna(0).astype(int)\n", "data_to_display['pairwiseChangeFileCount'] = data_to_display['pairwiseChangeFileCount'].fillna(0).astype(int)\n", "data_to_display['pairwiseChangeAverageRate'] = data_to_display['pairwiseChangeAverageRate'].fillna(0).astype(float)\n", + "data_to_display['pairwiseChangeMaxLift'] = data_to_display['pairwiseChangeMaxLift'].fillna(0).astype(float)\n", + "data_to_display['pairwiseChangeAverageLift'] = 
data_to_display['pairwiseChangeAverageLift'].fillna(0).astype(float)\n", "data_to_display.reset_index(inplace=True)\n", "\n", "# Debug\n", @@ -1233,7 +1322,7 @@ { "cell_type": "code", "execution_count": null, - "id": "19b5a98a", + "id": "1052776d", "metadata": {}, "outputs": [], "source": [ @@ -1251,96 +1340,67 @@ "))\n", "figure.update_layout(\n", " **plotly_treemap_layout_base_settings,\n", - " title='Co-Changing files in update commits',\n", + " title='Files that likely co-change with others in update commits',\n", ")\n", "figure.show(**plotly_treemap_figure_show_settings)\n", "if is_command_line_execution():\n", " figure.write_image(**get_plotly_figure_write_image_settings(\"CoChangingFiles\"))" ] }, - { - "cell_type": "markdown", - "id": "d8c6ccee", - "metadata": {}, - "source": [ - "## Filecount per commit\n", - "\n", - "Shows how many commits had changed one file, how many had changed two files, and so on.\n", - "The chart is limited to 30 lines for improved readability.\n", - "The data preview also includes overall statistics including the number of commits that are filtered out in the chart." 
- ] - }, - { - "cell_type": "markdown", - "id": "ed53b6e5", - "metadata": {}, - "source": [ - "### Preview data" - ] - }, { "cell_type": "code", "execution_count": null, - "id": "5526e458", + "id": "3ec95adf", "metadata": {}, "outputs": [], "source": [ - "git_file_count_per_commit = query_cypher_to_data_frame(\"../cypher/GitLog/List_git_files_per_commit_distribution.cypher\")\n", + "data_to_display = add_quantile_limited_column(data_to_display, \"pairwiseChangeMaxLift\", 0.98)\n", "\n", - "print(\"Sum of commits that changed more than 30 files (each) = \" + str(git_file_count_per_commit[git_file_count_per_commit['filesPerCommit'] > 30]['commitCount'].sum()))\n", - "print(\"Max changed files with one commit = \" + str(git_file_count_per_commit['filesPerCommit'].max()))\n", - "display(git_file_count_per_commit.describe())\n", - "display(git_file_count_per_commit.head(30))" - ] - }, - { - "cell_type": "markdown", - "id": "dcea826e", - "metadata": {}, - "source": [ - "### Bar chart with the number of files per commit distribution" + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_commit_statistics_settings(data_to_display),\n", + " # Without values, much more squares are shown which gives a much better overview. 
The drawback is that the fileCount isn't visible.\n", + " # values = pairwise_changed_git_files['fileCount'],\n", + " marker=dict(\n", + " **plotly_treemap_marker_base_colorscale,\n", + " colors=data_to_display['pairwiseChangeMaxLift_limited'], \n", + " colorbar=dict(title=\"Co-Change Lift\"),\n", + " ),\n", + "))\n", + "figure.update_layout(\n", + " **plotly_treemap_layout_base_settings,\n", + " title='Co-Changing files in update commits max lift (1=random, >1=more than random, <1=less than random)',\n", + ")\n", + "figure.show(**plotly_treemap_figure_show_settings)\n", + "if is_command_line_execution():\n", + " figure.write_image(**get_plotly_figure_write_image_settings(\"CoChangingFilesMaxLift\"))" ] }, { "cell_type": "code", "execution_count": null, - "id": "9e9dbc57", + "id": "0e33b873", "metadata": {}, "outputs": [], "source": [ - "if git_file_count_per_commit.empty:\n", - " print(\"No data to plot\")\n", - "else:\n", - " figure = plotly_graph_objects.Figure(plotly_graph_objects.Bar(\n", - " x=git_file_count_per_commit['filesPerCommit'].head(30), \n", - " y=git_file_count_per_commit['commitCount'].head(30)),\n", - " )\n", - " figure.update_layout(\n", - " **plotly_bar_layout_base_settings,\n", - " title='Changed files per commit',\n", - " xaxis_title='file count',\n", - " yaxis_title='commit count'\n", - " )\n", - " figure.show(**plotly_treemap_figure_show_settings)\n", - " if is_command_line_execution():\n", - " figure.write_image(**get_plotly_figure_write_image_settings(\"ChangedFilesPerCommit\"))" - ] - }, - { - "cell_type": "markdown", - "id": "322d6cf9", - "metadata": {}, - "source": [ - "## Pairwise Changed Files\n", - "\n", - "This section analyzes files that where changed together within the same commit and provides several metrics to quantify the strength of the co-change relationship:\n", + "data_to_display = add_quantile_limited_column(data_to_display, \"pairwiseChangeAverageLift\", 0.98)\n", "\n", - "- **Commit Count**: The number of commits in 
which two files were changed together.\n", - "- **Commit Lift**: A ratio that indicates whether the co-change pattern is stronger than random chance, given how often each file changes.\n", - "- **Jaccard Similarity**: The ratio of commits involving either file that also involved both files.\n", - "\n", - "The following tables show the top pairwise changed files based on these metrics.\n", - "The following charts show how these metrics are distributed across pairs of files that were changed together." + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_commit_statistics_settings(data_to_display),\n", + " # Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n", + " # values = pairwise_changed_git_files['fileCount'],\n", + " marker=dict(\n", + " **plotly_treemap_marker_base_colorscale,\n", + " colors=data_to_display['pairwiseChangeAverageLift_limited'], \n", + " colorbar=dict(title=\"Co-Change Lift\"),\n", + " ),\n", + "))\n", + "figure.update_layout(\n", + " **plotly_treemap_layout_base_settings,\n", + " title='Co-Changing files in update commits average lift (1=random, >1=more than random, <1=less than random)',\n", + ")\n", + "figure.show(**plotly_treemap_figure_show_settings)\n", + "if is_command_line_execution():\n", + " figure.write_image(**get_plotly_figure_write_image_settings(\"CoChangingFilesAverageLift\"))" ] }, { @@ -1350,7 +1410,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Initial steps: Function Declaration and Data Preparation" + "# Initial steps: Function Declaration and Data Preparation for co-change distribution analysis" ] }, {