diff --git a/README.md b/README.md
index f002c6147..b2f3562d7 100644
--- a/README.md
+++ b/README.md
@@ -28,6 +28,7 @@ Contained within this repository is a comprehensive and automated code graph ana
### :newspaper: News
+- August 2025: Association rule learning for co-changing files in git history
- August 2025: Anomaly detection powered by unsupervised machine learning and explainable AI
- May 2025: Migrated to [Neo4j 2025.x](https://neo4j.com/docs/upgrade-migration-guide/current/version-2025/upgrade) and Java 21.
diff --git a/cypher/General_Enrichment/Add_file_name and_extension.cypher b/cypher/General_Enrichment/Add_file_name and_extension.cypher
index ec1ef1a24..e1499fc20 100644
--- a/cypher/General_Enrichment/Add_file_name and_extension.cypher
+++ b/cypher/General_Enrichment/Add_file_name and_extension.cypher
@@ -1,10 +1,10 @@
- // Add "name", "extension" and "extensionExtended" properties to File nodes
+ // Add "name", "extension" and "extensionExtended" properties to File nodes. Supports Git:File nodes with "relativePath" property.
MATCH (file:File)
- WHERE file.fileName IS NOT NULL
+ WHERE (file.fileName IS NOT NULL OR file.relativePath IS NOT NULL)
AND file.name IS NULL // Don't override an already existing "name" property
WITH *
- ,file.fileName AS fileName
+ ,coalesce(file.fileName, file.relativePath) AS fileName
WITH *
,last(split(fileName, '/')) AS fileNameWithoutPath
WITH *
diff --git a/cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher b/cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher
index 362766b55..57f1a5ac4 100644
--- a/cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher
+++ b/cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher
@@ -1,45 +1,103 @@
// Connect git files that where changed together frequently with "CHANGED_TOGETHER_WITH"
-MATCH (global_git_commit:Git:Commit)
- WITH count(global_git_commit) AS globalCommitCount
+// Determine global file count, global file count threshold (filter out refactoring commits) and global update commits
+MATCH (git_commit_global:Git:Commit)-[:CONTAINS_CHANGE]->(:Git:Change)-[:UPDATES]->(git_file_global:Git:File)
+WHERE git_file_global.deletedAt IS NULL
+ WITH git_commit_global, count(DISTINCT git_file_global) AS commitFileCount
+ WITH percentileDisc(commitFileCount, 0.95) AS globalFileCountThreshold
+ ,count(git_commit_global) AS globalUpdateCommitCount
+// Main section
MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change)-[:UPDATES]->(git_file:Git:File)
MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file)
WHERE git_file.deletedAt IS NULL
// Order files to assure, that pairs of distinct files are grouped together (fileA, fileB) without (fileB, fileA)
ORDER BY git_commit.sha, git_file.relativePath
- WITH globalCommitCount
+ WITH globalFileCountThreshold
+ ,globalUpdateCommitCount
,git_commit.sha AS commitHash
,collect(DISTINCT git_file) AS filesInCommit
// Limit the file count to min. 2 (changed together) and
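-// max. 50 (reduce permutations, improve performance, filter out large refactorings that usually affect many files)
+// max. the 95th percentile of the files per commit (reduce permutations, improve performance, filter out large refactorings that usually affect many files)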
WHERE size(filesInCommit) >= 2
- AND size(filesInCommit) <= 50
+ AND size(filesInCommit) <= globalFileCountThreshold
// Collect distinct pairwise (..., 2, 2) combinations of all files in the list
- WITH globalCommitCount
+ WITH globalFileCountThreshold
+ ,globalUpdateCommitCount
,commitHash
,apoc.coll.combinations(filesInCommit, 2, 2) AS fileCombinations
UNWIND fileCombinations AS fileCombination
- WITH globalCommitCount
+ WITH globalFileCountThreshold
+ ,globalUpdateCommitCount
,fileCombination
- ,count(DISTINCT commitHash) AS commitCount
- ,collect(DISTINCT commitHash) AS commitHashes
-// Filter out file pairs that where changed not very often together
-// In detail: More than 0.1 per mille compared to overall commit count
-WHERE commitCount > globalCommitCount * 0.001
- WITH fileCombination[0] AS firstFile
+ ,count(DISTINCT commitHash) AS updateCommitCount
+ ,collect(DISTINCT commitHash) AS updateCommitHashes
+// Filter out file pairs that weren't changed very often together.
+// In detail: Keep only pairs that were changed together in more than 2 commits.
+WHERE updateCommitCount > 2
+ WITH *
+ ,fileCombination[0] AS firstFile
,fileCombination[1] AS secondFile
- ,commitCount
- ,commitHashes
-// Create the new relationship "CHANGED_TOGETHER_WITH" and set the property "commitCount" on it
- CALL (firstFile, secondFile, commitCount, commitHashes) {
+ WITH *
+ // Get the lowest number of git update commits of both files (file pair)
+ ,CASE WHEN firstFile.updateCommitCount < secondFile.updateCommitCount
+ THEN firstFile.updateCommitCount
+ ELSE secondFile.updateCommitCount
+ END AS minUpdateCommitCount
+ // Calculate update commit support by dividing the update commit count by the overall commit count for both files
+ ,toFloat(firstFile.updateCommitCount) / globalUpdateCommitCount AS firstFileUpdateSupport
+ ,toFloat(secondFile.updateCommitCount) / globalUpdateCommitCount AS secondFileUpdateSupport
+ WITH *
+ // Expected likelihood that the first and the second file change together given complete randomness
+ ,firstFileUpdateSupport * secondFileUpdateSupport AS expectedCoUpdateSupport
+ WITH firstFile
+ ,secondFile
+ ,updateCommitHashes
+ ,updateCommitCount
+ // Out of all the times the less frequently changed file was touched, how often did it co-occur with the other file?
+ ,toFloat(updateCommitCount) / minUpdateCommitCount AS updateCommitMinConfidence
+ // Compared to all commits in general, how high is the percentage of the commits where both files changed together?
+ ,toFloat(updateCommitCount) / globalUpdateCommitCount AS updateCommitSupport
+ // Lift
+ ,toFloat(updateCommitCount) / (globalUpdateCommitCount * expectedCoUpdateSupport) AS updateCommitLift
+ // Jaccard Similarity: Of all commits involving either file, how many involved both?
+ ,toFloat(updateCommitCount) / (firstFile.updateCommitCount + secondFile.updateCommitCount - updateCommitCount) AS updateCommitJaccardSimilarity
+// Create the new relationship "CHANGED_TOGETHER_WITH" and set the co-change metrics (commit count, commit hashes, min confidence, support, lift, Jaccard similarity) as properties on it
+ CALL (firstFile, secondFile, updateCommitCount, updateCommitHashes, updateCommitMinConfidence, updateCommitSupport, updateCommitLift, updateCommitJaccardSimilarity) {
MERGE (firstFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile)
- SET pairwiseChange.commitCount = commitCount
- ,pairwiseChange.commitHashes = commitHashes
- } IN TRANSACTIONS
+ SET pairwiseChange.updateCommitCount = toInteger(updateCommitCount)
+ ,pairwiseChange.updateCommitHashes = updateCommitHashes
+ ,pairwiseChange.updateCommitMinConfidence = updateCommitMinConfidence
+ ,pairwiseChange.updateCommitSupport = updateCommitSupport
+ ,pairwiseChange.updateCommitLift = updateCommitLift
+ ,pairwiseChange.updateCommitJaccardSimilarity = updateCommitJaccardSimilarity
+ } IN TRANSACTIONS OF 500 ROWS
// Return one row with some statistics about the found pairs and their commit counts
-RETURN max(commitCount) AS maxCommitCount
- ,avg(commitCount) AS avgCommitCount
- ,percentileDisc(commitCount, 0.5) AS percentile50CommitCount
- ,percentileDisc(commitCount, 0.9) AS percentile90CommitCount
- ,percentileDisc(commitCount, 0.95) AS percentile95CommitCount
- ,count(*) AS pairCount
\ No newline at end of file
+RETURN count(*) AS pairCount
+
+ ,min(updateCommitCount) AS minCommitCount
+ ,max(updateCommitCount) AS maxCommitCount
+ ,avg(updateCommitCount) AS avgCommitCount
+ ,percentileDisc(updateCommitCount, 0.5) AS percentile50CommitCount
+ ,percentileDisc(updateCommitCount, 0.9) AS percentile90CommitCount
+ ,percentileDisc(updateCommitCount, 0.95) AS percentile95CommitCount
+
+ ,min(updateCommitMinConfidence) AS minMinConfidence
+ ,max(updateCommitMinConfidence) AS maxMinConfidence
+ ,avg(updateCommitMinConfidence) AS avgMinConfidence
+ ,percentileDisc(updateCommitMinConfidence, 0.5) AS percentile50MinConfidence
+ ,percentileDisc(updateCommitMinConfidence, 0.9) AS percentile90MinConfidence
+ ,percentileDisc(updateCommitMinConfidence, 0.95) AS percentile95MinConfidence
+
+ ,min(updateCommitLift) AS minLift
+ ,max(updateCommitLift) AS maxLift
+ ,avg(updateCommitLift) AS avgLift
+ ,percentileDisc(updateCommitLift, 0.5) AS percentile50Lift
+ ,percentileDisc(updateCommitLift, 0.9) AS percentile90Lift
+ ,percentileDisc(updateCommitLift, 0.95) AS percentile95Lift
+
+ ,min(updateCommitJaccardSimilarity) AS minJaccardSimilarity
+ ,max(updateCommitJaccardSimilarity) AS maxJaccardSimilarity
+ ,avg(updateCommitJaccardSimilarity) AS avgJaccardSimilarity
+ ,percentileDisc(updateCommitJaccardSimilarity, 0.5) AS percentile50JaccardSimilarity
+ ,percentileDisc(updateCommitJaccardSimilarity, 0.9) AS percentile90JaccardSimilarity
+ ,percentileDisc(updateCommitJaccardSimilarity, 0.95) AS percentile95JaccardSimilarity
\ No newline at end of file
diff --git a/cypher/GitLog/List_git_files_that_were_changed_together.cypher b/cypher/GitLog/List_git_files_that_were_changed_together.cypher
index f19cf4efb..b82dc30d1 100644
--- a/cypher/GitLog/List_git_files_that_were_changed_together.cypher
+++ b/cypher/GitLog/List_git_files_that_were_changed_together.cypher
@@ -6,5 +6,5 @@ MATCH (gitRepository:Git&Repository)-[:HAS_FILE]->(firstGitFile)
MATCH (gitRepository:Git&Repository)-[:HAS_FILE]->(secondGitFile)
RETURN gitRepository.name + '/' + firstGitFile.relativePath AS firstFile
,gitRepository.name + '/' + secondGitFile.relativePath AS secondFile
- ,gitChange.commitCount AS commitCount
+ ,gitChange.updateCommitCount AS commitCount
ORDER BY commitCount DESC
diff --git a/cypher/GitLog/List_git_files_that_were_changed_together_with_another_file.cypher b/cypher/GitLog/List_git_files_that_were_changed_together_with_another_file.cypher
index 16f8458dd..77fee0216 100644
--- a/cypher/GitLog/List_git_files_that_were_changed_together_with_another_file.cypher
+++ b/cypher/GitLog/List_git_files_that_were_changed_together_with_another_file.cypher
@@ -1,8 +1,16 @@
// List git files that where frequently changed with another file. Requires "Add_CHANGED_TOGETHER_WITH_relationships_to_git_files".
MATCH (firstGitFile:Git&File&!Repository)-[gitChange:CHANGED_TOGETHER_WITH]-(secondGitFile:Git&File&!Repository)
+// No de-duplication by elementId here: the undirected match has to yield both directions so that every file is listed with all of its pairs.
MATCH (gitRepository:Git&Repository)-[:HAS_FILE]->(firstGitFile)
-UNWIND gitChange.commitHashes AS commitHash
-RETURN gitRepository.name + '/' + firstGitFile.relativePath AS filePath
+UNWIND gitChange.updateCommitHashes AS commitHash
+ WITH gitRepository.name + '/' + firstGitFile.relativePath AS filePath
,count(DISTINCT commitHash) AS commitCount
+ ,firstGitFile.updateCommitCount AS fileUpdateCount // Grouping key (constant per file). Summing it over the unwound (pair, commit) rows would inflate it.
+ ,max(gitChange.updateCommitLift) AS maxLift
+ ,avg(gitChange.updateCommitLift) AS avgLift // Averaged over (pair, commit) rows, so pairs with more common commits weigh more
+ WITH *
+ // Out of all the times the file was touched, how often did it co-occur with other files?
+ ,CASE WHEN fileUpdateCount > 0 THEN toFloat(commitCount) / fileUpdateCount ELSE 0.0 END AS coChangeRate
+RETURN filePath, commitCount, coChangeRate, maxLift, avgLift
ORDER BY commitCount DESC
\ No newline at end of file
diff --git a/cypher/GitLog/List_pairwise_changed_files.cypher b/cypher/GitLog/List_pairwise_changed_files.cypher
new file mode 100644
index 000000000..048c07b7b
--- /dev/null
+++ b/cypher/GitLog/List_pairwise_changed_files.cypher
@@ -0,0 +1,21 @@
+// List pairs of files that were changed together. Requires Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher to run first.
+
+MATCH (firstFile:Git:File)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile:Git:File)
+WHERE elementId(firstFile) < elementId(secondFile)
+ WITH *
+ ,coalesce(firstFile.relativePath, firstFile.fileName) AS firstFileName
+ ,coalesce(secondFile.relativePath, secondFile.fileName) AS secondFileName
+RETURN firstFileName
+ ,secondFileName
+ ,firstFile.name + '
' + secondFile.name AS filePairLineBreak
+ ,firstFileName + '
' + secondFileName AS filePairWithRelativePathLineBreak
+ ,firstFile.name + '↔' + secondFile.name AS filePair
+ ,firstFileName + '↔' + secondFileName AS filePairWithRelativePath
+ ,firstFile.extension AS firstFileExtension
+ ,secondFile.extension AS secondFileExtension
+ ,firstFile.extension + '↔' + secondFile.extension AS fileExtensionPair
+ ,pairwiseChange.updateCommitCount AS updateCommitCount
+ ,pairwiseChange.updateCommitMinConfidence AS updateCommitMinConfidence
+ ,pairwiseChange.updateCommitSupport AS updateCommitSupport
+ ,pairwiseChange.updateCommitLift AS updateCommitLift
+ ,pairwiseChange.updateCommitJaccardSimilarity AS updateCommitJaccardSimilarity
\ No newline at end of file
diff --git a/cypher/GitLog/List_pairwise_changed_files_top_selected_metric.cypher b/cypher/GitLog/List_pairwise_changed_files_top_selected_metric.cypher
new file mode 100644
index 000000000..fb2c6832c
--- /dev/null
+++ b/cypher/GitLog/List_pairwise_changed_files_top_selected_metric.cypher
@@ -0,0 +1,41 @@
+// Get the top 4 file extension pairs that were changed together most often and list, for each of them, the top 20 file pairs ranked by the relationship property given in the parameter "selected_pair_metric" (e.g. "updateCommitLift"; lift > 1: changes more often than by random chance). Requires Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher to run first.
+
+MATCH (firstFile:Git:File)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile:Git:File)
+WHERE firstFile.extension < secondFile.extension
+ OR (firstFile.extension = secondFile.extension AND elementId(firstFile) < elementId(secondFile))
+ WITH firstFile.extension + '↔' + secondFile.extension AS fileExtensionPair
+ ,count(DISTINCT pairwiseChange) AS pairCount
+ORDER BY pairCount DESC
+ WITH collect(fileExtensionPair)[0..4] AS top4FileExtensionPairs
+UNWIND top4FileExtensionPairs AS fileExtensionPair
+CALL {
+ WITH fileExtensionPair
+ MATCH (firstFile:Git:File)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile:Git:File)
+ WHERE (firstFile.extension < secondFile.extension OR (firstFile.extension = secondFile.extension AND elementId(firstFile) < elementId(secondFile))) // Same pair orientation as in the ranking above. Ordering by element id alone would lose mixed extension pairs.
+ AND firstFile.extension + '↔' + secondFile.extension = fileExtensionPair
+ WITH *
+ ,coalesce(firstFile.relativePath, firstFile.fileName) AS firstFileName
+ ,coalesce(secondFile.relativePath, secondFile.fileName) AS secondFileName
+ RETURN firstFile.name AS firstFileNameShort
+ ,secondFile.name AS secondFileNameShort
+ ,firstFileName
+ ,secondFileName
+ ,pairwiseChange[$selected_pair_metric] AS selectedMetric
+ ,pairwiseChange.updateCommitLift AS updateCommitLift
+ ,pairwiseChange.updateCommitCount AS updateCommitCount
+ ,pairwiseChange.updateCommitMinConfidence AS updateCommitMinConfidence
+ ,pairwiseChange.updateCommitSupport AS updateCommitSupport
+ ,pairwiseChange.updateCommitJaccardSimilarity AS updateCommitJaccardSimilarity
+ ORDER BY selectedMetric DESC, firstFileName ASC, secondFileName ASC
+ LIMIT 20
+}
+RETURN fileExtensionPair
+ ,firstFileNameShort
+ ,secondFileNameShort
+ ,updateCommitCount
+ ,updateCommitMinConfidence
+ ,updateCommitLift
+ ,updateCommitJaccardSimilarity
+ ,updateCommitSupport
+ ,firstFileName
+ ,secondFileName
\ No newline at end of file
diff --git a/cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher b/cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher
index b4c66604e..331f7c96e 100644
--- a/cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher
+++ b/cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher
@@ -1,39 +1,24 @@
-// List pair of files that were changed together and that have a declared dependency between each other.
+// List pairs of files that were changed together and that have a declared dependency between each other. Requires Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher and Add_CHANGED_TOGETHER_WITH_relationships_to_code_files.cypher to run first.
MATCH (firstCodeFile:File)-[dependency:DEPENDS_ON]->(secondCodeFile:File)
MATCH (firstCodeFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondCodeFile)
-//De-duplicating the pairs of files isn't necessary, because the dependency relation is directed.
-//WHERE elementId(firstCodeFile) < elementId(secondCodeFile)
- WITH firstCodeFile.fileName AS firstFileName
- ,secondCodeFile.fileName AS secondFileName
+WHERE elementId(firstCodeFile) < elementId(secondCodeFile)
+ WITH firstCodeFile.fileName AS firstFileName
+ ,secondCodeFile.fileName AS secondFileName
,coalesce(dependency.weight, dependency.cardinality) AS dependencyWeight
- ,pairwiseChange.commitCount AS commitCount
- ,dependency.fileDistanceAsFewestChangeDirectoryCommands AS fileDistanceAsFewestChangeDirectoryCommands
+ ,dependency.fileDistanceAsFewestChangeDirectoryCommands AS fileDistance
+ ,pairwiseChange.updateCommitCount AS commitCount
+ ,pairwiseChange.updateCommitMinConfidence AS updateCommitMinConfidence
+ ,pairwiseChange.updateCommitSupport AS updateCommitSupport
+ ,pairwiseChange.updateCommitLift AS updateCommitLift
+ ,pairwiseChange.updateCommitJaccardSimilarity AS updateCommitJaccardSimilarity
RETURN dependencyWeight
+ ,fileDistance
,commitCount
- ,fileDistanceAsFewestChangeDirectoryCommands
+ ,updateCommitMinConfidence
+ ,updateCommitSupport
+ ,updateCommitLift
+ ,updateCommitJaccardSimilarity
// ,count(*) AS occurrences
// ,collect(firstFileName + ' -> ' + secondFileName)[0..3] AS examples
-ORDER BY dependencyWeight, commitCount
-
-// MATCH (firstCodeFile:File)-[dependency:DEPENDS_ON]->(secondCodeFile:File)
-// MATCH (firstCodeFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondCodeFile)
-// WHERE elementId(firstCodeFile) < elementId(secondCodeFile)
-// RETURN firstCodeFile.fileName AS firstFileName
-// ,secondCodeFile.fileName AS secondFileName
-// ,dependency.weight AS dependencyWeight
-// ,pairwiseChange.commitCount AS commitCount
-// ORDER BY dependencyWeight, commitCount
-
-// MATCH (g1:!Git&File)-[relation:CHANGED_TOGETHER_WITH|DEPENDS_ON]-(g2:!Git&File)
-// WITH count(DISTINCT relation) AS relatedFilesCount
-// ,collect(DISTINCT relation) AS relations
-// UNWIND relations AS relation
-// WITH relatedFilesCount
-// ,coalesce(relation.commitCount, 0) AS commitCount
-// ,coalesce(relation.weight, 0) AS dependencyWeight
-// ,coalesce(relation.fileDistanceAsFewestChangeDirectoryCommands, 0) AS fileDistanceAsFewestChangeDirectoryCommands
-// RETURN dependencyWeight
-// ,commitCount
-// ,fileDistanceAsFewestChangeDirectoryCommands
-// ORDER BY dependencyWeight, commitCount
+ORDER BY dependencyWeight, commitCount
\ No newline at end of file
diff --git a/cypher/GitLog/Set_number_of_git_plugin_update_commits.cypher b/cypher/GitLog/Set_number_of_git_plugin_update_commits.cypher
new file mode 100644
index 000000000..68a10a416
--- /dev/null
+++ b/cypher/GitLog/Set_number_of_git_plugin_update_commits.cypher
@@ -0,0 +1,11 @@
+// Set updateCommitCount property on Git File nodes when git commits with Update modifier (detected by the plugin) are present
+
+MATCH (git_file:File&Git)<-[:UPDATES]-(:Git&Change)<-[:CONTAINS_CHANGE]-(git_commit:Git&Commit)
+WHERE git_file.deletedAt IS NULL
+ WITH git_file, count(DISTINCT git_commit.sha) AS updateCommitCount
+ SET git_file.updateCommitCount = updateCommitCount
+ WITH git_file, updateCommitCount
+MATCH (code_file:File&!Git)<-[:RESOLVES_TO]-(git_file)
+ SET code_file.updateCommitCount = updateCommitCount
+RETURN count(DISTINCT code_file) AS codeFileUpdates
+ ,collect(DISTINCT code_file.name)[0..4] AS codeFileExample
\ No newline at end of file
diff --git a/cypher/GitLog/Verify_git_missing_CHANGED_TOGETHER_WITH_properties.cypher b/cypher/GitLog/Verify_git_missing_CHANGED_TOGETHER_WITH_properties.cypher
new file mode 100644
index 000000000..31e9e3e1b
--- /dev/null
+++ b/cypher/GitLog/Verify_git_missing_CHANGED_TOGETHER_WITH_properties.cypher
@@ -0,0 +1,6 @@
+// Verify if CHANGED_TOGETHER_WITH properties from git are missing
+
+MATCH (firstCodeFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondCodeFile)
+RETURN (pairwiseChange.updateCommitCount IS NULL) AS updateCommitCountMissing
+ ,(pairwiseChange.updateCommitMinConfidence IS NULL) AS updateCommitMinConfidenceMissing
+ ,count(*)
\ No newline at end of file
diff --git a/jupyter/GitHistoryExploration.ipynb b/jupyter/GitHistoryExploration.ipynb
new file mode 100644
index 000000000..b57b96949
--- /dev/null
+++ b/jupyter/GitHistoryExploration.ipynb
@@ -0,0 +1,427 @@
+{
+ "cells": [
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "2f0eabc4",
+ "metadata": {},
+ "source": [
+ "# git log/history\n",
+ "
\n",
+ "\n",
+ "### References\n",
+ "- [Visualizing Code: Polyglot Notebooks Repository (YouTube)](https://youtu.be/ipOpToPS-PY?si=3doePt2cp-LgEUmt)\n",
+ "- [gitstractor (GitHub)](https://github.com/IntegerMan/gitstractor)\n",
+ "- [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver/current)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4191f259",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from scipy.stats import pearsonr, spearmanr\n",
+ "import matplotlib.pyplot as plot\n",
+ "from matplotlib.colors import ListedColormap\n",
+ "from neo4j import GraphDatabase"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1c5dab37",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Please set the environment variable \"NEO4J_INITIAL_PASSWORD\" in your shell \n",
+ "# before starting jupyter notebook to provide the password for the user \"neo4j\". \n",
+ "# It is not recommended to hardcode the password into jupyter notebook for security reasons.\n",
+ "\n",
+ "driver = GraphDatabase.driver(uri=\"bolt://localhost:7687\", auth=(\"neo4j\", os.environ.get(\"NEO4J_INITIAL_PASSWORD\")))\n",
+ "driver.verify_connectivity()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c1db254b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_cypher_query_from_file(cypher_file_name : str):\n",
+ "    with open(cypher_file_name, encoding='utf-8') as file:  # Explicit UTF-8: Cypher query files may contain non-ASCII characters, so don't rely on the platform default encoding\n",
+ "        return ' '.join(file.readlines())\n",
+ "\n",
+ "\n",
+ "def query_cypher_to_data_frame(filename : str, limit: int = -1):\n",
+ " \"\"\"\n",
+ " Execute the Cypher query of the given file and returns the result.\n",
+ " filename : str : The name of the file containing the Cypher query\n",
+ " limit : int : The optional limit of rows to optimize the query. Default = -1 = no limit\n",
+ " \"\"\"\n",
+ " cypher_query = get_cypher_query_from_file(filename)\n",
+ " if limit > 0:\n",
+ " cypher_query = \"{query}\\nLIMIT {row_limit}\".format(query = cypher_query, row_limit = limit)\n",
+ " records, summary, keys = driver.execute_query(cypher_query)\n",
+ " return pd.DataFrame([r.values() for r in records], columns=keys)\n",
+ "\n",
+ "\n",
+ "def query_first_non_empty_cypher_to_data_frame(*filenames : str, limit: int = -1):\n",
+ " \"\"\"\n",
+ " Executes the Cypher queries of the given files and returns the first result that is not empty.\n",
+ " If all given file names result in empty results, the last (empty) result will be returned.\n",
+ "    By additionally specifying \"limit=\" the \"LIMIT\" keyword will be appended to the query so that only the first results get returned.\n",
+ " \"\"\" \n",
+ " result=pd.DataFrame()\n",
+ " for filename in filenames:\n",
+ " result=query_cypher_to_data_frame(filename, limit)\n",
+ " if not result.empty:\n",
+ " return result\n",
+ " return result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a56670c9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#The following cell uses the built-in %html \"magic\" to override the CSS style for tables to a much smaller size.\n",
+ "#This is especially needed for PDF export of tables with multiple columns."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "006b9dc8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%html\n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d565f2ed",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Main Colormap\n",
+ "# main_color_map = 'nipy_spectral'\n",
+ "main_color_map = 'viridis'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6323e85e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Pandas DataFrame Display Configuration\n",
+ "pd.set_option('display.max_colwidth', 500)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c15669ef",
+ "metadata": {},
+ "source": [
+ "## Pairwise Changed Files vs. Dependency Weight\n",
+ "\n",
+ "This section explores the correlation between how often pairs of files are changed together (common commit count) and their dependency weight. Note that these results should be interpreted cautiously, as comparing pairwise changes and dependencies is inherently challenging.\n",
+ "\n",
+ "### Considerations\n",
+ "- **Historical vs. Current State**: Pairwise changes reflect the entire git history, while dependency weight represents the current state of the codebase.\n",
+ "- **Commit Granularity**: Developers may use different commit strategies, such as squashing changes into a single commit or creating fine-grained commits. Ideally, each commit should represent a single semantic change for accurate analysis.\n",
+ "- **Dependency Representation**: Some file types (e.g., Java files with import statements) clearly define dependencies, while others (e.g., shell scripts, XML, YAML) lack explicit dependency relationships.\n",
+ "- **Repository Characteristics**: Repositories with generated code may have many large commits, while stabilized repositories may only update configuration files for dependency changes."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "98a2feea",
+ "metadata": {},
+ "source": [
+ "#### Data Preview"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a067f8e6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pairwise_changed_git_files_with_dependencies = query_cypher_to_data_frame(\"../cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher\")\n",
+ "pairwise_changed_git_files_with_dependencies.head(10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "01db2db9",
+ "metadata": {},
+ "source": [
+ "#### Data Statistics"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9fe48db8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display(\"Pairwise changed git files compared to dependency weights - Overall statistics\")\n",
+ "display(pairwise_changed_git_files_with_dependencies.describe())\n",
+ "\n",
+ "# The correlation matrix plot can be found further below\n",
+ "# display(\"Pairwise changed git files compared to dependency weights - Pearson Correlation\")\n",
+ "# display(pairwise_changed_git_files_with_dependencies.corr(method='pearson'))\n",
+ "\n",
+ "# display(\"Pairwise changed git files compared to dependency weights - Spearman Correlation\")\n",
+ "# display(pairwise_changed_git_files_with_dependencies.corr(method='spearman'))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5a4ae651",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if pairwise_changed_git_files_with_dependencies.shape[0] < 5:\n",
+ " print(\"Less than 5 samples are not enough to calculate p-values\")\n",
+ "else:\n",
+ " display(\"Pearson Correlation with p-value for commitCount and dependencyWeight\")\n",
+ " display(pearsonr(pairwise_changed_git_files_with_dependencies['commitCount'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))\n",
+ "\n",
+ " display(\"Spearman Correlation with p-value for commitCount and dependencyWeight\")\n",
+ " display(spearmanr(pairwise_changed_git_files_with_dependencies['commitCount'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))\n",
+ "\n",
+ " display(\"Pearson Correlation with p-value for updateCommitMinConfidence and dependencyWeight\")\n",
+ " display(pearsonr(pairwise_changed_git_files_with_dependencies['updateCommitMinConfidence'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))\n",
+ "\n",
+ " display(\"Spearman Correlation with p-value for updateCommitMinConfidence and dependencyWeight\")\n",
+ " display(spearmanr(pairwise_changed_git_files_with_dependencies['updateCommitMinConfidence'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "32125058",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def plot_correlation_matrix(correlation_matrix: pd.DataFrame, title_suffix: str = \"\") -> None:\n",
+ " \"\"\"\n",
+ " Plots the correlation matrix of the features in the DataFrame.\n",
+ " \n",
+ "    :param correlation_matrix: DataFrame containing the correlation matrix to plot, e.g. the result of DataFrame.corr().\n",
+ "    :param title_suffix: Optional text that is appended to the chart title, e.g. the correlation method.\n",
+ " \"\"\"\n",
+ " figure, axis = plot.subplots(figsize=(6, 6))\n",
+ " color_axis = axis.matshow(correlation_matrix, cmap=\"coolwarm\")\n",
+ " figure.colorbar(color_axis)\n",
+ " axis.set_xticks(range(len(correlation_matrix.columns)))\n",
+ " axis.set_yticks(range(len(correlation_matrix.index)))\n",
+ " axis.set_xticklabels(correlation_matrix.columns, rotation=90, fontsize=8)\n",
+ " axis.set_yticklabels(correlation_matrix.index, fontsize=8)\n",
+ " for (i, j), correlation_value in np.ndenumerate(correlation_matrix.values):\n",
+ " axis.text(j, i, f\"{correlation_value:.2f}\", ha='center', va='center', color='black', fontsize=8, bbox=dict(facecolor='white', alpha=0.2, edgecolor='none'))\n",
+ " plot.title(f\"Correlation Matrix {title_suffix}\", fontsize=10)\n",
+ " plot.tight_layout()\n",
+ " plot.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5a375279",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plot_correlation_matrix(pairwise_changed_git_files_with_dependencies.corr(method=\"pearson\"), \"(Pearson)\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9efa7d61",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plot_correlation_matrix(pairwise_changed_git_files_with_dependencies.corr(method=\"spearman\"), \"(Spearman)\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f17b39b4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def pvalue_matrix(data: pd.DataFrame) -> pd.DataFrame:\n",
+ " columns = data.columns\n",
+ " # Fill the diagonal with value 1.0. \n",
+ "    # A p-value for 2 identical values would lead to warnings and not reveal any valuable insights.\n",
+ " p_values = pd.DataFrame(np.ones((len(columns), len(columns))), columns=columns, index=columns)\n",
+ "\n",
+ " for i in range(len(columns)):\n",
+ " for j in range(i+1, len(columns)):\n",
+ " _, p_value = pearsonr(data[columns[i]], data[columns[j]])\n",
+ " if np.isnan(p_value): # replace nan with 1.0 = no significance\n",
+ " p_value = 1.0\n",
+ " if np.isclose(p_value, 0.0, rtol=1e-15, atol=1e-15): # replace values to close to zero by 1 = no significance\n",
+ " p_value = 1.0\n",
+ " p_values.iloc[i, j] = p_value\n",
+ " p_values.iloc[j, i] = p_value\n",
+ "\n",
+ " return p_values\n",
+ "\n",
+ "def plot_p_value_matrix(p_value_matrix: pd.DataFrame):\n",
+ "\n",
+ " # Map values to 0 (green) and 1 (white)\n",
+ " data_for_plot = np.where(p_value_matrix < 0.05, 0, 1)\n",
+ "\n",
+ " # Make a colormap: green for low p-values, white for others\n",
+ " color_map = ListedColormap([\"limegreen\", \"white\"])\n",
+ "\n",
+ " # Plot heatmap\n",
+ " figure, axis = plot.subplots(figsize=(6, 5))\n",
+ " image = axis.imshow(data_for_plot, cmap=color_map, vmin=0, vmax=1)\n",
+ "\n",
+ " # Add colorbar\n",
+ " # color_bar = plot.colorbar(image, ax=axis)\n",
+ " # color_bar.set_label(\"p-value\")\n",
+ "\n",
+ " # Show all ticks\n",
+ " axis.set_xticks(np.arange(len(p_value_matrix.columns)))\n",
+ " axis.set_yticks(np.arange(len(p_value_matrix.index)))\n",
+ " axis.set_xticklabels(p_value_matrix.columns, fontsize=8)\n",
+ " axis.set_yticklabels(p_value_matrix.index, fontsize=8)\n",
+ "\n",
+ " # Rotate tick labels\n",
+ " plot.setp(axis.get_xticklabels(), rotation=45, ha=\"right\", rotation_mode=\"anchor\", fontsize=8)\n",
+ "\n",
+ " # Annotate with values\n",
+ " for i in range(len(p_value_matrix.columns)):\n",
+ " for j in range(len(p_value_matrix.index)):\n",
+ " cell_value = p_value_matrix.iloc[i, j]\n",
+ " if cell_value < 0.001:\n",
+ " cell_text = f\"{cell_value:.1e}\" # scientific notation\n",
+ " else:\n",
+ " cell_text = f\"{cell_value:.4f}\" # normal 4-decimal format\n",
+ " axis.text(j, i, cell_text, ha=\"center\", va=\"center\", color=\"black\", fontsize=6)\n",
+ "\n",
+ " plot.title(\"p-value Matrix (< 0.05 in Green)\", pad=20, fontsize=10)\n",
+ " plot.tight_layout()\n",
+ " plot.show()\n",
+ "\n",
+ "# Plot p-values showing statistical significance\n",
+ "plot_p_value_matrix(pvalue_matrix(pairwise_changed_git_files_with_dependencies))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5c799b77",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def plot_git_changes_vs_dependencies(pairwise_changes: pd.DataFrame, title: str, x_column: str, y_column: str):\n",
+ " if pairwise_changes.empty:\n",
+ " print(\"No projected data to plot.\")\n",
+ " return\n",
+ "\n",
+ " plot.scatter(\n",
+ " x=pairwise_changes[x_column],\n",
+ " y=pairwise_changes[y_column],\n",
+ " s=3,\n",
+ " )\n",
+ " plot.xlabel(x_column)\n",
+ " plot.ylabel(y_column)\n",
+ " plot.title(title, pad=20)\n",
+ " plot.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "747f9590",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Scatter plot of all pairs of files with their commit count on the x axis and dependency weight on the y axis\n",
+ "\n",
+ "plot_git_changes_vs_dependencies(\n",
+ " pairwise_changed_git_files_with_dependencies,\n",
+ " 'Pairwise changed files: Number of changes (commitCount) vs. dependency weight',\n",
+ " 'commitCount',\n",
+ " 'dependencyWeight'\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "75264b82",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Scatter plot of all pairs of files with their min confidence (normalized update commit count) on the x axis and dependency weight on the y axis\n",
+ "\n",
+ "plot_git_changes_vs_dependencies(\n",
+ " pairwise_changed_git_files_with_dependencies,\n",
+ " 'Pairwise changed files: Min confidence co-change rate vs. dependency weight',\n",
+ " 'updateCommitMinConfidence',\n",
+ " 'dependencyWeight'\n",
+ ")"
+ ]
+ }
+ ],
+ "metadata": {
+ "authors": [
+ {
+ "name": "JohT"
+ }
+ ],
+ "code_graph_analysis_pipeline_data_validation": "ValidateAlwaysFalse",
+ "kernelspec": {
+ "display_name": "codegraph",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.9"
+ },
+ "title": "Git History Charts with Neo4j (Additional Manual Exploration)"
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/jupyter/GitHistoryGeneral.ipynb b/jupyter/GitHistoryGeneral.ipynb
index b8b6aa112..117c03746 100644
--- a/jupyter/GitHistoryGeneral.ipynb
+++ b/jupyter/GitHistoryGeneral.ipynb
@@ -37,7 +37,8 @@
"source": [
"from neo4j import GraphDatabase\n",
"from plotly import graph_objects as plotly_graph_objects\n",
- "from plotly.express import colors as plotly_colors"
+ "from plotly.express import colors as plotly_colors\n",
+ "from plotly.subplots import make_subplots"
]
},
{
@@ -195,7 +196,7 @@
"# Base settings for Plotly Treemap\n",
"\n",
"plotly_main_layout_base_settings = dict(\n",
- " margin=dict(t=50, l=15, r=15, b=15),\n",
+ " margin=dict(t=80, l=15, r=15, b=15),\n",
")\n",
"plotly_treemap_layout_base_settings = dict(\n",
" **plotly_main_layout_base_settings\n",
@@ -1168,46 +1169,134 @@
},
{
"cell_type": "markdown",
- "id": "80bd7c28",
+ "id": "d8c6ccee",
+ "metadata": {},
+ "source": [
+ "## Filecount per commit\n",
+ "\n",
+ "Shows how many commits had changed one file, how many had changed two files, and so on.\n",
+ "The chart is limited to 30 lines for improved readability.\n",
+ "The data preview also includes overall statistics including the number of commits that are filtered out in the chart."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ed53b6e5",
+ "metadata": {},
+ "source": [
+ "### Preview data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5526e458",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "git_file_count_per_commit = query_cypher_to_data_frame(\"../cypher/GitLog/List_git_files_per_commit_distribution.cypher\")\n",
+ "\n",
+ "print(\"Sum of commits that changed more than 30 files (each) = \" + str(git_file_count_per_commit[git_file_count_per_commit['filesPerCommit'] > 30]['commitCount'].sum()))\n",
+ "print(\"Max changed files with one commit = \" + str(git_file_count_per_commit['filesPerCommit'].max()))\n",
+ "display(git_file_count_per_commit.describe())\n",
+ "display(git_file_count_per_commit.head(30))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dcea826e",
+ "metadata": {},
+ "source": [
+ "### Bar chart with the number of files per commit distribution"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9e9dbc57",
"metadata": {},
+ "outputs": [],
"source": [
- "### File changed frequently with other files"
+ "if git_file_count_per_commit.empty:\n",
+ " print(\"No data to plot\")\n",
+ "else:\n",
+ " figure = plotly_graph_objects.Figure(plotly_graph_objects.Bar(\n",
+ " x=git_file_count_per_commit['filesPerCommit'].head(30), \n",
+ " y=git_file_count_per_commit['commitCount'].head(30)),\n",
+ " )\n",
+ " figure.update_layout(\n",
+ " **plotly_bar_layout_base_settings,\n",
+ " title='Changed files per commit',\n",
+ " xaxis_title='file count',\n",
+ " yaxis_title='commit count'\n",
+ " )\n",
+ " figure.show(**plotly_treemap_figure_show_settings)\n",
+ " if is_command_line_execution():\n",
+ " figure.write_image(**get_plotly_figure_write_image_settings(\"ChangedFilesPerCommit\"))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "322d6cf9",
+ "metadata": {},
+ "source": [
+ "## Pairwise Changed Files\n",
+ "\n",
+ "This section analyzes files that were changed together within the same commit and provides several metrics to quantify the strength of the co-change relationship:\n",
+ "\n",
+ "- **Commit Count**: The number of commits in which two files were changed together.\n",
+ "- **Commit Lift**: A ratio that indicates whether the co-change pattern is stronger than random chance, given how often each file changes.\n",
+ "- **Jaccard Similarity**: The ratio of commits involving either file that also involved both files.\n",
+ "\n",
+ "The following tables show the top pairwise changed files based on these metrics.\n",
+ "The following charts show how these metrics are distributed across pairs of files that were changed together."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4c081f85",
+ "metadata": {},
+ "source": [
+ "### Treemap with files changed frequently with others"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "24055998",
+ "id": "30942bd4",
"metadata": {},
"outputs": [],
"source": [
- "pairwise_changed_git_files = query_cypher_to_data_frame(\"../cypher/GitLog/List_git_files_that_were_changed_together_with_another_file.cypher\")\n",
+ "data_to_display = query_cypher_to_data_frame(\"../cypher/GitLog/List_git_files_that_were_changed_together_with_another_file.cypher\")\n",
"\n",
"# Debug\n",
"# display(\"1. pairwise changed files --------------\")\n",
"# display(pairwise_changed_git_files)\n",
"\n",
"# Add multiple rows for each file path containing all its directories paths in the new column 'directoryPath'\n",
- "pairwise_changed_git_files = add_directory_column(pairwise_changed_git_files, 'filePath', 'directoryPath')\n",
+ "data_to_display = add_directory_column(data_to_display, 'filePath', 'directoryPath')\n",
"\n",
"# Debug\n",
"# display(\"2. added directories --------------\")\n",
"# display(pairwise_changed_git_files)\n",
"\n",
"# Group the git files by their directory and author and count the number of files of each directory (across all levels).\n",
- "pairwise_changed_git_files = pairwise_changed_git_files.groupby(['directoryPath']).aggregate(\n",
+ "data_to_display = data_to_display.groupby(['directoryPath']).aggregate(\n",
" pairwiseChangeCommitCount=pd.NamedAgg(column=\"commitCount\", aggfunc=\"sum\"),\n",
" pairwiseChangeFileCount=pd.NamedAgg(column=\"filePath\", aggfunc=\"count\"),\n",
+ " pairwiseChangeAverageRate=pd.NamedAgg(column=\"coChangeRate\", aggfunc=\"mean\"),\n",
+ " pairwiseChangeMaxLift=pd.NamedAgg(column=\"maxLift\", aggfunc=\"max\"),\n",
+ " pairwiseChangeAverageLift=pd.NamedAgg(column=\"avgLift\", aggfunc=\"mean\"),\n",
")\n",
- "pairwise_changed_git_files.reset_index(inplace=True)\n",
+ "data_to_display.reset_index(inplace=True)\n",
"\n",
"# Debug\n",
"# display(\"3. after grouping --------------\")\n",
"# display(pairwise_changed_git_files)\n",
"\n",
- "pairwise_changed_git_files = pd.merge(\n",
+ "data_to_display = pd.merge(\n",
" git_files_with_commit_statistics, \n",
- " pairwise_changed_git_files, \n",
+ " data_to_display, \n",
" left_on='directoryPath', \n",
" right_on=\"directoryPath\",\n",
" how=\"left\",\n",
@@ -1218,9 +1307,12 @@
"# display(\"4. after merging --------------\")\n",
"# display(pairwise_changed_git_files)\n",
"\n",
- "pairwise_changed_git_files['pairwiseChangeCommitCount'] = pairwise_changed_git_files['pairwiseChangeCommitCount'].fillna(0).astype(int)\n",
- "pairwise_changed_git_files['pairwiseChangeFileCount'] = pairwise_changed_git_files['pairwiseChangeFileCount'].fillna(0).astype(int)\n",
- "pairwise_changed_git_files.reset_index(inplace=True)\n",
+ "data_to_display['pairwiseChangeCommitCount'] = data_to_display['pairwiseChangeCommitCount'].fillna(0).astype(int)\n",
+ "data_to_display['pairwiseChangeFileCount'] = data_to_display['pairwiseChangeFileCount'].fillna(0).astype(int)\n",
+ "data_to_display['pairwiseChangeAverageRate'] = data_to_display['pairwiseChangeAverageRate'].fillna(0).astype(float)\n",
+ "data_to_display['pairwiseChangeMaxLift'] = data_to_display['pairwiseChangeMaxLift'].fillna(0).astype(float)\n",
+ "data_to_display['pairwiseChangeAverageLift'] = data_to_display['pairwiseChangeAverageLift'].fillna(0).astype(float)\n",
+ "data_to_display.reset_index(inplace=True)\n",
"\n",
"# Debug\n",
"# display(\"5. after NaN fill --------------\")\n",
@@ -1230,205 +1322,621 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "19b5a98a",
+ "id": "1052776d",
"metadata": {},
"outputs": [],
"source": [
- "pairwise_changed_git_files = add_quantile_limited_column(pairwise_changed_git_files, \"pairwiseChangeCommitCount\", 0.98)\n",
+ "data_to_display = add_quantile_limited_column(data_to_display, \"pairwiseChangeCommitCount\", 0.98)\n",
"\n",
"figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n",
- " create_treemap_commit_statistics_settings(pairwise_changed_git_files),\n",
+ " create_treemap_commit_statistics_settings(data_to_display),\n",
" # Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n",
" # values = pairwise_changed_git_files['fileCount'],\n",
" marker=dict(\n",
" **plotly_treemap_marker_base_colorscale,\n",
- " colors=pairwise_changed_git_files['pairwiseChangeCommitCount_limited'], \n",
- " colorbar=dict(title=\"Changes\"),\n",
+ " colors=data_to_display['pairwiseChangeCommitCount_limited'], \n",
+ " colorbar=dict(title=\"Co-Changes\"),\n",
" ),\n",
"))\n",
"figure.update_layout(\n",
" **plotly_treemap_layout_base_settings,\n",
- " title='Pairwise file changes',\n",
+ " title='Files that likely co-change with others in update commits',\n",
")\n",
"figure.show(**plotly_treemap_figure_show_settings)\n",
"if is_command_line_execution():\n",
- " figure.write_image(**get_plotly_figure_write_image_settings(\"PairwiseFileChanges\"))"
+ " figure.write_image(**get_plotly_figure_write_image_settings(\"CoChangingFiles\"))"
]
},
{
- "cell_type": "markdown",
- "id": "d8c6ccee",
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3ec95adf",
"metadata": {},
+ "outputs": [],
"source": [
- "## Filecount per commit\n",
+ "data_to_display = add_quantile_limited_column(data_to_display, \"pairwiseChangeMaxLift\", 0.98)\n",
"\n",
- "Shows how many commits had changed one file, how many had changed two files, and so on.\n",
- "The chart is limited to 30 lines for improved readability.\n",
- "The data preview also includes overall statistics including the number of commits that are filtered out in the chart."
+ "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n",
+ " create_treemap_commit_statistics_settings(data_to_display),\n",
+ " # Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n",
+ " # values = data_to_display['fileCount'],\n",
+ " marker=dict(\n",
+ " **plotly_treemap_marker_base_colorscale,\n",
+ " colors=data_to_display['pairwiseChangeMaxLift_limited'], \n",
+ " colorbar=dict(title=\"Co-Change Lift\"),\n",
+ " ),\n",
+ "))\n",
+ "figure.update_layout(\n",
+ " **plotly_treemap_layout_base_settings,\n",
+ " title='Co-Changing files in update commits max lift (1=random, >1=more than random, <1=less than random)',\n",
+ ")\n",
+ "figure.show(**plotly_treemap_figure_show_settings)\n",
+ "if is_command_line_execution():\n",
+ " figure.write_image(**get_plotly_figure_write_image_settings(\"CoChangingFilesMaxLift\"))"
]
},
{
- "cell_type": "markdown",
- "id": "ed53b6e5",
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0e33b873",
"metadata": {},
+ "outputs": [],
"source": [
- "### Preview data"
+ "data_to_display = add_quantile_limited_column(data_to_display, \"pairwiseChangeAverageLift\", 0.98)\n",
+ "\n",
+ "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n",
+ " create_treemap_commit_statistics_settings(data_to_display),\n",
+ " # Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n",
+ " # values = data_to_display['fileCount'],\n",
+ " marker=dict(\n",
+ " **plotly_treemap_marker_base_colorscale,\n",
+ " colors=data_to_display['pairwiseChangeAverageLift_limited'], \n",
+ " colorbar=dict(title=\"Co-Change Lift\"),\n",
+ " ),\n",
+ "))\n",
+ "figure.update_layout(\n",
+ " **plotly_treemap_layout_base_settings,\n",
+ " title='Co-Changing files in update commits average lift (1=random, >1=more than random, <1=less than random)',\n",
+ ")\n",
+ "figure.show(**plotly_treemap_figure_show_settings)\n",
+ "if is_command_line_execution():\n",
+ " figure.write_image(**get_plotly_figure_write_image_settings(\"CoChangingFilesAverageLift\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "5526e458",
+ "id": "e7721dfd",
"metadata": {},
"outputs": [],
"source": [
- "git_file_count_per_commit = query_cypher_to_data_frame(\"../cypher/GitLog/List_git_files_per_commit_distribution.cypher\")\n",
- "\n",
- "print(\"Sum of commits that changed more than 30 files (each) = \" + str(git_file_count_per_commit[git_file_count_per_commit['filesPerCommit'] > 30]['commitCount'].sum()))\n",
- "print(\"Max changed files with one commit = \" + str(git_file_count_per_commit['filesPerCommit'].max()))\n",
- "display(git_file_count_per_commit.describe())\n",
- "display(git_file_count_per_commit.head(30))"
+ "# Initial steps: Function Declaration and Data Preparation for co-change distribution analysis"
]
},
{
- "cell_type": "markdown",
- "id": "dcea826e",
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f97aba8f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pairwise_changed_git_files = query_cypher_to_data_frame(\"../cypher/GitLog/List_pairwise_changed_files.cypher\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8f874da0",
"metadata": {},
+ "outputs": [],
"source": [
- "### Bar chart with the number of files per commit distribution"
+ "def find_top_pairwise_changed_file_extensions(input_data: pd.DataFrame, top_n: int = 10) -> pd.Series:\n",
+ " \"\"\"\n",
+ " Finds the top N file extension pairs ranked by the number of pairwise changed file pairs they contain.\n",
+ " input_data : pd.DataFrame : DataFrame containing pairwise changed files with their pair counts and extensions\n",
+ " top_n : int : The number of top extensions to return\n",
+ " return : pd.Series : Series with top N file extension pairs sorted by their pair count descending\n",
+ " \"\"\"\n",
+ " top_extensions = input_data.groupby('fileExtensionPair').aggregate({'filePairWithRelativePath': 'count'}).reset_index()\n",
+ " return top_extensions.sort_values(by='filePairWithRelativePath', ascending=False).reset_index(drop=True).head(top_n)['fileExtensionPair']"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "9e9dbc57",
+ "id": "7e228e63",
"metadata": {},
"outputs": [],
"source": [
- "if git_file_count_per_commit.empty:\n",
- " print(\"No data to plot\")\n",
- "else:\n",
- " figure = plotly_graph_objects.Figure(plotly_graph_objects.Bar(\n",
- " x=git_file_count_per_commit['filesPerCommit'].head(30), \n",
- " y=git_file_count_per_commit['commitCount'].head(30)),\n",
+ "top_pairwise_changed_file_extensions = find_top_pairwise_changed_file_extensions(pairwise_changed_git_files, top_n=4)\n",
+ "# Only keep the pairwise change files with the top file extensions\n",
+ "pairwise_changed_git_files = pairwise_changed_git_files[pairwise_changed_git_files['fileExtensionPair'].isin(top_pairwise_changed_file_extensions)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8c07abbf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def add_file_extension_rank_column(data_frame: pd.DataFrame, column_name: str):\n",
+ " \"\"\"\n",
+ " Adds a 'fileExtensionPair' based rank column to the DataFrame for the value of the specified column.\n",
+ " data_frame : pd.DataFrame : The input DataFrame\n",
+ " column_name : str : The name of the column to rank\n",
+ " return : pd.DataFrame : The DataFrame with added rank column\n",
+ " \"\"\"\n",
+ " if column_name + '_rank' in data_frame.columns:\n",
+ " return data_frame # Column already exists\n",
+ " \n",
+ " # Rank the values of the specified column within each 'fileExtensionPair' group (dense ranking, highest value gets rank 1)\n",
+ " data_frame[f\"{column_name}ExtensionRank\"] = data_frame.groupby('fileExtensionPair')[column_name].rank(ascending=False, method='dense').astype(int)\n",
+ " return data_frame"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "84b01643",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pairwise_changed_git_files = add_file_extension_rank_column(pairwise_changed_git_files, \"updateCommitCount\")\n",
+ "pairwise_changed_git_files = add_file_extension_rank_column(pairwise_changed_git_files, \"updateCommitMinConfidence\")\n",
+ "pairwise_changed_git_files = add_file_extension_rank_column(pairwise_changed_git_files, \"updateCommitJaccardSimilarity\")\n",
+ "pairwise_changed_git_files = add_file_extension_rank_column(pairwise_changed_git_files, \"updateCommitLift\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ad158020",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def display_table_for_top_pairwise_changed_file_extensions(\n",
+ " data_to_display: pd.DataFrame, \n",
+ " metric_column: str,\n",
+ " top_n: int = 10\n",
+ " ):\n",
+ " \"\"\"\n",
+ " Displays a table containing the top N ranked pairwise changed file extensions based on the specified metric column.\n",
+ " data_to_display : pd.DataFrame : DataFrame containing pairwise changed files with their metric and metric rank (ExtensionRank) columns\n",
+ " metric_column : str : The column to sort the data by\n",
+ " top_n : int : The number of top entries to display for each extension (default is 10)\n",
+ " \"\"\"\n",
+ " filtered_data = data_to_display[data_to_display[metric_column + \"ExtensionRank\"] <= top_n]\n",
+ " \n",
+ " # Group by the file extensions and the metric and its rank.\n",
+ " # Since some entries might have the same metric value, we aggregate by the first file pair with relative path and the first file pair.\n",
+ " # This way we can pick the top n entries for each file extension pair.\n",
+ " grouping_columns = [\"fileExtensionPair\", metric_column, metric_column + \"ExtensionRank\"]\n",
+ " grouped_data = filtered_data.groupby(grouping_columns).aggregate(\n",
+ " filePair=pd.NamedAgg(column=\"filePair\", aggfunc=\"first\"),\n",
+ " filePairWithRelativePath=pd.NamedAgg(column=\"filePairWithRelativePath\", aggfunc=\"first\"),\n",
+ " ).reset_index()\n",
+ " \n",
+ " return grouped_data.sort_values(by=grouping_columns, ascending=[True, False, True]).reset_index(drop=True).rename(columns={metric_column + \"ExtensionRank\": \"GroupRank\"})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3c34ceea",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# TODO delete if not needed anymore\n",
+ "\n",
+ "def display_table_for_top_pairwise_changed_file_extensions_deprecated(\n",
+ " data_to_display: pd.DataFrame, \n",
+ " top_pairwise_changed_file_extensions: pd.Series,\n",
+ " sort_column: str,\n",
+ " top_n: int = 10\n",
+ " ):\n",
+ " \"\"\"\n",
+ " Displays a table for each top pairwise changed file extension.\n",
+ " data_to_display : pd.DataFrame : DataFrame containing pairwise changed files with their commit counts\n",
+ " top_pairwise_changed_file_extensions : pd.Series : Series with top N file extension pairs sorted by their pair count descending\n",
+ " sort_column : str : The column to sort the data by (descending)\n",
+ " top_n : int : The number of top entries to display for each extension (default is 10)\n",
+ " \"\"\"\n",
+ " \n",
+ " if data_to_display.empty:\n",
+ " print(\"No data to display\")\n",
+ " return\n",
+ " \n",
+ " if top_pairwise_changed_file_extensions.empty:\n",
+ " print(\"No top pairwise changed file extensions to display\")\n",
+ " return\n",
+ "\n",
+ " # Display each top pairwise changed file extension with its corresponding data\n",
+ " selected_columns = [\"fileExtensionPair\", \"filePair\", sort_column, \"filePairWithRelativePath\"]\n",
+ " data_to_display = data_to_display[selected_columns]\n",
+ " \n",
+ " combined_data_for_top_extensions = pd.DataFrame().reindex_like(data_to_display.head(0)) # Create an empty DataFrame with the same columns as data_to_display\n",
+ " \n",
+ " for _, extension in enumerate(top_pairwise_changed_file_extensions, start=1):\n",
+ " filtered_data = data_to_display[data_to_display[\"fileExtensionPair\"] == extension]\n",
+ " sorted_data = filtered_data.sort_values(by=sort_column, ascending=False).head(top_n).reset_index()\n",
+ " combined_data_for_top_extensions = pd.concat([combined_data_for_top_extensions, sorted_data], ignore_index=True)\n",
+ " \n",
+ " display(combined_data_for_top_extensions)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5aeca70e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def plot_histogram_of_pairwise_changed_files(\n",
+ " data_to_plot: pd.DataFrame,\n",
+ " top_pairwise_changed_file_extensions: pd.Series,\n",
+ " x_axis_column: str = \"updateCommitCount\",\n",
+ " x_axis_label: str = \"Commit Count\",\n",
+ " output_file_name: str = \"CoChangedFilesByCommitCount\",\n",
+ " sub_plot_rows: int = 4, \n",
+ " sub_plot_columns: int = 1,\n",
+ " ):\n",
+ " \"\"\"\n",
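+ " Plots one histogram subplot per top file extension pair showing the distribution of the given metric column.\n",
+ " data_to_plot : pd.DataFrame : DataFrame containing pairwise changed files with their metrics\n",
+ " top_pairwise_changed_file_extensions : pd.Series : Top file extension pairs (one subplot each); their number must equal sub_plot_rows * sub_plot_columns\n",
+ " x_axis_column : str : The metric column to plot on the x axis (default is \"updateCommitCount\")\n",
+ " x_axis_label : str : The label of the x axis, also used in the chart title\n",
+ " output_file_name : str : The name passed to the image export settings when executed from the command line\n",
+ " sub_plot_rows : int : The number of subplot rows (default is 4)\n",
+ " sub_plot_columns : int : The number of subplot columns (default is 1)\n",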
+ " \"\"\"\n",
+ "\n",
+ " if data_to_plot.empty:\n",
+ " print(\"No data to plot\")\n",
+ " return\n",
+ " \n",
+ " if top_pairwise_changed_file_extensions.size != sub_plot_rows * sub_plot_columns:\n",
+ " raise ValueError(f\"Number of top pairwise changed file extensions ({top_pairwise_changed_file_extensions.size}) does not match the number of subplots ({sub_plot_rows * sub_plot_columns}).\")\n",
+ "\n",
+ " figure = make_subplots(\n",
+ " rows=sub_plot_rows, \n",
+ " cols=sub_plot_columns, \n",
+ " subplot_titles=top_pairwise_changed_file_extensions,\n",
+ " vertical_spacing=0.04, \n",
+ " horizontal_spacing=0.04\n",
" )\n",
+ "\n",
+ " # Add one subplot per extension pair\n",
+ " for index, extension in enumerate(top_pairwise_changed_file_extensions, start=1):\n",
+ " row = (index - 1) // sub_plot_columns + 1\n",
+ " column = (index - 1) % sub_plot_columns + 1\n",
+ "\n",
+ " data_for_subplot = data_to_plot[data_to_plot[\"fileExtensionPair\"] == extension]\n",
+ "\n",
+ " figure.add_trace(\n",
+ " plotly_graph_objects.Histogram(\n",
+ " x=data_for_subplot[x_axis_column],\n",
+ " text=data_for_subplot[\"filePairLineBreak\"],\n",
+ " textposition=\"inside\",\n",
+ " hovertext=data_for_subplot[\"filePairWithRelativePath\"],\n",
+ " nbinsx=40,\n",
+ " textfont=dict(size=12, color=\"white\"),\n",
+ " name=extension,\n",
+ " ),\n",
+ " row=row,\n",
+ " col=column\n",
+ " )\n",
+ " # Make subplot title larger\n",
+ " figure.layout.annotations[index - 1].update(font=dict(size=18))\n",
+ " # Label subplot x axis \n",
+ " figure.update_xaxes(title_text=x_axis_label, row=row, col=column)\n",
+ " # Label subplot y axis and make it logarithmic\n",
+ " figure.update_yaxes(title_text=\"File Pair Count (log)\", type=\"log\", row=row, col=column)\n",
+ "\n",
" figure.update_layout(\n",
- " **plotly_bar_layout_base_settings,\n",
- " title='Changed files per commit',\n",
- " xaxis_title='file count',\n",
- " yaxis_title='commit count'\n",
+ " margin=dict(t=100, l=10, r=10, b=10),\n",
+ " title=\"Co-Changed Files by their \" + x_axis_label.lower(),\n",
+ " title_font_size=20,\n",
+ " title_y=0.99,\n",
+ " bargap=0.05,\n",
+ " height=2000,\n",
+ " width=1000,\n",
+ " showlegend=False\n",
" )\n",
+ "\n",
" figure.show(**plotly_treemap_figure_show_settings)\n",
" if is_command_line_execution():\n",
- " figure.write_image(**get_plotly_figure_write_image_settings(\"ChangedFilesPerCommit\"))"
+ " figure.write_image(**get_plotly_figure_write_image_settings(output_file_name))"
]
},
{
"cell_type": "markdown",
- "id": "c15669ef",
+ "id": "fff50751",
"metadata": {},
"source": [
- "## Pairwise Changed Files vs. Dependency Weight\n",
- "\n",
- "This section explores the correlation between how often pairs of files are changed together (common commit count) and their dependency weight. Note that these results should be interpreted cautiously, as comparing pairwise changes and dependencies is inherently challenging.\n",
+ "### Files changed together by commit count"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1cd03b3f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_table_for_top_pairwise_changed_file_extensions(\n",
+ " pairwise_changed_git_files,\n",
+ " \"updateCommitCount\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "721c2c8a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plot_histogram_of_pairwise_changed_files(\n",
+ " data_to_plot = pairwise_changed_git_files,\n",
+ " top_pairwise_changed_file_extensions = top_pairwise_changed_file_extensions,\n",
+ " x_axis_column = \"updateCommitCount\",\n",
+ " x_axis_label = \"Commit Count\",\n",
+ " output_file_name = \"CoChangedFilesByCommitCount\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "55be3351",
+ "metadata": {},
+ "source": [
+ "### Files changed together by commit min confidence\n",
"\n",
- "### Considerations\n",
- "- **Historical vs. Current State**: Pairwise changes reflect the entire git history, while dependency weight represents the current state of the codebase.\n",
- "- **Commit Granularity**: Developers may use different commit strategies, such as squashing changes into a single commit or creating fine-grained commits. Ideally, each commit should represent a single semantic change for accurate analysis.\n",
- "- **Dependency Representation**: Some file types (e.g., Java files with import statements) clearly define dependencies, while others (e.g., shell scripts, XML, YAML) lack explicit dependency relationships.\n",
- "- **Repository Characteristics**: Repositories with generated code may have many large commits, while stabilized repositories may only update configuration files for dependency changes."
+ "The commit min confidence is the commit count where both files were changed divided by the commit count of the file with the least commits.\n",
+ "This metric is useful to identify pairs of files that are frequently changed together and is not biased by single files that are changed very often."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a1c9df18",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_table_for_top_pairwise_changed_file_extensions(\n",
+ " pairwise_changed_git_files,\n",
+ " \"updateCommitMinConfidence\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7a54edcd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plot_histogram_of_pairwise_changed_files(\n",
+ " data_to_plot = pairwise_changed_git_files,\n",
+ " top_pairwise_changed_file_extensions = top_pairwise_changed_file_extensions,\n",
+ " x_axis_column = \"updateCommitMinConfidence\",\n",
+ " x_axis_label = \"Commit Min Confidence\",\n",
+ " output_file_name = \"CoChangedFilesByCommitMinConfidence\"\n",
+ ")"
]
},
{
"cell_type": "markdown",
- "id": "98a2feea",
+ "id": "132fd688",
"metadata": {},
"source": [
- "#### Data Preview"
+ "### Files changed together by commit lift"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9d05088a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_table_for_top_pairwise_changed_file_extensions(\n",
+ " pairwise_changed_git_files,\n",
+ " \"updateCommitLift\"\n",
+ ")"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "a067f8e6",
+ "id": "73afeeed",
"metadata": {},
"outputs": [],
"source": [
- "pairwise_changed_git_files_with_dependencies = query_cypher_to_data_frame(\"../cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher\")\n",
- "pairwise_changed_git_files_with_dependencies.head(20)"
+ "plot_histogram_of_pairwise_changed_files(\n",
+ " data_to_plot = pairwise_changed_git_files,\n",
+ " top_pairwise_changed_file_extensions = top_pairwise_changed_file_extensions,\n",
+ " x_axis_column = \"updateCommitLift\",\n",
+ " x_axis_label = \"Commit Lift\",\n",
+ " output_file_name = \"CoChangedFilesByCommitLift\"\n",
+ ")"
]
},
{
"cell_type": "markdown",
- "id": "01db2db9",
+ "id": "2a977fc8",
"metadata": {},
"source": [
- "#### Data Statistics"
+ "### Files changed together by commit Jaccard similarity"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "9fe48db8",
+ "id": "41911a35",
"metadata": {},
"outputs": [],
"source": [
- "display(\"Pairwise changed git files compared to dependency weights - Overall statistics\")\n",
- "display(pairwise_changed_git_files_with_dependencies.describe())\n",
+ "display_table_for_top_pairwise_changed_file_extensions(\n",
+ " pairwise_changed_git_files,\n",
+ " \"updateCommitJaccardSimilarity\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ce034bce",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plot_histogram_of_pairwise_changed_files(\n",
+ " data_to_plot = pairwise_changed_git_files,\n",
+ " top_pairwise_changed_file_extensions = top_pairwise_changed_file_extensions,\n",
+ " x_axis_column = \"updateCommitJaccardSimilarity\",\n",
+ " x_axis_label = \"Commit Jaccard Similarity\",\n",
+ " output_file_name = \"CoChangedFilesByCommitJaccardSimilarity\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "727772c7",
+ "metadata": {},
+ "source": [
+ "### Find pairwise changed files with many highly ranked metrics\n",
"\n",
- "display(\"Pairwise changed git files compared to dependency weights - Pearson Correlation\")\n",
- "display(pairwise_changed_git_files_with_dependencies.corr(method='pearson'))\n",
+ "Find those pairwise changed files that have a high rank in many metrics by calculating a combined (weighted) score based on the ranks of each metric.\n",
+ "This is useful to identify pairs of files that score high in most metrics, which indicates a strong co-change relationship."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "330cd50d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "metric_rank_columns = [\n",
+ " 'updateCommitCountExtensionRank',\n",
+ " 'updateCommitMinConfidenceExtensionRank',\n",
+ " 'updateCommitJaccardSimilarityExtensionRank',\n",
+ " 'updateCommitLiftExtensionRank'\n",
+ "]\n",
"\n",
- "display(\"Pairwise changed git files compared to dependency weights - Spearman Correlation\")\n",
- "display(pairwise_changed_git_files_with_dependencies.corr(method='spearman'))"
+ "metric_columns = [\n",
+ " 'updateCommitCount',\n",
+ " 'updateCommitMinConfidence',\n",
+ " 'updateCommitJaccardSimilarity',\n",
+ " 'updateCommitLift'\n",
+ "]"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "5a4ae651",
+ "id": "9de55c0b",
"metadata": {},
"outputs": [],
"source": [
- "if pairwise_changed_git_files_with_dependencies.shape[0] < 5:\n",
- " print(\"Less than 5 samples are not enough to calculate p-values\")\n",
- "else:\n",
- " from scipy.stats import pearsonr, spearmanr\n",
+ "pairwise_changed_git_files['combinedMetricsScore'] = (\n",
+ " pairwise_changed_git_files['updateCommitCountExtensionRank'] +\n",
+ " pairwise_changed_git_files['updateCommitMinConfidenceExtensionRank'] +\n",
+ " pairwise_changed_git_files['updateCommitJaccardSimilarityExtensionRank'] +\n",
+ " pairwise_changed_git_files['updateCommitLiftExtensionRank']\n",
+ ")\n",
+ "\n",
+ "columns_to_show = [\"fileExtensionPair\", \"filePair\", \"combinedMetricsScore\"] + metric_rank_columns + metric_columns + [\"filePairWithRelativePath\"]\n",
+ "\n",
+ "pairwise_changed_git_files_top_10_ranks = pairwise_changed_git_files.\\\n",
+ " sort_values(by=[\"fileExtensionPair\", \"combinedMetricsScore\"], ascending=[True, True]).\\\n",
+ " groupby(\"fileExtensionPair\").\\\n",
+ " head(10).\\\n",
+ " reset_index(drop=True)\\\n",
+ " [columns_to_show]\n",
+ "\n",
+ "display(pairwise_changed_git_files_top_10_ranks)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b5b92b79",
+ "metadata": {},
+ "source": [
+ "### Pairwise changed files with pareto-optimal metrics\n",
"\n",
- " display(\"Pearson Correlation with p-value for commitCount and dependencyWeight\")\n",
- " display(pearsonr(pairwise_changed_git_files_with_dependencies['commitCount'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))\n",
+ "A pair (count, confidence, jaccard, lift) is Pareto-optimal if there is no other pair that is better or equal in all metrics and strictly better in at least one. In other words, it is not \"dominated\" by any other pair.\n",
"\n",
- " display(\"Spearman Correlation with p-value for commitCount and dependencyWeight\")\n",
- " display(spearmanr(pairwise_changed_git_files_with_dependencies['commitCount'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))"
+ "The Pareto frontier is the set of all such non-dominated pairs, i.e. the “best tradeoffs”."
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "747f9590",
+ "id": "4a9e5be6",
"metadata": {},
"outputs": [],
"source": [
- "# Scatter plot of all pairs of files with their commit count on the x axis and dependency weight on the y axis\n",
+ "def pareto_frontier(input_data, metrics, maximize=True):\n",
+ " \"\"\"\n",
+ " Extracts the Pareto frontier (skyline) from a DataFrame.\n",
"\n",
- "if pairwise_changed_git_files_with_dependencies.empty:\n",
- " print(\"No data to plot\")\n",
- "else:\n",
- " figure = plotly_graph_objects.Figure(plotly_graph_objects.Scatter(\n",
- " x=pairwise_changed_git_files_with_dependencies['commitCount'], \n",
- " y=pairwise_changed_git_files_with_dependencies['dependencyWeight'],\n",
- " mode='markers',\n",
- " # marker=dict(size=pairwise_changed_git_files_with_dependencies['occurrences'] + 8)\n",
- " ))\n",
- " figure.update_layout(\n",
- " **plotly_bar_layout_base_settings,\n",
- " title='Pairwise changed files: Number of changes (commitCount) vs. dependency weight',\n",
- " xaxis_title='commit count',\n",
- " yaxis_title='dependency weight',\n",
- " )\n",
- " figure.show(**plotly_treemap_figure_show_settings)\n",
- " if is_command_line_execution():\n",
- " figure.write_image(**get_plotly_figure_write_image_settings(\"PairwiseChangedFilesVsDependencyWeight\"))"
+ " input_data: DataFrame\n",
+ " metrics: list of column names to consider\n",
+ " maximize: True if higher is better for all metrics\n",
+ " \"\"\"\n",
+ " data = input_data[metrics].to_numpy()\n",
+ " if not maximize:\n",
+ " data = -data # flip sign if minimizing\n",
+ " \n",
+ " # Keep track of which rows are dominated (start with none)\n",
+ " is_dominated = np.zeros(len(data), dtype=bool)\n",
+ " for i, point in enumerate(data):\n",
+ " # Skip if already marked dominated\n",
+ " if is_dominated[i]:\n",
+ " continue\n",
+ " # Find all rows that are >= this row in every metric and > in at least one\n",
+ " dominates = np.all(data >= point, axis=1) & np.any(data > point, axis=1)\n",
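+ " # NOTE(review): this marks the rows found above (not row i itself) as dominated, so rows with smaller values in data survive - verify the direction against the maximize flag\n",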
+ " is_dominated |= dominates\n",
+ " \n",
+ " # Keep only non-dominated rows = Pareto frontier\n",
+ " return input_data[~is_dominated].reset_index(drop=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "04ff7564",
+ "metadata": {},
+ "source": [
+ "#### Pairwise changed files with pareto-optimal metrics - not considering file extensions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "56dc0360",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "columns_to_show_for_pareto_frontier = [\"filePair\", \"combinedMetricsScore\"] + metric_columns + metric_rank_columns + [\"filePairWithRelativePath\"]\n",
+ "display(pareto_frontier(pairwise_changed_git_files, metric_columns, maximize=False)[columns_to_show_for_pareto_frontier].head(40))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dabccd77",
+ "metadata": {},
+ "source": [
+ "#### Pairwise changed files with pareto-optimal metrics - using ranks grouped by file extensions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "43de84bc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "columns_to_show_for_pareto_frontier_with_extensions = [\"fileExtensionPair\", \"filePair\", \"combinedMetricsScore\"] + metric_columns + metric_rank_columns + [\"filePairWithRelativePath\"]\n",
+ "display(pareto_frontier(pairwise_changed_git_files, metric_rank_columns, maximize=False)[columns_to_show_for_pareto_frontier_with_extensions].head(40))"
]
},
{
diff --git a/scripts/importGit.sh b/scripts/importGit.sh
index 7ed281310..708a5b496 100755
--- a/scripts/importGit.sh
+++ b/scripts/importGit.sh
@@ -7,6 +7,7 @@
# Note: This script needs the path to source directory that contains one or more git repositories. It defaults to SOURCE_DIRECTORY ("source").
# Note: Import will be skipped without an error if the source directory doesn't contain any git repositories.
# Note: This script needs git to be installed.
+# Note: IMPORT_GIT_LOG_DATA_IF_SOURCE_IS_PRESENT="plugin" is the default and recommended. The other options "aggregated" and "full" are not actively maintained anymore.
# Fail on any error ("-e" = exit on first error, "-o pipefail" = exit on errors within piped commands)
set -o errexit -o pipefail
@@ -134,13 +135,14 @@ commonPostGitImport() {
echo "importGit: Running verification queries for troubleshooting (non failing)..."
execute_cypher "${GIT_LOG_CYPHER_DIR}/Verify_git_to_code_file_unambiguous.cypher"
execute_cypher "${GIT_LOG_CYPHER_DIR}/Verify_code_to_git_file_unambiguous.cypher"
+ execute_cypher "${GIT_LOG_CYPHER_DIR}/Verify_git_missing_CHANGED_TOGETHER_WITH_properties.cypher"
}
postGitLogImport() {
- commonPostGitImport
-
echo "importGit: Add numberOfGitCommits property to nodes with matching file names..."
execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_number_of_git_log_commits.cypher"
+
+ commonPostGitImport
}
postGitPluginImport() {
@@ -149,24 +151,25 @@ postGitPluginImport() {
# TODO: The deletion of all plain files in the "/.git" directory is needed
# until there is a way to exclude all files inside a directory
# while still being able to get them analyzed by the git plugin.
- # This would most likely be solved with https://github.com/jQAssistant/jqassistant/issues/410
execute_cypher "${GIT_LOG_CYPHER_DIR}/Delete_plain_git_directory_file_nodes.cypher"
execute_cypher "${GIT_LOG_CYPHER_DIR}/Index_commit_sha.cypher"
execute_cypher "${GIT_LOG_CYPHER_DIR}/Index_file_name.cypher"
execute_cypher "${GIT_LOG_CYPHER_DIR}/Index_file_relative_path.cypher"
execute_cypher "${GIT_LOG_CYPHER_DIR}/Index_absolute_file_name.cypher"
- commonPostGitImport
-
echo "importGit: Add numberOfGitCommits property to nodes with matching file names..."
execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_number_of_git_plugin_commits.cypher"
-}
+ echo "importGit: Add updateCommitCount property to file nodes and code nodes with matching file names..."
+ execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_number_of_git_plugin_update_commits.cypher"
-postAggregatedGitLogImport() {
commonPostGitImport
+}
+postAggregatedGitLogImport() {
echo "importGit: Add numberOfGitCommits property to nodes with matching file names..."
execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_number_of_aggregated_git_commits.cypher"
+
+ commonPostGitImport
}
# Create import directory in case it doesn't exist.
diff --git a/scripts/reports/GitHistoryCsv.sh b/scripts/reports/GitHistoryCsv.sh
index 77d357e5b..9f588cdd1 100755
--- a/scripts/reports/GitHistoryCsv.sh
+++ b/scripts/reports/GitHistoryCsv.sh
@@ -47,9 +47,15 @@ execute_cypher "${GIT_LOG_CYPHER_DIR}/List_git_file_directories_with_commit_stat
# Overall distribution of how many files were changed with one git commit, how many were changed with two, etc.
execute_cypher "${GIT_LOG_CYPHER_DIR}/List_git_files_per_commit_distribution.cypher" > "${FULL_REPORT_DIRECTORY}/List_git_files_per_commit_distribution.csv"
-# Data basis for finding out if there is a correlation between pairwise changed files and their dependencies
+# Find pairwise changed files that depend on each other
execute_cypher "${GIT_LOG_CYPHER_DIR}/List_pairwise_changed_files_with_dependencies.cypher" > "${FULL_REPORT_DIRECTORY}/List_pairwise_changed_files_with_dependencies.csv"
+# List pairwise changed files with various metrics
+execute_cypher "${GIT_LOG_CYPHER_DIR}/List_pairwise_changed_files_top_selected_metric.cypher" "selected_pair_metric=updateCommitCount" > "${FULL_REPORT_DIRECTORY}/List_pairwise_changed_files_top_count.csv"
+execute_cypher "${GIT_LOG_CYPHER_DIR}/List_pairwise_changed_files_top_selected_metric.cypher" "selected_pair_metric=updateCommitMinConfidence" > "${FULL_REPORT_DIRECTORY}/List_pairwise_changed_files_top_min_confidence.csv"
+execute_cypher "${GIT_LOG_CYPHER_DIR}/List_pairwise_changed_files_top_selected_metric.cypher" "selected_pair_metric=updateCommitJaccardSimilarity" > "${FULL_REPORT_DIRECTORY}/List_pairwise_changed_files_top_jaccard.csv"
+execute_cypher "${GIT_LOG_CYPHER_DIR}/List_pairwise_changed_files_top_selected_metric.cypher" "selected_pair_metric=updateCommitLift" > "${FULL_REPORT_DIRECTORY}/List_pairwise_changed_files_top_lift.csv"
+
# Clean-up after report generation. Empty reports will be deleted.
source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${FULL_REPORT_DIRECTORY}"