Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ Contained within this repository is a comprehensive and automated code graph ana

### :newspaper: News

- August 2025: Association rule learning for co-changing files in git history
- August 2025: Anomaly detection powered by unsupervised machine learning and explainable AI
- May 2025: Migrated to [Neo4j 2025.x](https://neo4j.com/docs/upgrade-migration-guide/current/version-2025/upgrade) and Java 21.

Expand Down
6 changes: 3 additions & 3 deletions cypher/General_Enrichment/Add_file_name and_extension.cypher
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
// Add "name", "extension" and "extensionExtended" properties to File nodes
// Add "name", "extension" and "extensionExtended" properties to File nodes. Supports Git:File nodes with "relativePath" property.

MATCH (file:File)
WHERE file.fileName IS NOT NULL
WHERE (file.fileName IS NOT NULL OR file.relativePath IS NOT NULL)
AND file.name IS NULL // Don't override an already existing "name" property
WITH *
,file.fileName AS fileName
,coalesce(file.fileName, file.relativePath) AS fileName
WITH *
,last(split(fileName, '/')) AS fileNameWithoutPath
WITH *
Expand Down
Original file line number Diff line number Diff line change
@@ -1,45 +1,103 @@
// Connect git files that were frequently changed together with "CHANGED_TOGETHER_WITH"

MATCH (global_git_commit:Git:Commit)
WITH count(global_git_commit) AS globalCommitCount
// Determine global file count, global file count threshold (filter out refactoring commits) and global update commits
MATCH (git_commit_global:Git:Commit)-[:CONTAINS_CHANGE]->(:Git:Change)-[:UPDATES]->(git_file_global:Git:File)
WHERE git_file_global.deletedAt IS NULL
WITH git_commit_global, count(DISTINCT git_file_global) AS commitFileCount
WITH percentileDisc(commitFileCount, 0.95) AS globalFileCountThreshold
,count(git_commit_global) AS globalUpdateCommitCount
// Main section
MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change)-[:UPDATES]->(git_file:Git:File)
MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file)
WHERE git_file.deletedAt IS NULL
// Order files to ensure that pairs of distinct files are grouped together as (fileA, fileB) without the mirrored (fileB, fileA)
ORDER BY git_commit.sha, git_file.relativePath
WITH globalCommitCount
WITH globalFileCountThreshold
,globalUpdateCommitCount
,git_commit.sha AS commitHash
,collect(DISTINCT git_file) AS filesInCommit
// Limit the file count to min. 2 (changed together) and
// max. 50 (reduce permutations, improve performance, filter out large refactorings that usually affect many files)
WHERE size(filesInCommit) >= 2
AND size(filesInCommit) <= 50
AND size(filesInCommit) <= globalFileCountThreshold
// Collect distinct pairwise (..., 2, 2) combinations of all files in the list
WITH globalCommitCount
WITH globalFileCountThreshold
,globalUpdateCommitCount
,commitHash
,apoc.coll.combinations(filesInCommit, 2, 2) AS fileCombinations
UNWIND fileCombinations AS fileCombination
WITH globalCommitCount
WITH globalFileCountThreshold
,globalUpdateCommitCount
,fileCombination
,count(DISTINCT commitHash) AS commitCount
,collect(DISTINCT commitHash) AS commitHashes
// Filter out file pairs that where changed not very often together
// In detail: More than 0.1 per mille compared to overall commit count
WHERE commitCount > globalCommitCount * 0.001
WITH fileCombination[0] AS firstFile
,count(DISTINCT commitHash) AS updateCommitCount
,collect(DISTINCT commitHash) AS updateCommitHashes
// Deactivated:
// Filter out file pairs that weren't changed very often together
WHERE updateCommitCount > 2
WITH *
,fileCombination[0] AS firstFile
,fileCombination[1] AS secondFile
,commitCount
,commitHashes
// Create the new relationship "CHANGED_TOGETHER_WITH" and set the property "commitCount" on it
CALL (firstFile, secondFile, commitCount, commitHashes) {
WITH *
// Get the lowest number of git update commits of both files (file pair)
,CASE WHEN firstFile.updateCommitCount < secondFile.updateCommitCount
THEN firstFile.updateCommitCount
ELSE secondFile.updateCommitCount
END AS minUpdateCommitCount
// Calculate update commit support by dividing the update commit count by the overall commit count for both files
,toFloat(firstFile.updateCommitCount) / globalUpdateCommitCount AS firstFileUpdateSupport
,toFloat(secondFile.updateCommitCount) / globalUpdateCommitCount AS secondFileUpdateSupport
WITH *
// Expected likelihood that the first and the second file change together given complete randomness
,firstFileUpdateSupport * secondFileUpdateSupport AS expectedCoUpdateSupport
WITH firstFile
,secondFile
,updateCommitHashes
,updateCommitCount
// Out of all the times the less frequently changed file was touched, how often did it co-occur with the other file?
,toFloat(updateCommitCount) / minUpdateCommitCount AS updateCommitMinConfidence
// Compared to all commits in general, how high is the percentage of the commits where both files changed together?
,toFloat(updateCommitCount) / globalUpdateCommitCount AS updateCommitSupport
// Lift
,toFloat(updateCommitCount) / (globalUpdateCommitCount * expectedCoUpdateSupport) AS updateCommitLift
// Jaccard Similarity: Of all commits involving either file, how many involved both?
,toFloat(updateCommitCount) / (firstFile.updateCommitCount + secondFile.updateCommitCount - updateCommitCount) AS updateCommitJaccardSimilarity
// Create the new relationship "CHANGED_TOGETHER_WITH" and set the property "updateCommitCount" on it
CALL (firstFile, secondFile, updateCommitCount, updateCommitHashes, updateCommitMinConfidence, updateCommitSupport, updateCommitLift, updateCommitJaccardSimilarity) {
MERGE (firstFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile)
SET pairwiseChange.commitCount = commitCount
,pairwiseChange.commitHashes = commitHashes
} IN TRANSACTIONS
SET pairwiseChange.updateCommitCount = toInteger(updateCommitCount)
,pairwiseChange.updateCommitHashes = updateCommitHashes
,pairwiseChange.updateCommitMinConfidence = updateCommitMinConfidence
,pairwiseChange.updateCommitSupport = updateCommitSupport
,pairwiseChange.updateCommitLift = updateCommitLift
,pairwiseChange.updateCommitJaccardSimilarity = updateCommitJaccardSimilarity
} IN TRANSACTIONS OF 500 ROWS
// Return one row with some statistics about the found pairs and their commit counts
RETURN max(commitCount) AS maxCommitCount
,avg(commitCount) AS avgCommitCount
,percentileDisc(commitCount, 0.5) AS percentile50CommitCount
,percentileDisc(commitCount, 0.9) AS percentile90CommitCount
,percentileDisc(commitCount, 0.95) AS percentile95CommitCount
,count(*) AS pairCount
RETURN count(*) AS pairCount

,min(updateCommitCount) AS minCommitCount
,max(updateCommitCount) AS maxCommitCount
,avg(updateCommitCount) AS avgCommitCount
,percentileDisc(updateCommitCount, 0.5) AS percentile50CommitCount
,percentileDisc(updateCommitCount, 0.9) AS percentile90CommitCount
,percentileDisc(updateCommitCount, 0.95) AS percentile95CommitCount

,min(updateCommitMinConfidence) AS minMinConfidence
,max(updateCommitMinConfidence) AS maxMinConfidence
,avg(updateCommitMinConfidence) AS avgMinConfidence
,percentileDisc(updateCommitMinConfidence, 0.5) AS percentile50MinConfidence
,percentileDisc(updateCommitMinConfidence, 0.9) AS percentile90MinConfidence
,percentileDisc(updateCommitMinConfidence, 0.95) AS percentile95MinConfidence

,min(updateCommitLift) AS minLift
,max(updateCommitLift) AS maxLift
,avg(updateCommitLift) AS avgLift
,percentileDisc(updateCommitLift, 0.5) AS percentile50Lift
,percentileDisc(updateCommitLift, 0.9) AS percentile90Lift
,percentileDisc(updateCommitLift, 0.95) AS percentile95Lift

,min(updateCommitJaccardSimilarity) AS minJaccardSimilarity
,max(updateCommitJaccardSimilarity) AS maxJaccardSimilarity
,avg(updateCommitJaccardSimilarity) AS avgJaccardSimilarity
,percentileDisc(updateCommitJaccardSimilarity, 0.5) AS percentile50JaccardSimilarity
,percentileDisc(updateCommitJaccardSimilarity, 0.9) AS percentile90JaccardSimilarity
,percentileDisc(updateCommitJaccardSimilarity, 0.95) AS percentile95JaccardSimilarity
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@ MATCH (gitRepository:Git&Repository)-[:HAS_FILE]->(firstGitFile)
MATCH (gitRepository:Git&Repository)-[:HAS_FILE]->(secondGitFile)
RETURN gitRepository.name + '/' + firstGitFile.relativePath AS firstFile
,gitRepository.name + '/' + secondGitFile.relativePath AS secondFile
,gitChange.commitCount AS commitCount
,gitChange.updateCommitCount AS commitCount
ORDER BY commitCount DESC
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@
// List git files that were frequently changed with another file. Requires "Add_CHANGED_TOGETHER_WITH_relationships_to_git_files".

MATCH (firstGitFile:Git&File&!Repository)-[gitChange:CHANGED_TOGETHER_WITH]-(secondGitFile:Git&File&!Repository)
WHERE elementId(firstGitFile) < elementId(secondGitFile)
MATCH (gitRepository:Git&Repository)-[:HAS_FILE]->(firstGitFile)
UNWIND gitChange.commitHashes AS commitHash
RETURN gitRepository.name + '/' + firstGitFile.relativePath AS filePath
UNWIND gitChange.updateCommitHashes AS commitHash
WITH gitRepository.name + '/' + firstGitFile.relativePath AS filePath
,count(DISTINCT commitHash) AS commitCount
,sum(firstGitFile.updateCommitCount) AS fileUpdateCount
,max(gitChange.updateCommitLift) AS maxLift
,avg(gitChange.updateCommitLift) AS avgLift
WITH *
// Out of all the times the file was touched, how often did it co-occur with other files?
,CASE WHEN fileUpdateCount > 0 THEN toFloat(commitCount) / fileUpdateCount ELSE 0.0 END AS coChangeRate
RETURN filePath, commitCount, coChangeRate, maxLift, avgLift
ORDER BY commitCount DESC
21 changes: 21 additions & 0 deletions cypher/GitLog/List_pairwise_changed_files.cypher
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// List pairs of files that were changed together, including the association metrics stored on their CHANGED_TOGETHER_WITH relationship. Requires Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher to run first.

MATCH (leftFile:Git&File)-[coChange:CHANGED_TOGETHER_WITH]-(rightFile:Git&File)
// The undirected pattern matches every relationship in both directions: keep only one orientation per pair.
WHERE elementId(leftFile) < elementId(rightFile)
// Prefer "relativePath" and fall back to "fileName" when it is missing.
WITH leftFile
    ,rightFile
    ,coChange
    ,coalesce(leftFile.relativePath, leftFile.fileName)   AS firstFileName
    ,coalesce(rightFile.relativePath, rightFile.fileName) AS secondFileName
RETURN firstFileName
      ,secondFileName
      // Pair labels in two flavours: separated by an HTML line break or by a "↔" arrow
      ,leftFile.name + '<br>' + rightFile.name            AS filePairLineBreak
      ,firstFileName + '<br>' + secondFileName            AS filePairWithRelativePathLineBreak
      ,leftFile.name + '↔' + rightFile.name               AS filePair
      ,firstFileName + '↔' + secondFileName               AS filePairWithRelativePath
      ,leftFile.extension                                 AS firstFileExtension
      ,rightFile.extension                                AS secondFileExtension
      ,leftFile.extension + '↔' + rightFile.extension     AS fileExtensionPair
      // Metrics read as-is from the relationship
      ,coChange.updateCommitCount                         AS updateCommitCount
      ,coChange.updateCommitMinConfidence                 AS updateCommitMinConfidence
      ,coChange.updateCommitSupport                       AS updateCommitSupport
      ,coChange.updateCommitLift                          AS updateCommitLift
      ,coChange.updateCommitJaccardSimilarity             AS updateCommitJaccardSimilarity
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
// Get the top 4 file extension pairs that were changed together most often and list, for each of them, the top 20 file pairs ranked by the relationship property given in the parameter "selected_pair_metric" (e.g. "updateCommitLift"; >1: changes more often than by random chance). Requires Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher to run first.

// Step 1: Count the relationships per extension pair and keep the 4 most frequent pairs.
// The undirected pattern matches every relationship in both directions. Exactly one orientation is kept:
// the one with the lexicographically smaller extension first, using elementId as tie-break for equal extensions.
// Pair names are therefore always built as "smaller↔larger".
MATCH (firstFile:Git:File)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile:Git:File)
WHERE firstFile.extension < secondFile.extension
   OR (firstFile.extension = secondFile.extension AND elementId(firstFile) < elementId(secondFile))
 WITH firstFile.extension + '↔' + secondFile.extension AS fileExtensionPair
     ,count(DISTINCT pairwiseChange)                   AS pairCount
ORDER BY pairCount DESC
 WITH collect(fileExtensionPair)[0..4] AS top4FileExtensionPairs
UNWIND top4FileExtensionPairs AS fileExtensionPair
// Step 2: For each of the top extension pairs, fetch its 20 highest ranked file pairs.
CALL {
    WITH fileExtensionPair
    MATCH (firstFile:Git:File)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile:Git:File)
    // Use the same orientation rule as in step 1. Orienting by elementId alone would drop every
    // relationship whose elementId order differs from its extension order, since the reversed
    // extension pair name never matches "fileExtensionPair".
    WHERE (firstFile.extension < secondFile.extension
       OR (firstFile.extension = secondFile.extension AND elementId(firstFile) < elementId(secondFile)))
      AND firstFile.extension + '↔' + secondFile.extension = fileExtensionPair
     WITH *
         ,coalesce(firstFile.relativePath, firstFile.fileName)   AS firstFileName
         ,coalesce(secondFile.relativePath, secondFile.fileName) AS secondFileName
    RETURN firstFile.name                           AS firstFileNameShort
          ,secondFile.name                          AS secondFileNameShort
          ,firstFileName
          ,secondFileName
          // Dynamic property access: the ranking metric is chosen by the caller
          ,pairwiseChange[$selected_pair_metric]    AS selectedMetric
          ,pairwiseChange.updateCommitLift          AS updateCommitLift
          ,pairwiseChange.updateCommitCount         AS updateCommitCount
          ,pairwiseChange.updateCommitMinConfidence AS updateCommitMinConfidence
          ,pairwiseChange.updateCommitSupport       AS updateCommitSupport
          ,pairwiseChange.updateCommitJaccardSimilarity AS updateCommitJaccardSimilarity
    // File names as secondary sort keys keep the top 20 deterministic for equal metric values
    ORDER BY selectedMetric DESC, firstFileName ASC, secondFileName ASC
    LIMIT 20
}
RETURN fileExtensionPair
      ,firstFileNameShort
      ,secondFileNameShort
      ,updateCommitCount
      ,updateCommitMinConfidence
      ,updateCommitLift
      ,updateCommitJaccardSimilarity
      ,updateCommitSupport
      ,firstFileName
      ,secondFileName
47 changes: 16 additions & 31 deletions cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher
Original file line number Diff line number Diff line change
@@ -1,39 +1,24 @@
// List pair of files that were changed together and that have a declared dependency between each other.
// List pairs of files that were changed together and that have a declared dependency between each other. Requires Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher and Add_CHANGED_TOGETHER_WITH_relationships_to_code_files.cypher to run first.

MATCH (firstCodeFile:File)-[dependency:DEPENDS_ON]->(secondCodeFile:File)
MATCH (firstCodeFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondCodeFile)
//De-duplicating the pairs of files isn't necessary, because the dependency relation is directed.
//WHERE elementId(firstCodeFile) < elementId(secondCodeFile)
WITH firstCodeFile.fileName AS firstFileName
,secondCodeFile.fileName AS secondFileName
WHERE elementId(firstCodeFile) < elementId(secondCodeFile)
WITH firstCodeFile.fileName AS firstFileName
,secondCodeFile.fileName AS secondFileName
,coalesce(dependency.weight, dependency.cardinality) AS dependencyWeight
,pairwiseChange.commitCount AS commitCount
,dependency.fileDistanceAsFewestChangeDirectoryCommands AS fileDistanceAsFewestChangeDirectoryCommands
,dependency.fileDistanceAsFewestChangeDirectoryCommands AS fileDistance
,pairwiseChange.updateCommitCount AS commitCount
,pairwiseChange.updateCommitMinConfidence AS updateCommitMinConfidence
,pairwiseChange.updateCommitSupport AS updateCommitSupport
,pairwiseChange.updateCommitLift AS updateCommitLift
,pairwiseChange.updateCommitJaccardSimilarity AS updateCommitJaccardSimilarity
RETURN dependencyWeight
,fileDistance
,commitCount
,fileDistanceAsFewestChangeDirectoryCommands
,updateCommitMinConfidence
,updateCommitSupport
,updateCommitLift
,updateCommitJaccardSimilarity
// ,count(*) AS occurrences
// ,collect(firstFileName + ' -> ' + secondFileName)[0..3] AS examples
ORDER BY dependencyWeight, commitCount

// MATCH (firstCodeFile:File)-[dependency:DEPENDS_ON]->(secondCodeFile:File)
// MATCH (firstCodeFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondCodeFile)
// WHERE elementId(firstCodeFile) < elementId(secondCodeFile)
// RETURN firstCodeFile.fileName AS firstFileName
// ,secondCodeFile.fileName AS secondFileName
// ,dependency.weight AS dependencyWeight
// ,pairwiseChange.commitCount AS commitCount
// ORDER BY dependencyWeight, commitCount

// MATCH (g1:!Git&File)-[relation:CHANGED_TOGETHER_WITH|DEPENDS_ON]-(g2:!Git&File)
// WITH count(DISTINCT relation) AS relatedFilesCount
// ,collect(DISTINCT relation) AS relations
// UNWIND relations AS relation
// WITH relatedFilesCount
// ,coalesce(relation.commitCount, 0) AS commitCount
// ,coalesce(relation.weight, 0) AS dependencyWeight
// ,coalesce(relation.fileDistanceAsFewestChangeDirectoryCommands, 0) AS fileDistanceAsFewestChangeDirectoryCommands
// RETURN dependencyWeight
// ,commitCount
// ,fileDistanceAsFewestChangeDirectoryCommands
// ORDER BY dependencyWeight, commitCount
ORDER BY dependencyWeight, commitCount
11 changes: 11 additions & 0 deletions cypher/GitLog/Set_number_of_git_plugin_update_commits.cypher
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// Set the "updateCommitCount" property on Git File nodes (and on the code files they resolve to) by counting the distinct commits that contain an updating change of the file

// Walk from the commit over its change to the updated file; files marked as deleted are left out.
MATCH (updateCommit:Git&Commit)-[:CONTAINS_CHANGE]->(:Git&Change)-[:UPDATES]->(gitFile:Git&File)
WHERE gitFile.deletedAt IS NULL
 WITH gitFile
     ,count(DISTINCT updateCommit.sha) AS updateCommitCount
  SET gitFile.updateCommitCount = updateCommitCount
 WITH gitFile
     ,updateCommitCount
// Copy the count to every non-git File node that the git file resolves to.
MATCH (gitFile)-[:RESOLVES_TO]->(codeFile:File&!Git)
  SET codeFile.updateCommitCount = updateCommitCount
// One summary row: number of updated code files and up to 4 example names
RETURN count(DISTINCT codeFile)                 AS codeFileUpdates
      ,collect(DISTINCT codeFile.name)[0..4]    AS codeFileExample
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
// Verify if CHANGED_TOGETHER_WITH properties from git are missing

// The end nodes are not needed, only the relationship properties are inspected.
// Rows are grouped by the two "missing" flags. The undirected pattern matches each relationship in both directions.
MATCH ()-[pairwiseChange:CHANGED_TOGETHER_WITH]-()
RETURN pairwiseChange.updateCommitCount IS NULL         AS updateCommitCountMissing
      ,pairwiseChange.updateCommitMinConfidence IS NULL AS updateCommitMinConfidenceMissing
      ,count(*)
Loading
Loading