// Connect git files that were changed together frequently with "CHANGED_TOGETHER_WITH"
// and annotate every file pair with co-change metrics:
// update commit count and hashes, min. confidence, support, lift and Jaccard similarity.
// Uses the APOC function "apoc.coll.combinations", a standalone ORDER BY clause and "CALL (...) { ... } IN TRANSACTIONS".
// NOTE(review): reads the property "updateCommitCount" from the file nodes. It is not set in this query,
// so it presumably needs to be written by a preceding query - confirm. If it is missing, the derived metrics are null.

// Determine the global file count threshold (used to filter out refactoring commits) and the global update commit count
MATCH (git_commit_global:Git:Commit)-[:CONTAINS_CHANGE]->(:Git:Change)-[:UPDATES]->(git_file_global:Git:File)
WHERE git_file_global.deletedAt IS NULL
 WITH git_commit_global
     ,count(DISTINCT git_file_global) AS commitFileCount
// One row per commit at this point:
// The threshold is the 95th percentile of the number of updated files per commit.
// The global update commit count is the number of commits that update at least one file that isn't deleted.
 WITH percentileDisc(commitFileCount, 0.95) AS globalFileCountThreshold
     ,count(git_commit_global)               AS globalUpdateCommitCount
// Main section
MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(:Git:Change)-[:UPDATES]->(git_file:Git:File)
// Only take files into account that belong to a repository
MATCH (:Git&Repository)-[:HAS_FILE]->(git_file)
WHERE git_file.deletedAt IS NULL
// Order files to ensure that pairs of distinct files are always grouped as (fileA, fileB) and never as (fileB, fileA)
ORDER BY git_commit.sha, git_file.relativePath
 WITH globalFileCountThreshold
     ,globalUpdateCommitCount
     ,git_commit.sha             AS commitHash
     ,collect(DISTINCT git_file) AS filesInCommit
// Limit the file count to min. 2 (changed together) and
// max. the global file count threshold (reduce permutations, improve performance,
// filter out large refactorings that usually affect many files)
WHERE size(filesInCommit) >= 2
  AND size(filesInCommit) <= globalFileCountThreshold
// Collect distinct pairwise (..., 2, 2) combinations of all files in the list.
// The file count threshold isn't needed anymore from here on and is therefore not carried along.
 WITH globalUpdateCommitCount
     ,commitHash
     ,apoc.coll.combinations(filesInCommit, 2, 2) AS fileCombinations
UNWIND fileCombinations AS fileCombination
// Aggregate per file pair: in how many (and in which) commits both files were updated together
 WITH globalUpdateCommitCount
     ,fileCombination
     ,count(DISTINCT commitHash)   AS updateCommitCount
     ,collect(DISTINCT commitHash) AS updateCommitHashes
// Deactivated:
// Filter out file pairs that were not changed together very often
// In detail: More than 1 per mille (0.1 percent) compared to the global update commit count
// WHERE updateCommitCount > globalUpdateCommitCount * 0.001
 WITH *
     ,fileCombination[0] AS firstFile
     ,fileCombination[1] AS secondFile
 WITH *
      // Get the lowest number of git update commits of both files (file pair)
     ,CASE WHEN firstFile.updateCommitCount < secondFile.updateCommitCount
           THEN firstFile.updateCommitCount
           ELSE secondFile.updateCommitCount
       END AS minUpdateCommitCount
      // Calculate the update commit support of each file by dividing its update commit count by the global update commit count
     ,toFloat(firstFile.updateCommitCount)  / globalUpdateCommitCount AS firstFileUpdateSupport
     ,toFloat(secondFile.updateCommitCount) / globalUpdateCommitCount AS secondFileUpdateSupport
 WITH *
      // Expected likelihood that the first and the second file change together given complete randomness
     ,firstFileUpdateSupport * secondFileUpdateSupport AS expectedCoUpdateSupport
 WITH firstFile
     ,secondFile
     ,updateCommitHashes
     ,updateCommitCount
      // Out of all the times the less frequently changed file was touched, how often did it co-occur with the other file?
     ,toFloat(updateCommitCount) / minUpdateCommitCount AS updateCommitMinConfidence
      // Compared to all update commits in general, how high is the share of the commits where both files changed together?
     ,toFloat(updateCommitCount) / globalUpdateCommitCount AS updateCommitSupport
      // Lift: Observed co-change count divided by the co-change count expected under complete randomness
     ,toFloat(updateCommitCount) / (globalUpdateCommitCount * expectedCoUpdateSupport) AS updateCommitLift
      // Jaccard Similarity: Of all commits involving either file, how many involved both?
     ,toFloat(updateCommitCount) / (firstFile.updateCommitCount + secondFile.updateCommitCount - updateCommitCount) AS updateCommitJaccardSimilarity
// Create the new (undirected matched) relationship "CHANGED_TOGETHER_WITH" if it doesn't exist yet
// and set the update commit count, the commit hashes and the co-change metrics on it.
// Written in batches of 500 rows per transaction.
 CALL (firstFile, secondFile, updateCommitCount, updateCommitHashes, updateCommitMinConfidence, updateCommitSupport, updateCommitLift, updateCommitJaccardSimilarity) {
    MERGE (firstFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile)
      SET pairwiseChange.updateCommitCount             = updateCommitCount
         ,pairwiseChange.updateCommitHashes            = updateCommitHashes
         ,pairwiseChange.updateCommitMinConfidence     = updateCommitMinConfidence
         ,pairwiseChange.updateCommitSupport           = updateCommitSupport
         ,pairwiseChange.updateCommitLift              = updateCommitLift
         ,pairwiseChange.updateCommitJaccardSimilarity = updateCommitJaccardSimilarity
 } IN TRANSACTIONS OF 500 ROWS
// Return one row with some statistics about the found pairs and their metrics
RETURN count(*) AS pairCount

      ,min(updateCommitCount)                  AS minCommitCount
      ,max(updateCommitCount)                  AS maxCommitCount
      ,avg(updateCommitCount)                  AS avgCommitCount
      ,percentileDisc(updateCommitCount, 0.5)  AS percentile50CommitCount
      ,percentileDisc(updateCommitCount, 0.9)  AS percentile90CommitCount
      ,percentileDisc(updateCommitCount, 0.95) AS percentile95CommitCount

      ,min(updateCommitMinConfidence)                  AS minMinConfidence
      ,max(updateCommitMinConfidence)                  AS maxMinConfidence
      ,avg(updateCommitMinConfidence)                  AS avgMinConfidence
      ,percentileDisc(updateCommitMinConfidence, 0.5)  AS percentile50MinConfidence
      ,percentileDisc(updateCommitMinConfidence, 0.9)  AS percentile90MinConfidence
      ,percentileDisc(updateCommitMinConfidence, 0.95) AS percentile95MinConfidence

      ,min(updateCommitLift)                  AS minLift
      ,max(updateCommitLift)                  AS maxLift
      ,avg(updateCommitLift)                  AS avgLift
      ,percentileDisc(updateCommitLift, 0.5)  AS percentile50Lift
      ,percentileDisc(updateCommitLift, 0.9)  AS percentile90Lift
      ,percentileDisc(updateCommitLift, 0.95) AS percentile95Lift

      ,min(updateCommitJaccardSimilarity)                  AS minJaccardSimilarity
      ,max(updateCommitJaccardSimilarity)                  AS maxJaccardSimilarity
      ,avg(updateCommitJaccardSimilarity)                  AS avgJaccardSimilarity
      ,percentileDisc(updateCommitJaccardSimilarity, 0.5)  AS percentile50JaccardSimilarity
      ,percentileDisc(updateCommitJaccardSimilarity, 0.9)  AS percentile90JaccardSimilarity
      ,percentileDisc(updateCommitJaccardSimilarity, 0.95) AS percentile95JaccardSimilarity