Skip to content

Commit 5a29a51

Browse files
committed
Calculate min confidence of two files changing together (normalized co-change count)
1 parent 679e21e commit 5a29a51

9 files changed

+160
-60
lines changed

cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_code_files.cypher

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
// Take the already existing "CHANGED_TOGETHER_WITH" relationship between git files and apply it to resolved file nodes. Requires "Add_CHANGED_TOGETHER_WITH_relationships_to_git_files".
22

33
MATCH (firstGitFile:Git&File&!Repository)-[gitChange:CHANGED_TOGETHER_WITH]-(secondGitFile:Git&File&!Repository)
4-
WHERE elementId(firstGitFile) < elementId(secondGitFile)
4+
//De-duplicating the pairs of files isn't necessary, because the dependency relation is directed.
5+
//WHERE elementId(firstGitFile) < elementId(secondGitFile)
56
MATCH (firstGitFile)-[:RESOLVES_TO]->(firstCodeFile:File&!Git&!Repository)
67
MATCH (secondGitFile)-[:RESOLVES_TO]->(secondCodeFile:File&!Git&!Repository)
78
CALL (firstCodeFile, secondCodeFile, gitChange) {
Lines changed: 82 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,104 @@
11
// Connect git files that were changed together frequently with "CHANGED_TOGETHER_WITH"
22

3-
MATCH (global_git_commit:Git:Commit)
4-
WITH count(global_git_commit) AS globalCommitCount
3+
// Determine global file count, global file count threshold (filter out refactoring commits) and global update commits
4+
MATCH (git_commit_global:Git:Commit)-[:CONTAINS_CHANGE]->(:Git:Change)-[:UPDATES]->(git_file_global:Git:File)
5+
WHERE git_file_global.deletedAt IS NULL
6+
WITH git_commit_global, count(DISTINCT git_file_global) AS commitFileCount
7+
WITH percentileDisc(commitFileCount, 0.95) AS globalFileCountThreshold
8+
,count(git_commit_global) AS globalUpdateCommitCount
9+
// Main section
510
MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change)-[:UPDATES]->(git_file:Git:File)
611
MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file)
712
WHERE git_file.deletedAt IS NULL
813
// Order files to ensure that pairs of distinct files are grouped together as (fileA, fileB) without also producing (fileB, fileA)
914
ORDER BY git_commit.sha, git_file.relativePath
10-
WITH globalCommitCount
15+
WITH globalFileCountThreshold
16+
,globalUpdateCommitCount
1117
,git_commit.sha AS commitHash
1218
,collect(DISTINCT git_file) AS filesInCommit
1319
// Limit the file count to min. 2 (changed together) and
1420
// max. the global file count threshold, i.e. the 95th percentile of files per commit; previously a fixed 50 (reduce permutations, improve performance, filter out large refactorings that usually affect many files)
1521
WHERE size(filesInCommit) >= 2
16-
AND size(filesInCommit) <= 50
22+
AND size(filesInCommit) <= globalFileCountThreshold
1723
// Collect distinct pairwise (..., 2, 2) combinations of all files in the list
18-
WITH globalCommitCount
24+
WITH globalFileCountThreshold
25+
,globalUpdateCommitCount
1926
,commitHash
2027
,apoc.coll.combinations(filesInCommit, 2, 2) AS fileCombinations
2128
UNWIND fileCombinations AS fileCombination
22-
WITH globalCommitCount
29+
WITH globalFileCountThreshold
30+
,globalUpdateCommitCount
2331
,fileCombination
24-
,count(DISTINCT commitHash) AS commitCount
25-
,collect(DISTINCT commitHash) AS commitHashes
32+
,count(DISTINCT commitHash) AS updateCommitCount
33+
,collect(DISTINCT commitHash) AS updateCommitHashes
34+
// Deactivated:
2635
// Filter out file pairs that were not changed together very often
2736
// In detail: Keep only pairs with more than 1 per mille (0.1 percent, factor 0.001) of the overall commit count
28-
WHERE commitCount > globalCommitCount * 0.001
29-
WITH fileCombination[0] AS firstFile
37+
// WHERE updateCommitCount > globalUpdateCommitCount * 0.001
38+
WITH *
39+
,fileCombination[0] AS firstFile
3040
,fileCombination[1] AS secondFile
31-
,commitCount
32-
,commitHashes
33-
// Create the new relationship "CHANGED_TOGETHER_WITH" and set the property "commitCount" on it
34-
CALL (firstFile, secondFile, commitCount, commitHashes) {
41+
WITH *
42+
// Get the lowest number of git update commits of both files (file pair)
43+
,CASE WHEN firstFile.updateCommitCount < secondFile.updateCommitCount
44+
THEN firstFile.updateCommitCount
45+
ELSE secondFile.updateCommitCount
46+
END AS minUpdateCommitCount
47+
// Calculate update commit support by dividing the update commit count by the overall commit count for both files
48+
,toFloat(firstFile.updateCommitCount) / globalUpdateCommitCount AS firstFileUpdateSupport
49+
,toFloat(secondFile.updateCommitCount) / globalUpdateCommitCount AS secondFileUpdateSupport
50+
WITH *
51+
// Expected likelihood that the first and the second file change together given complete randomness
52+
,firstFileUpdateSupport * secondFileUpdateSupport AS expectedCoUpdateSupport
53+
WITH firstFile
54+
,secondFile
55+
,updateCommitHashes
56+
,updateCommitCount
57+
// Out of all the times the less frequently changed file was touched, how often did it co-occur with the other file?
58+
,toFloat(updateCommitCount) / minUpdateCommitCount AS updateCommitMinConfidence
59+
// Compared to all commits in general, how high is the percentage of the commits where both files changed together?
60+
,toFloat(updateCommitCount) / globalUpdateCommitCount AS updateCommitSupport
61+
// Lift
62+
,toFloat(updateCommitCount) / (globalUpdateCommitCount * expectedCoUpdateSupport) AS updateCommitLift
63+
// Jaccard Similarity: Of all commits involving either file, how many involved both?
64+
,toFloat(updateCommitCount) / (firstFile.updateCommitCount + secondFile.updateCommitCount - updateCommitCount) AS updateCommitJaccardSimilarity
65+
// Create the new relationship "CHANGED_TOGETHER_WITH" and set the property "updateCommitCount" on it
66+
CALL (firstFile, secondFile, updateCommitCount, updateCommitHashes, updateCommitMinConfidence, updateCommitSupport, updateCommitLift, updateCommitJaccardSimilarity) {
3567
MERGE (firstFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile)
36-
SET pairwiseChange.commitCount = commitCount
37-
,pairwiseChange.commitHashes = commitHashes
38-
} IN TRANSACTIONS
68+
SET pairwiseChange.updateCommitCount = updateCommitCount
69+
,pairwiseChange.updateCommitHashes = updateCommitHashes
70+
,pairwiseChange.updateCommitMinConfidence = updateCommitMinConfidence
71+
,pairwiseChange.updateCommitSupport = updateCommitSupport
72+
,pairwiseChange.updateCommitLift = updateCommitLift
73+
,pairwiseChange.updateCommitJaccardSimilarity = updateCommitJaccardSimilarity
74+
} IN TRANSACTIONS OF 500 ROWS
3975
// Return one row with some statistics about the found pairs and their commit counts
40-
RETURN max(commitCount) AS maxCommitCount
41-
,avg(commitCount) AS avgCommitCount
42-
,percentileDisc(commitCount, 0.5) AS percentile50CommitCount
43-
,percentileDisc(commitCount, 0.9) AS percentile90CommitCount
44-
,percentileDisc(commitCount, 0.95) AS percentile95CommitCount
45-
,count(*) AS pairCount
76+
RETURN count(*) AS pairCount
77+
78+
,min(updateCommitCount) AS minCommitCount
79+
,max(updateCommitCount) AS maxCommitCount
80+
,avg(updateCommitCount) AS avgCommitCount
81+
,percentileDisc(updateCommitCount, 0.5) AS percentile50CommitCount
82+
,percentileDisc(updateCommitCount, 0.9) AS percentile90CommitCount
83+
,percentileDisc(updateCommitCount, 0.95) AS percentile95CommitCount
84+
85+
,min(updateCommitMinConfidence) AS minMinConfidence
86+
,max(updateCommitMinConfidence) AS maxMinConfidence
87+
,avg(updateCommitMinConfidence) AS avgMinConfidence
88+
,percentileDisc(updateCommitMinConfidence, 0.5) AS percentile50MinConfidence
89+
,percentileDisc(updateCommitMinConfidence, 0.9) AS percentile90MinConfidence
90+
,percentileDisc(updateCommitMinConfidence, 0.95) AS percentile95MinConfidence
91+
92+
,min(updateCommitLift) AS minLift
93+
,max(updateCommitLift) AS maxLift
94+
,avg(updateCommitLift) AS avgLift
95+
,percentileDisc(updateCommitLift, 0.5) AS percentile50Lift
96+
,percentileDisc(updateCommitLift, 0.9) AS percentile90Lift
97+
,percentileDisc(updateCommitLift, 0.95) AS percentile95Lift
98+
99+
,min(updateCommitJaccardSimilarity) AS minJaccardSimilarity
100+
,max(updateCommitJaccardSimilarity) AS maxJaccardSimilarity
101+
,avg(updateCommitJaccardSimilarity) AS avgJaccardSimilarity
102+
,percentileDisc(updateCommitJaccardSimilarity, 0.5) AS percentile50JaccardSimilarity
103+
,percentileDisc(updateCommitJaccardSimilarity, 0.9) AS percentile90JaccardSimilarity
104+
,percentileDisc(updateCommitJaccardSimilarity, 0.95) AS percentile95JaccardSimilarity

cypher/GitLog/List_git_files_that_were_changed_together.cypher

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,5 +6,5 @@ MATCH (gitRepository:Git&Repository)-[:HAS_FILE]->(firstGitFile)
66
MATCH (gitRepository:Git&Repository)-[:HAS_FILE]->(secondGitFile)
77
RETURN gitRepository.name + '/' + firstGitFile.relativePath AS firstFile
88
,gitRepository.name + '/' + secondGitFile.relativePath AS secondFile
9-
,gitChange.commitCount AS commitCount
9+
,gitChange.updateCommitCount AS commitCount
1010
ORDER BY commitCount DESC

cypher/GitLog/List_git_files_that_were_changed_together_with_another_file.cypher

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,11 @@
33
MATCH (firstGitFile:Git&File&!Repository)-[gitChange:CHANGED_TOGETHER_WITH]-(secondGitFile:Git&File&!Repository)
44
MATCH (gitRepository:Git&Repository)-[:HAS_FILE]->(firstGitFile)
55
UNWIND gitChange.commitHashes AS commitHash
6-
RETURN gitRepository.name + '/' + firstGitFile.relativePath AS filePath
6+
WITH gitRepository.name + '/' + firstGitFile.relativePath AS filePath
77
,count(DISTINCT commitHash) AS commitCount
8+
,sum(firstGitFile.updateCommitCount) AS fileUpdateCount
9+
WITH *
10+
// Out of all the times the file was touched, how often did it co-occur with other files?
11+
,CASE WHEN fileUpdateCount > 0 THEN toFloat(commitCount) / fileUpdateCount ELSE 0.0 END AS coChangeRate
12+
RETURN filePath, commitCount, coChangeRate
813
ORDER BY commitCount DESC

cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher

Lines changed: 6 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -4,36 +4,16 @@ MATCH (firstCodeFile:File)-[dependency:DEPENDS_ON]->(secondCodeFile:File)
44
MATCH (firstCodeFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondCodeFile)
55
//De-duplicating the pairs of files isn't necessary, because the dependency relation is directed.
66
//WHERE elementId(firstCodeFile) < elementId(secondCodeFile)
7-
WITH firstCodeFile.fileName AS firstFileName
8-
,secondCodeFile.fileName AS secondFileName
7+
WITH firstCodeFile.fileName AS firstFileName
8+
,secondCodeFile.fileName AS secondFileName
99
,coalesce(dependency.weight, dependency.cardinality) AS dependencyWeight
10-
,pairwiseChange.commitCount AS commitCount
10+
,pairwiseChange.updateCommitCount AS commitCount
11+
,pairwiseChange.updateCommitMinConfidence AS updateCommitMinConfidence
1112
,dependency.fileDistanceAsFewestChangeDirectoryCommands AS fileDistanceAsFewestChangeDirectoryCommands
1213
RETURN dependencyWeight
1314
,commitCount
15+
,updateCommitMinConfidence
1416
,fileDistanceAsFewestChangeDirectoryCommands
1517
// ,count(*) AS occurrences
1618
// ,collect(firstFileName + ' -> ' + secondFileName)[0..3] AS examples
17-
ORDER BY dependencyWeight, commitCount
18-
19-
// MATCH (firstCodeFile:File)-[dependency:DEPENDS_ON]->(secondCodeFile:File)
20-
// MATCH (firstCodeFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondCodeFile)
21-
// WHERE elementId(firstCodeFile) < elementId(secondCodeFile)
22-
// RETURN firstCodeFile.fileName AS firstFileName
23-
// ,secondCodeFile.fileName AS secondFileName
24-
// ,dependency.weight AS dependencyWeight
25-
// ,pairwiseChange.commitCount AS commitCount
26-
// ORDER BY dependencyWeight, commitCount
27-
28-
// MATCH (g1:!Git&File)-[relation:CHANGED_TOGETHER_WITH|DEPENDS_ON]-(g2:!Git&File)
29-
// WITH count(DISTINCT relation) AS relatedFilesCount
30-
// ,collect(DISTINCT relation) AS relations
31-
// UNWIND relations AS relation
32-
// WITH relatedFilesCount
33-
// ,coalesce(relation.commitCount, 0) AS commitCount
34-
// ,coalesce(relation.weight, 0) AS dependencyWeight
35-
// ,coalesce(relation.fileDistanceAsFewestChangeDirectoryCommands, 0) AS fileDistanceAsFewestChangeDirectoryCommands
36-
// RETURN dependencyWeight
37-
// ,commitCount
38-
// ,fileDistanceAsFewestChangeDirectoryCommands
39-
// ORDER BY dependencyWeight, commitCount
19+
ORDER BY dependencyWeight, commitCount
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
// Set updateCommitCount property on Git File nodes when git commits with Update modifier (detected by the plugin) are present
2+
3+
MATCH (git_file:File&Git)<-[:UPDATES]-(:Git&Change)<-[:CONTAINS_CHANGE]-(git_commit:Git&Commit)
4+
WHERE git_file.deletedAt IS NULL
5+
WITH git_file, count(DISTINCT git_commit.sha) AS updateCommitCount
6+
SET git_file.updateCommitCount = updateCommitCount
7+
WITH git_file, updateCommitCount
8+
MATCH (code_file:File&!Git)<-[:RESOLVES_TO]-(git_file)
9+
SET code_file.updateCommitCount = updateCommitCount
10+
RETURN count(DISTINCT code_file) AS codeFileUpdates
11+
,collect(DISTINCT code_file.name)[0..4] AS codeFileExample
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
// Verify if CHANGED_TOGETHER_WITH properties from git are missing
2+
3+
MATCH (firstCodeFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondCodeFile)
4+
RETURN (pairwiseChange.updateCommitCount IS NULL) AS updateCommitCountMissing
5+
,(pairwiseChange.updateCommitMinConfidence IS NULL) AS updateCommitMinConfidenceMissing
6+
,count(*)

jupyter/GitHistoryGeneral.ipynb

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1198,6 +1198,7 @@
11981198
"pairwise_changed_git_files = pairwise_changed_git_files.groupby(['directoryPath']).aggregate(\n",
11991199
" pairwiseChangeCommitCount=pd.NamedAgg(column=\"commitCount\", aggfunc=\"sum\"),\n",
12001200
" pairwiseChangeFileCount=pd.NamedAgg(column=\"filePath\", aggfunc=\"count\"),\n",
1201+
" pairwiseChangeAverageRate=pd.NamedAgg(column=\"coChangeRate\", aggfunc=\"mean\"),\n",
12011202
")\n",
12021203
"pairwise_changed_git_files.reset_index(inplace=True)\n",
12031204
"\n",
@@ -1220,6 +1221,7 @@
12201221
"\n",
12211222
"pairwise_changed_git_files['pairwiseChangeCommitCount'] = pairwise_changed_git_files['pairwiseChangeCommitCount'].fillna(0).astype(int)\n",
12221223
"pairwise_changed_git_files['pairwiseChangeFileCount'] = pairwise_changed_git_files['pairwiseChangeFileCount'].fillna(0).astype(int)\n",
1224+
"pairwise_changed_git_files['pairwiseChangeAverageRate'] = pairwise_changed_git_files['pairwiseChangeAverageRate'].fillna(0).astype(float)\n",
12231225
"pairwise_changed_git_files.reset_index(inplace=True)\n",
12241226
"\n",
12251227
"# Debug\n",
@@ -1399,7 +1401,13 @@
13991401
" display(pearsonr(pairwise_changed_git_files_with_dependencies['commitCount'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))\n",
14001402
"\n",
14011403
" display(\"Spearman Correlation with p-value for commitCount and dependencyWeight\")\n",
1402-
" display(spearmanr(pairwise_changed_git_files_with_dependencies['commitCount'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))"
1404+
" display(spearmanr(pairwise_changed_git_files_with_dependencies['commitCount'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))\n",
1405+
"\n",
1406+
" display(\"Pearson Correlation with p-value for updateCommitMinConfidence and dependencyWeight\")\n",
1407+
" display(pearsonr(pairwise_changed_git_files_with_dependencies['updateCommitMinConfidence'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))\n",
1408+
"\n",
1409+
" display(\"Spearman Correlation with p-value for updateCommitMinConfidence and dependencyWeight\")\n",
1410+
" display(spearmanr(pairwise_changed_git_files_with_dependencies['updateCommitMinConfidence'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))"
14031411
]
14041412
},
14051413
{
@@ -1431,6 +1439,34 @@
14311439
" figure.write_image(**get_plotly_figure_write_image_settings(\"PairwiseChangedFilesVsDependencyWeight\"))"
14321440
]
14331441
},
1442+
{
1443+
"cell_type": "code",
1444+
"execution_count": null,
1445+
"id": "75264b82",
1446+
"metadata": {},
1447+
"outputs": [],
1448+
"source": [
1449+
"# Scatter plot of all pairs of files with their min confidence (normalized update commit count) on the x axis and dependency weight on the y axis\n",
1450+
"\n",
1451+
"if pairwise_changed_git_files_with_dependencies.empty:\n",
1452+
" print(\"No data to plot\")\n",
1453+
"else:\n",
1454+
" figure = plotly_graph_objects.Figure(plotly_graph_objects.Scatter(\n",
1455+
" x=pairwise_changed_git_files_with_dependencies['updateCommitMinConfidence'], \n",
1456+
" y=pairwise_changed_git_files_with_dependencies['dependencyWeight'],\n",
1457+
" mode='markers',\n",
1458+
" ))\n",
1459+
" figure.update_layout(\n",
1460+
" **plotly_bar_layout_base_settings,\n",
1461+
" title='Pairwise changed files: Min confidence co-change rate vs. dependency weight',\n",
1462+
" xaxis_title='co-change rate (min confidence, normalized update commit count)',\n",
1463+
" yaxis_title='dependency weight',\n",
1464+
" )\n",
1465+
" figure.show(**plotly_treemap_figure_show_settings)\n",
1466+
" if is_command_line_execution():\n",
1467+
" figure.write_image(**get_plotly_figure_write_image_settings(\"PairwiseChangedFilesVsDependencyWeight\"))"
1468+
]
1469+
},
14341470
{
14351471
"cell_type": "markdown",
14361472
"id": "14e87aff",

scripts/importGit.sh

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -134,13 +134,14 @@ commonPostGitImport() {
134134
echo "importGit: Running verification queries for troubleshooting (non failing)..."
135135
execute_cypher "${GIT_LOG_CYPHER_DIR}/Verify_git_to_code_file_unambiguous.cypher"
136136
execute_cypher "${GIT_LOG_CYPHER_DIR}/Verify_code_to_git_file_unambiguous.cypher"
137+
execute_cypher "${GIT_LOG_CYPHER_DIR}/Verify_git_missing_CHANGED_TOGETHER_WITH_properties.cypher"
137138
}
138139

139140
postGitLogImport() {
140-
commonPostGitImport
141-
142141
echo "importGit: Add numberOfGitCommits property to nodes with matching file names..."
143142
execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_number_of_git_log_commits.cypher"
143+
144+
commonPostGitImport
144145
}
145146

146147
postGitPluginImport() {
@@ -149,24 +150,25 @@ postGitPluginImport() {
149150
# TODO: The deletion of all plain files in the "/.git" directory is needed
150151
# until there is a way to exclude all files inside a directory
151152
# while still being able to get them analyzed by the git plugin.
152-
# This would most likely be solved with https://github.com/jQAssistant/jqassistant/issues/410
153153
execute_cypher "${GIT_LOG_CYPHER_DIR}/Delete_plain_git_directory_file_nodes.cypher"
154154
execute_cypher "${GIT_LOG_CYPHER_DIR}/Index_commit_sha.cypher"
155155
execute_cypher "${GIT_LOG_CYPHER_DIR}/Index_file_name.cypher"
156156
execute_cypher "${GIT_LOG_CYPHER_DIR}/Index_file_relative_path.cypher"
157157
execute_cypher "${GIT_LOG_CYPHER_DIR}/Index_absolute_file_name.cypher"
158158

159-
commonPostGitImport
160-
161159
echo "importGit: Add numberOfGitCommits property to nodes with matching file names..."
162160
execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_number_of_git_plugin_commits.cypher"
163-
}
161+
echo "importGit: Add updateCommitCount property to file nodes and code nodes with matching file names..."
162+
execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_number_of_git_plugin_update_commits.cypher"
164163

165-
postAggregatedGitLogImport() {
166164
commonPostGitImport
165+
}
167166

167+
postAggregatedGitLogImport() {
168168
echo "importGit: Add numberOfGitCommits property to nodes with matching file names..."
169169
execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_number_of_aggregated_git_commits.cypher"
170+
171+
commonPostGitImport
170172
}
171173

172174
# Create import directory in case it doesn't exist.

0 commit comments

Comments
 (0)