Skip to content

Commit 2976072

Browse files
committed
Optimize treemap with files that are often changed with others
1 parent 910029b commit 2976072

File tree

2 files changed

+137
-74
lines changed

2 files changed

+137
-74
lines changed
Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,16 @@
11
// List git files that were frequently changed with another file. Requires "Add_CHANGED_TOGETHER_WITH_relationships_to_git_files".
22

33
MATCH (firstGitFile:Git&File&!Repository)-[gitChange:CHANGED_TOGETHER_WITH]-(secondGitFile:Git&File&!Repository)
4+
WHERE elementId(firstGitFile) < elementId(secondGitFile)
45
MATCH (gitRepository:Git&Repository)-[:HAS_FILE]->(firstGitFile)
56
UNWIND gitChange.updateCommitHashes AS commitHash
67
WITH gitRepository.name + '/' + firstGitFile.relativePath AS filePath
78
,count(DISTINCT commitHash) AS commitCount
89
,sum(firstGitFile.updateCommitCount) AS fileUpdateCount
10+
,max(gitChange.updateCommitLift) AS maxLift
11+
,avg(gitChange.updateCommitLift) AS avgLift
912
WITH *
1013
// Out of all the times the file was touched, how often did it co-occur with other files?
1114
,CASE WHEN fileUpdateCount > 0 THEN toFloat(commitCount) / fileUpdateCount ELSE 0.0 END AS coChangeRate
12-
RETURN filePath, commitCount, coChangeRate
15+
RETURN filePath, commitCount, coChangeRate, maxLift, avgLift
1316
ORDER BY commitCount DESC

jupyter/GitHistoryGeneral.ipynb

Lines changed: 133 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -1169,16 +1169,101 @@
11691169
},
11701170
{
11711171
"cell_type": "markdown",
1172-
"id": "80bd7c28",
1172+
"id": "d8c6ccee",
11731173
"metadata": {},
11741174
"source": [
1175-
"### File changed frequently with other files"
1175+
"## Filecount per commit\n",
1176+
"\n",
1177+
"Shows how many commits had changed one file, how many had changed two files, and so on.\n",
1178+
"The chart is limited to 30 bars for improved readability.\n",
1179+
"The data preview also includes overall statistics including the number of commits that are filtered out in the chart."
1180+
]
1181+
},
1182+
{
1183+
"cell_type": "markdown",
1184+
"id": "ed53b6e5",
1185+
"metadata": {},
1186+
"source": [
1187+
"### Preview data"
11761188
]
11771189
},
11781190
{
11791191
"cell_type": "code",
11801192
"execution_count": null,
1181-
"id": "24055998",
1193+
"id": "5526e458",
1194+
"metadata": {},
1195+
"outputs": [],
1196+
"source": [
1197+
"git_file_count_per_commit = query_cypher_to_data_frame(\"../cypher/GitLog/List_git_files_per_commit_distribution.cypher\")\n",
1198+
"\n",
1199+
"print(\"Sum of commits that changed more than 30 files (each) = \" + str(git_file_count_per_commit[git_file_count_per_commit['filesPerCommit'] > 30]['commitCount'].sum()))\n",
1200+
"print(\"Max changed files with one commit = \" + str(git_file_count_per_commit['filesPerCommit'].max()))\n",
1201+
"display(git_file_count_per_commit.describe())\n",
1202+
"display(git_file_count_per_commit.head(30))"
1203+
]
1204+
},
1205+
{
1206+
"cell_type": "markdown",
1207+
"id": "dcea826e",
1208+
"metadata": {},
1209+
"source": [
1210+
"### Bar chart with the number of files per commit distribution"
1211+
]
1212+
},
1213+
{
1214+
"cell_type": "code",
1215+
"execution_count": null,
1216+
"id": "9e9dbc57",
1217+
"metadata": {},
1218+
"outputs": [],
1219+
"source": [
1220+
"if git_file_count_per_commit.empty:\n",
1221+
" print(\"No data to plot\")\n",
1222+
"else:\n",
1223+
" figure = plotly_graph_objects.Figure(plotly_graph_objects.Bar(\n",
1224+
" x=git_file_count_per_commit['filesPerCommit'].head(30), \n",
1225+
" y=git_file_count_per_commit['commitCount'].head(30)),\n",
1226+
" )\n",
1227+
" figure.update_layout(\n",
1228+
" **plotly_bar_layout_base_settings,\n",
1229+
" title='Changed files per commit',\n",
1230+
" xaxis_title='file count',\n",
1231+
" yaxis_title='commit count'\n",
1232+
" )\n",
1233+
" figure.show(**plotly_treemap_figure_show_settings)\n",
1234+
" if is_command_line_execution():\n",
1235+
" figure.write_image(**get_plotly_figure_write_image_settings(\"ChangedFilesPerCommit\"))"
1236+
]
1237+
},
1238+
{
1239+
"cell_type": "markdown",
1240+
"id": "322d6cf9",
1241+
"metadata": {},
1242+
"source": [
1243+
"## Pairwise Changed Files\n",
1244+
"\n",
1245+
"This section analyzes files that were changed together within the same commit and provides several metrics to quantify the strength of the co-change relationship:\n",
1246+
"\n",
1247+
"- **Commit Count**: The number of commits in which two files were changed together.\n",
1248+
"- **Commit Lift**: A ratio that indicates whether the co-change pattern is stronger than random chance, given how often each file changes.\n",
1249+
"- **Jaccard Similarity**: The ratio of commits involving either file that also involved both files.\n",
1250+
"\n",
1251+
"The following tables show the top pairwise changed files based on these metrics.\n",
1252+
"The following charts show how these metrics are distributed across pairs of files that were changed together."
1253+
]
1254+
},
1255+
{
1256+
"cell_type": "markdown",
1257+
"id": "4c081f85",
1258+
"metadata": {},
1259+
"source": [
1260+
"### Treemap with files changed frequently with others"
1261+
]
1262+
},
1263+
{
1264+
"cell_type": "code",
1265+
"execution_count": null,
1266+
"id": "30942bd4",
11821267
"metadata": {},
11831268
"outputs": [],
11841269
"source": [
@@ -1200,6 +1285,8 @@
12001285
" pairwiseChangeCommitCount=pd.NamedAgg(column=\"commitCount\", aggfunc=\"sum\"),\n",
12011286
" pairwiseChangeFileCount=pd.NamedAgg(column=\"filePath\", aggfunc=\"count\"),\n",
12021287
" pairwiseChangeAverageRate=pd.NamedAgg(column=\"coChangeRate\", aggfunc=\"mean\"),\n",
1288+
" pairwiseChangeMaxLift=pd.NamedAgg(column=\"maxLift\", aggfunc=\"max\"),\n",
1289+
" pairwiseChangeAverageLift=pd.NamedAgg(column=\"avgLift\", aggfunc=\"mean\"),\n",
12031290
")\n",
12041291
"data_to_display.reset_index(inplace=True)\n",
12051292
"\n",
@@ -1223,6 +1310,8 @@
12231310
"data_to_display['pairwiseChangeCommitCount'] = data_to_display['pairwiseChangeCommitCount'].fillna(0).astype(int)\n",
12241311
"data_to_display['pairwiseChangeFileCount'] = data_to_display['pairwiseChangeFileCount'].fillna(0).astype(int)\n",
12251312
"data_to_display['pairwiseChangeAverageRate'] = data_to_display['pairwiseChangeAverageRate'].fillna(0).astype(float)\n",
1313+
"data_to_display['pairwiseChangeMaxLift'] = data_to_display['pairwiseChangeMaxLift'].fillna(0).astype(float)\n",
1314+
"data_to_display['pairwiseChangeAverageLift'] = data_to_display['pairwiseChangeAverageLift'].fillna(0).astype(float)\n",
12261315
"data_to_display.reset_index(inplace=True)\n",
12271316
"\n",
12281317
"# Debug\n",
@@ -1233,7 +1322,7 @@
12331322
{
12341323
"cell_type": "code",
12351324
"execution_count": null,
1236-
"id": "19b5a98a",
1325+
"id": "1052776d",
12371326
"metadata": {},
12381327
"outputs": [],
12391328
"source": [
@@ -1251,96 +1340,67 @@
12511340
"))\n",
12521341
"figure.update_layout(\n",
12531342
" **plotly_treemap_layout_base_settings,\n",
1254-
" title='Co-Changing files in update commits',\n",
1343+
" title='Files that likely co-change with others in update commits',\n",
12551344
")\n",
12561345
"figure.show(**plotly_treemap_figure_show_settings)\n",
12571346
"if is_command_line_execution():\n",
12581347
" figure.write_image(**get_plotly_figure_write_image_settings(\"CoChangingFiles\"))"
12591348
]
12601349
},
1261-
{
1262-
"cell_type": "markdown",
1263-
"id": "d8c6ccee",
1264-
"metadata": {},
1265-
"source": [
1266-
"## Filecount per commit\n",
1267-
"\n",
1268-
"Shows how many commits had changed one file, how many had changed two files, and so on.\n",
1269-
"The chart is limited to 30 lines for improved readability.\n",
1270-
"The data preview also includes overall statistics including the number of commits that are filtered out in the chart."
1271-
]
1272-
},
1273-
{
1274-
"cell_type": "markdown",
1275-
"id": "ed53b6e5",
1276-
"metadata": {},
1277-
"source": [
1278-
"### Preview data"
1279-
]
1280-
},
12811350
{
12821351
"cell_type": "code",
12831352
"execution_count": null,
1284-
"id": "5526e458",
1353+
"id": "3ec95adf",
12851354
"metadata": {},
12861355
"outputs": [],
12871356
"source": [
1288-
"git_file_count_per_commit = query_cypher_to_data_frame(\"../cypher/GitLog/List_git_files_per_commit_distribution.cypher\")\n",
1357+
"data_to_display = add_quantile_limited_column(data_to_display, \"pairwiseChangeMaxLift\", 0.98)\n",
12891358
"\n",
1290-
"print(\"Sum of commits that changed more than 30 files (each) = \" + str(git_file_count_per_commit[git_file_count_per_commit['filesPerCommit'] > 30]['commitCount'].sum()))\n",
1291-
"print(\"Max changed files with one commit = \" + str(git_file_count_per_commit['filesPerCommit'].max()))\n",
1292-
"display(git_file_count_per_commit.describe())\n",
1293-
"display(git_file_count_per_commit.head(30))"
1294-
]
1295-
},
1296-
{
1297-
"cell_type": "markdown",
1298-
"id": "dcea826e",
1299-
"metadata": {},
1300-
"source": [
1301-
"### Bar chart with the number of files per commit distribution"
1359+
"figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n",
1360+
" create_treemap_commit_statistics_settings(data_to_display),\n",
1361+
" # Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n",
1362+
" # values = pairwise_changed_git_files['fileCount'],\n",
1363+
" marker=dict(\n",
1364+
" **plotly_treemap_marker_base_colorscale,\n",
1365+
" colors=data_to_display['pairwiseChangeMaxLift_limited'], \n",
1366+
" colorbar=dict(title=\"Co-Change Lift\"),\n",
1367+
" ),\n",
1368+
"))\n",
1369+
"figure.update_layout(\n",
1370+
" **plotly_treemap_layout_base_settings,\n",
1371+
" title='Co-Changing files in update commits max lift (1=random, >1=more than random, <1=less than random)',\n",
1372+
")\n",
1373+
"figure.show(**plotly_treemap_figure_show_settings)\n",
1374+
"if is_command_line_execution():\n",
1375+
" figure.write_image(**get_plotly_figure_write_image_settings(\"CoChangingFilesMaxLift\"))"
13021376
]
13031377
},
13041378
{
13051379
"cell_type": "code",
13061380
"execution_count": null,
1307-
"id": "9e9dbc57",
1381+
"id": "0e33b873",
13081382
"metadata": {},
13091383
"outputs": [],
13101384
"source": [
1311-
"if git_file_count_per_commit.empty:\n",
1312-
" print(\"No data to plot\")\n",
1313-
"else:\n",
1314-
" figure = plotly_graph_objects.Figure(plotly_graph_objects.Bar(\n",
1315-
" x=git_file_count_per_commit['filesPerCommit'].head(30), \n",
1316-
" y=git_file_count_per_commit['commitCount'].head(30)),\n",
1317-
" )\n",
1318-
" figure.update_layout(\n",
1319-
" **plotly_bar_layout_base_settings,\n",
1320-
" title='Changed files per commit',\n",
1321-
" xaxis_title='file count',\n",
1322-
" yaxis_title='commit count'\n",
1323-
" )\n",
1324-
" figure.show(**plotly_treemap_figure_show_settings)\n",
1325-
" if is_command_line_execution():\n",
1326-
" figure.write_image(**get_plotly_figure_write_image_settings(\"ChangedFilesPerCommit\"))"
1327-
]
1328-
},
1329-
{
1330-
"cell_type": "markdown",
1331-
"id": "322d6cf9",
1332-
"metadata": {},
1333-
"source": [
1334-
"## Pairwise Changed Files\n",
1335-
"\n",
1336-
"This section analyzes files that where changed together within the same commit and provides several metrics to quantify the strength of the co-change relationship:\n",
1385+
"data_to_display = add_quantile_limited_column(data_to_display, \"pairwiseChangeAverageLift\", 0.98)\n",
13371386
"\n",
1338-
"- **Commit Count**: The number of commits in which two files were changed together.\n",
1339-
"- **Commit Lift**: A ratio that indicates whether the co-change pattern is stronger than random chance, given how often each file changes.\n",
1340-
"- **Jaccard Similarity**: The ratio of commits involving either file that also involved both files.\n",
1341-
"\n",
1342-
"The following tables show the top pairwise changed files based on these metrics.\n",
1343-
"The following charts show how these metrics are distributed across pairs of files that were changed together."
1387+
"figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n",
1388+
" create_treemap_commit_statistics_settings(data_to_display),\n",
1389+
" # Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n",
1390+
" # values = pairwise_changed_git_files['fileCount'],\n",
1391+
" marker=dict(\n",
1392+
" **plotly_treemap_marker_base_colorscale,\n",
1393+
" colors=data_to_display['pairwiseChangeAverageLift_limited'], \n",
1394+
" colorbar=dict(title=\"Co-Change Lift\"),\n",
1395+
" ),\n",
1396+
"))\n",
1397+
"figure.update_layout(\n",
1398+
" **plotly_treemap_layout_base_settings,\n",
1399+
" title='Co-Changing files in update commits average lift (1=random, >1=more than random, <1=less than random)',\n",
1400+
")\n",
1401+
"figure.show(**plotly_treemap_figure_show_settings)\n",
1402+
"if is_command_line_execution():\n",
1403+
" figure.write_image(**get_plotly_figure_write_image_settings(\"CoChangingFilesAverageLift\"))"
13441404
]
13451405
},
13461406
{
@@ -1350,7 +1410,7 @@
13501410
"metadata": {},
13511411
"outputs": [],
13521412
"source": [
1353-
"# Initial steps: Function Declaration and Data Preparation"
1413+
"# Initial steps: Function Declaration and Data Preparation for co-change distribution analysis"
13541414
]
13551415
},
13561416
{

0 commit comments

Comments
 (0)