From 5242804ad517b82b928e7ebd87c9d64b1d2f8a0e Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Sat, 12 Apr 2025 19:03:41 +0200 Subject: [PATCH 1/2] Fix missing git changes due to not reliably present label --- ...Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher | 3 ++- ...List_git_files_that_were_changed_together_all_in_one.cypher | 2 +- ...t_were_changed_together_with_another_file_all_in_one.cypher | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher b/cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher index 5836cf46b..362766b55 100644 --- a/cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher +++ b/cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher @@ -2,7 +2,8 @@ MATCH (global_git_commit:Git:Commit) WITH count(global_git_commit) AS globalCommitCount -MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change:Update)-[:UPDATES]->(git_file:Git:File)MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file) +MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change)-[:UPDATES]->(git_file:Git:File) +MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file) WHERE git_file.deletedAt IS NULL // Order files to assure, that pairs of distinct files are grouped together (fileA, fileB) without (fileB, fileA) ORDER BY git_commit.sha, git_file.relativePath diff --git a/cypher/GitLog/List_git_files_that_were_changed_together_all_in_one.cypher b/cypher/GitLog/List_git_files_that_were_changed_together_all_in_one.cypher index 9eb91fa79..6b690ed8e 100644 --- a/cypher/GitLog/List_git_files_that_were_changed_together_all_in_one.cypher +++ b/cypher/GitLog/List_git_files_that_were_changed_together_all_in_one.cypher @@ -2,7 +2,7 @@ MATCH (global_git_commit:Git:Commit) WITH count(global_git_commit) AS globalCommitCount -MATCH 
(git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change:Update)-[:UPDATES]->(git_file:Git:File)MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file) +MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change)-[:UPDATES]->(git_file:Git:File)MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file) MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file) WHERE git_file.deletedAt IS NULL WITH *, git_repository.name + '/' + git_file.relativePath AS filePath diff --git a/cypher/GitLog/List_git_files_that_were_changed_together_with_another_file_all_in_one.cypher b/cypher/GitLog/List_git_files_that_were_changed_together_with_another_file_all_in_one.cypher index 236ee8374..e07ea2271 100644 --- a/cypher/GitLog/List_git_files_that_were_changed_together_with_another_file_all_in_one.cypher +++ b/cypher/GitLog/List_git_files_that_were_changed_together_with_another_file_all_in_one.cypher @@ -2,7 +2,7 @@ MATCH (global_git_commit:Git:Commit) WITH count(global_git_commit) AS globalCommitCount -MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change:Update)-[:UPDATES]->(git_file:Git:File) +MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change)-[:UPDATES]->(git_file:Git:File) MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file) WHERE git_file.deletedAt IS NULL WITH *, git_repository.name + '/' + git_file.relativePath AS filePath From 7e5886904bcfe503a73dfba654aa972418f064b0 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Thu, 10 Apr 2025 07:57:00 +0200 Subject: [PATCH 2/2] Compare pairwise changed files with their dependency weights --- ...ise_changed_files_with_dependencies.cypher | 38 ++++++++ jupyter/GitHistoryGeneral.ipynb | 95 +++++++++++++++++++ 2 files changed, 133 insertions(+) create mode 100644 cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher diff --git a/cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher 
b/cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher new file mode 100644 index 000000000..ee752248a --- /dev/null +++ b/cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher @@ -0,0 +1,38 @@ +// List pairs of files that were changed together and that have a declared dependency between them. + +MATCH (firstCodeFile:File)-[dependency:DEPENDS_ON]->(secondCodeFile:File) +MATCH (firstCodeFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondCodeFile) +WHERE elementId(firstCodeFile) < elementId(secondCodeFile) + WITH firstCodeFile.fileName AS firstFileName + ,secondCodeFile.fileName AS secondFileName + ,coalesce(dependency.weight, dependency.cardinality) AS dependencyWeight + ,pairwiseChange.commitCount AS commitCount + ,dependency.fileDistanceAsFewestChangeDirectoryCommands AS fileDistanceAsFewestChangeDirectoryCommands +RETURN dependencyWeight + ,commitCount + ,fileDistanceAsFewestChangeDirectoryCommands + // ,count(*) AS occurrences + // ,collect(firstFileName + ' -> ' + secondFileName)[0..3] AS examples +ORDER BY dependencyWeight, commitCount + +// MATCH (firstCodeFile:File)-[dependency:DEPENDS_ON]->(secondCodeFile:File) +// MATCH (firstCodeFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondCodeFile) +// WHERE elementId(firstCodeFile) < elementId(secondCodeFile) +// RETURN firstCodeFile.fileName AS firstFileName +// ,secondCodeFile.fileName AS secondFileName +// ,dependency.weight AS dependencyWeight +// ,pairwiseChange.commitCount AS commitCount +// ORDER BY dependencyWeight, commitCount + +// MATCH (g1:!Git&File)-[relation:CHANGED_TOGETHER_WITH|DEPENDS_ON]-(g2:!Git&File) +// WITH count(DISTINCT relation) AS relatedFilesCount +// ,collect(DISTINCT relation) AS relations +// UNWIND relations AS relation +// WITH relatedFilesCount +// ,coalesce(relation.commitCount, 0) AS commitCount +// ,coalesce(relation.weight, 0) AS dependencyWeight +// ,coalesce(relation.fileDistanceAsFewestChangeDirectoryCommands, 0) AS 
fileDistanceAsFewestChangeDirectoryCommands +// RETURN dependencyWeight +// ,commitCount +// ,fileDistanceAsFewestChangeDirectoryCommands +// ORDER BY dependencyWeight, commitCount diff --git a/jupyter/GitHistoryGeneral.ipynb b/jupyter/GitHistoryGeneral.ipynb index 4b34a65d9..4010bb5b0 100644 --- a/jupyter/GitHistoryGeneral.ipynb +++ b/jupyter/GitHistoryGeneral.ipynb @@ -1281,6 +1281,101 @@ " figure.show(**plotly_treemap_figure_show_settings)" ] }, + { + "cell_type": "markdown", + "id": "c15669ef", + "metadata": {}, + "source": [ + "## Pairwise Changed Files vs. Dependency Weight\n", + "\n", + "This section explores the correlation between how often pairs of files are changed together (common commit count) and their dependency weight. Note that these results should be interpreted cautiously, as comparing pairwise changes and dependencies is inherently challenging.\n", + "\n", + "### Considerations\n", + "- **Historical vs. Current State**: Pairwise changes reflect the entire git history, while dependency weight represents the current state of the codebase.\n", + "- **Commit Granularity**: Developers may use different commit strategies, such as squashing changes into a single commit or creating fine-grained commits. Ideally, each commit should represent a single semantic change for accurate analysis.\n", + "- **Dependency Representation**: Some file types (e.g., Java files with import statements) clearly define dependencies, while others (e.g., shell scripts, XML, YAML) lack explicit dependency relationships.\n", + "- **Repository Characteristics**: Repositories with generated code may have many large commits, while stabilized repositories may only update configuration files for dependency changes." 
+ ] + }, + { + "cell_type": "markdown", + "id": "98a2feea", + "metadata": {}, + "source": [ + "#### Data Preview" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a067f8e6", + "metadata": {}, + "outputs": [], + "source": [ + "pairwise_changed_git_files_with_dependencies = query_cypher_to_data_frame(\"../cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher\")\n", + "pairwise_changed_git_files_with_dependencies.head(20)" + ] + }, + { + "cell_type": "markdown", + "id": "01db2db9", + "metadata": {}, + "source": [ + "#### Data Statistics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9fe48db8", + "metadata": {}, + "outputs": [], + "source": [ + "display(\"Pairwise changed git files compared to dependency weights - Overall statistics\")\n", + "display(pairwise_changed_git_files_with_dependencies.describe())\n", + "\n", + "display(\"Pairwise changed git files compared to dependency weights - Pearson Correlation\")\n", + "display(pairwise_changed_git_files_with_dependencies.corr(method='pearson'))\n", + "\n", + "display(\"Pairwise changed git files compared to dependency weights - Spearman Correlation\")\n", + "display(pairwise_changed_git_files_with_dependencies.corr(method='spearman'))\n", + "\n", + "from scipy.stats import pearsonr, spearmanr\n", + "\n", + "display(\"Pearson Correlation with p-value for commitCount and dependencyWeight\")\n", + "display(pearsonr(pairwise_changed_git_files_with_dependencies['commitCount'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))\n", + "\n", + "display(\"Spearman Correlation with p-value for commitCount and dependencyWeight\")\n", + "display(spearmanr(pairwise_changed_git_files_with_dependencies['commitCount'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "747f9590", + "metadata": {}, + "outputs": [], + "source": [ + "# Scatter plot of all pairs of files 
with their commit count on the x axis and dependency weight on the y axis\n", + "\n", + "if pairwise_changed_git_files_with_dependencies.empty:\n", + " print(\"No data to plot\")\n", + "else:\n", + " figure = plotly_graph_objects.Figure(plotly_graph_objects.Scatter(\n", + " x=pairwise_changed_git_files_with_dependencies['commitCount'], \n", + " y=pairwise_changed_git_files_with_dependencies['dependencyWeight'],\n", + " mode='markers',\n", + " # marker=dict(size=pairwise_changed_git_files_with_dependencies['occurrences'] + 8)\n", + " ))\n", + " figure.update_layout(\n", + " **plotly_bar_layout_base_settings,\n", + " title='Pairwise changed files: Number of changes (commitCount) vs. dependency weight',\n", + " xaxis_title='commit count',\n", + " yaxis_title='dependency weight',\n", + " )\n", + " figure.show(**plotly_treemap_figure_show_settings)" + ] + }, { "cell_type": "markdown", "id": "14e87aff",