Skip to content

Commit a0f36a3

Browse files
authored
Merge pull request #419 from JohT/feature/classify_git_commits_and_ignore_automated_commits_in_co_change.analysis
Classify git commits and use only manual commits in co-change analysis
2 parents bf6816a + bc72c91 commit a0f36a3

File tree

7 files changed

+63
-62
lines changed

7 files changed

+63
-62
lines changed

conda-environment.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@ dependencies:
2222
- plotly=6.0.*
2323
- python-kaleido=0.2.* # To render plotly plots. Static image export for web-based visualization libraries.
2424
- scikit-learn=1.6.* # To try out this HDBSCAN implementation
25-
- seaborn=0.13 # To visualize clustering results
26-
- optuna=4.3.*
25+
- seaborn=0.13.* # To visualize clustering results
26+
- optuna=4.5.*
2727
- umap-learn=0.5.* # to visualize node embeddings in 2D (UMAP dimensionality reduction)
2828
- shap=0.48.*
2929
- pip:
30-
- neo4j==5.23.*
30+
- neo4j==5.28.*

cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,13 @@
33
// Determine global file count, global file count threshold (filter out refactoring commits) and global update commits
44
MATCH (git_commit_global:Git:Commit)-[:CONTAINS_CHANGE]->(:Git:Change)-[:UPDATES]->(git_file_global:Git:File)
55
WHERE git_file_global.deletedAt IS NULL
6+
AND git_commit_global.isManualCommit
67
WITH git_commit_global, count(DISTINCT git_file_global) AS commitFileCount
78
WITH percentileDisc(commitFileCount, 0.95) AS globalFileCountThreshold
89
,count(git_commit_global) AS globalUpdateCommitCount
910
// Main section
1011
MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change)-[:UPDATES]->(git_file:Git:File)
12+
WHERE git_commit.isManualCommit
1113
MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file)
1214
WHERE git_file.deletedAt IS NULL
1315
// Order files to ensure that pairs of distinct files are grouped together as (fileA, fileB) without the mirrored (fileB, fileA)

cypher/GitLog/List_pairwise_changed_files.cypher

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
// List pairs of files that were changed together. Requires Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher to run first.
22

33
MATCH (firstFile:Git:File)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile:Git:File)
4-
WHERE elementId(firstFile) < elementId(secondFile)
4+
WHERE firstFile.extension < secondFile.extension
5+
OR (firstFile.extension = secondFile.extension AND elementId(firstFile) < elementId(secondFile))
56
WITH *
67
,coalesce(firstFile.relativePath, firstFile.fileName) AS firstFileName
78
,coalesce(secondFile.relativePath, secondFile.fileName) AS secondFileName

cypher/GitLog/Set_commit_classification_properties.cypher

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
// Classify git commits and set the properties isMergeCommit, isBotAuthor, isMavenCommit, isAutomatedCommit (= isBotAuthor OR isMavenCommit) and isManualCommit (= NOT isAutomatedCommit).
2+
3+
MATCH (git_commit:Git:Commit)
4+
WITH git_commit,
5+
COUNT { (git_commit)-[:HAS_PARENT]->(:Git:Commit) } AS parentCount
6+
WITH git_commit,
7+
parentCount >= 2 AS isMergeCommit,
8+
git_commit.author CONTAINS '[bot]' AS isBotAuthor,
9+
git_commit.message STARTS WITH '[maven' AS isMavenCommit
10+
WITH git_commit,
11+
isMergeCommit,
12+
isBotAuthor,
13+
isMavenCommit,
14+
(isBotAuthor OR isMavenCommit) AS isAutomatedCommit
15+
SET git_commit.isMergeCommit = isMergeCommit,
16+
git_commit.isBotAuthor = isBotAuthor,
17+
git_commit.isMavenCommit = isMavenCommit,
18+
git_commit.isAutomatedCommit = isAutomatedCommit,
19+
git_commit.isManualCommit = NOT isAutomatedCommit
20+
RETURN count(git_commit) AS classifiedCommits
21+
// For Debugging:
22+
// ,isMergeCommit
23+
// ,isBotAuthor
24+
// ,isMavenCommit
25+
// ,isAutomatedCommit

jupyter/GitHistoryGeneral.ipynb

Lines changed: 21 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1426,19 +1426,22 @@
14261426
{
14271427
"cell_type": "code",
14281428
"execution_count": null,
1429-
"id": "8f874da0",
1429+
"id": "0da821b1",
14301430
"metadata": {},
14311431
"outputs": [],
14321432
"source": [
1433-
"def find_top_pairwise_changed_file_extensions(input_data: pd.DataFrame, top_n: int = 10) -> pd.Series:\n",
1433+
"def find_top_pairwise_changed_file_extensions(input_data: pd.DataFrame, top_n: int = 10) -> pd.DataFrame:\n",
14341434
" \"\"\"\n",
14351435
" Finds the top N pairwise changed file extensions based on commit count.\n",
14361436
" input_data : pd.DataFrame : DataFrame containing pairwise changed files with their pair counts and extensions\n",
14371437
" top_n : int : The number of top extensions to return\n",
14381438
" return : pd.DataFrame : DataFrame with the top N file extension pairs (fileExtensionPair) and their pair count (fileExtensionPairCount), sorted by pair count descending\n",
14391439
" \"\"\"\n",
1440-
" top_extensions = input_data.groupby('fileExtensionPair').aggregate({'filePairWithRelativePath': 'count'}).reset_index()\n",
1441-
" return top_extensions.sort_values(by='filePairWithRelativePath', ascending=False).reset_index(drop=True).head(top_n)['fileExtensionPair']"
1440+
" top_extensions = input_data.groupby('fileExtensionPair', observed=False).aggregate(\n",
1441+
" fileExtensionPairCount=pd.NamedAgg(column=\"filePairWithRelativePath\", aggfunc=\"count\")\n",
1442+
" ).reset_index()\n",
1443+
" \n",
1444+
" return top_extensions.sort_values(by='fileExtensionPairCount', ascending=False).reset_index(drop=True).head(top_n)"
14421445
]
14431446
},
14441447
{
@@ -1449,7 +1452,11 @@
14491452
"outputs": [],
14501453
"source": [
14511454
"top_pairwise_changed_file_extensions = find_top_pairwise_changed_file_extensions(pairwise_changed_git_files, top_n=4)\n",
1452-
"# Only keep the pairwise change files with the top file extensions\n",
1455+
"display(top_pairwise_changed_file_extensions)\n",
1456+
"\n",
1457+
"pairwise_changed_git_files = pairwise_changed_git_files.merge(top_pairwise_changed_file_extensions, on='fileExtensionPair')\n",
1458+
"\n",
1459+
"top_pairwise_changed_file_extensions = top_pairwise_changed_file_extensions['fileExtensionPair']\n",
14531460
"pairwise_changed_git_files = pairwise_changed_git_files[pairwise_changed_git_files['fileExtensionPair'].isin(top_pairwise_changed_file_extensions)]"
14541461
]
14551462
},
@@ -1471,7 +1478,7 @@
14711478
" return data_frame # Column already exists\n",
14721479
" \n",
14731480
" # Create a new rank column based on the specified column and group by the group column\n",
1474-
" data_frame[f\"{column_name}ExtensionRank\"] = data_frame.groupby('fileExtensionPair')[column_name].rank(ascending=False, method='dense').astype(int)\n",
1481+
" data_frame[f\"{column_name}ExtensionRank\"] = data_frame.groupby('fileExtensionPair', observed=False)[column_name].rank(ascending=False, method='dense').astype(int)\n",
14751482
" return data_frame"
14761483
]
14771484
},
@@ -1511,58 +1518,18 @@
15111518
" # Group by the file extensions and the metric and its rank.\n",
15121519
" # Since some entries might have the same metric value, we aggregate by the first file pair with relative path and the first file pair.\n",
15131520
" # This way we can pick the top n entries for each file extension pair.\n",
1514-
" grouping_columns = [\"fileExtensionPair\", metric_column, metric_column + \"ExtensionRank\"]\n",
1521+
" grouping_columns = [\"fileExtensionPairCount\", \"fileExtensionPair\", metric_column, metric_column + \"ExtensionRank\"]\n",
15151522
" grouped_data = filtered_data.groupby(grouping_columns).aggregate(\n",
15161523
" filePair=pd.NamedAgg(column=\"filePair\", aggfunc=\"first\"),\n",
15171524
" filePairWithRelativePath=pd.NamedAgg(column=\"filePairWithRelativePath\", aggfunc=\"first\"),\n",
15181525
" ).reset_index()\n",
15191526
" \n",
1520-
" return grouped_data.sort_values(by=grouping_columns, ascending=[True, False, True]).reset_index(drop=True).rename(columns={metric_column + \"ExtensionRank\": \"GroupRank\"})"
1521-
]
1522-
},
1523-
{
1524-
"cell_type": "code",
1525-
"execution_count": null,
1526-
"id": "3c34ceea",
1527-
"metadata": {},
1528-
"outputs": [],
1529-
"source": [
1530-
"# TODO delete if not needed anymore\n",
1531-
"\n",
1532-
"def display_table_for_top_pairwise_changed_file_extensions_deprecated(\n",
1533-
" data_to_display: pd.DataFrame, \n",
1534-
" top_pairwise_changed_file_extensions: pd.Series,\n",
1535-
" sort_column: str,\n",
1536-
" top_n: int = 10\n",
1537-
" ):\n",
1538-
" \"\"\"\n",
1539-
" Displays a table for each top pairwise changed file extension.\n",
1540-
" data_to_plot : pd.DataFrame : DataFrame containing pairwise changed files with their commit counts\n",
1541-
" top_pairwise_changed_file_extensions : pd.Series : Series with top N file extension pairs sorted by their pair count descending\n",
1542-
" sort_column : str : The column to sort the data by (default is \"pairwiseChangeCommitCount\")\n",
1543-
" top_n : int : The number of top entries to display for each extension (default is 10)\n",
1544-
" \"\"\"\n",
1545-
" \n",
1546-
" if data_to_display.empty:\n",
1547-
" print(\"No data to display\")\n",
1548-
" return\n",
1549-
" \n",
1550-
" if top_pairwise_changed_file_extensions.empty:\n",
1551-
" print(\"No top pairwise changed file extensions to display\")\n",
1552-
" return\n",
1553-
"\n",
1554-
" # Display each top pairwise changed file extension with its corresponding data\n",
1555-
" selected_columns = [\"fileExtensionPair\", \"filePair\", sort_column, \"filePairWithRelativePath\"]\n",
1556-
" data_to_display = data_to_display[selected_columns]\n",
1557-
" \n",
1558-
" combined_data_for_top_extensions = pd.DataFrame().reindex_like(data_to_display.head(0)) # Create an empty DataFrame with the same columns as data_to_display\n",
1559-
" \n",
1560-
" for _, extension in enumerate(top_pairwise_changed_file_extensions, start=1):\n",
1561-
" filtered_data = data_to_display[data_to_display[\"fileExtensionPair\"] == extension]\n",
1562-
" sorted_data = filtered_data.sort_values(by=sort_column, ascending=False).head(top_n).reset_index()\n",
1563-
" combined_data_for_top_extensions = pd.concat([combined_data_for_top_extensions, sorted_data], ignore_index=True)\n",
1564-
" \n",
1565-
" display(combined_data_for_top_extensions)"
1527+
" return (grouped_data\n",
1528+
" .sort_values(by=grouping_columns, ascending=[False, True, False, True])\n",
1529+
" .reset_index(drop=True)\n",
1530+
" .rename(columns={metric_column + \"ExtensionRank\": \"GroupRank\"})\n",
1531+
" .drop(columns=['fileExtensionPairCount'])\n",
1532+
" )"
15661533
]
15671534
},
15681535
{
@@ -1598,7 +1565,7 @@
15981565
" rows=sub_plot_rows, \n",
15991566
" cols=sub_plot_columns, \n",
16001567
" subplot_titles=top_pairwise_changed_file_extensions,\n",
1601-
" vertical_spacing=0.04, \n",
1568+
" vertical_spacing=0.06, \n",
16021569
" horizontal_spacing=0.04\n",
16031570
" )\n",
16041571
"\n",

requirements.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,15 +17,15 @@ typing-extensions==4.12.* # Needed for opentsne and Python >= 3.12
1717
wordcloud==1.9.*
1818
monotonic==1.*
1919
plotly[kaleido]==6.2.*
20-
seaborn==0.13 # To visualize clustering results
20+
seaborn==0.13.* # To visualize clustering results
2121

2222
# --- Machine Learning / Optimization ---
2323
scikit-learn==1.6.*
24-
optuna==4.3.*
24+
optuna==4.5.*
2525
umap-learn==0.5.* # Dimensionality reduction to visualize node embeddings in 2D
2626

2727
# --- Database connector ---
28-
neo4j==5.23.*
28+
neo4j==5.28.*
2929

3030
# --- Native/scientific packages (may require compilation) ---
3131
# These are included but may cause install errors in pip/venv

scripts/importGit.sh

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,10 @@ commonPostGitImport() {
141141
postGitLogImport() {
142142
echo "importGit: Add numberOfGitCommits property to nodes with matching file names..."
143143
execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_number_of_git_log_commits.cypher"
144-
144+
145+
echo "importGit: Classify git commits (e.g. isMergeCommit, isAutomatedCommit)..."
146+
execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_commit_classification_properties.cypher"
147+
145148
commonPostGitImport
146149
}
147150

@@ -157,6 +160,9 @@ postGitPluginImport() {
157160
execute_cypher "${GIT_LOG_CYPHER_DIR}/Index_file_relative_path.cypher"
158161
execute_cypher "${GIT_LOG_CYPHER_DIR}/Index_absolute_file_name.cypher"
159162

163+
echo "importGit: Classify git commits (e.g. isMergeCommit, isAutomatedCommit)..."
164+
execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_commit_classification_properties.cypher"
165+
160166
echo "importGit: Add numberOfGitCommits property to nodes with matching file names..."
161167
execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_number_of_git_plugin_commits.cypher"
162168
echo "importGit: Add updateCommitCount property to file nodes and code nodes with matching file names..."

0 commit comments

Comments
 (0)