Merge pull request #419 from JohT/feature/classify_git_commits_and_ignore_automated_commits_in_co_change.analysis

JohT · web-flow · commit a0f36a3f97ea · 2025-08-26T08:23:29.000+02:00
Classify git commits and use only manual commits in co-change analysis
diff --git a/conda-environment.yml b/conda-environment.yml
@@ -22,9 +22,9 @@ dependencies:
   - plotly=6.0.*
   - python-kaleido=0.2.* # To render plotly plots. Static image export for web-based visualization libraries.
   - scikit-learn=1.6.* # To try out this HDBSCAN implementation 
-  - seaborn=0.13 # To visualize clustering results
-  - optuna=4.3.*
+  - seaborn=0.13.* # To visualize clustering results
+  - optuna=4.5.*
   - umap-learn=0.5.* # to visualize node embeddings in 2D (UMAP dimensionality reduction)
   - shap=0.48.*
   - pip:
-      - neo4j==5.23.*
+      - neo4j==5.28.*
diff --git a/cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher b/cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher
@@ -3,11 +3,13 @@
 // Determine global file count, global file count threshold (filter out refactoring commits) and global update commits
 MATCH (git_commit_global:Git:Commit)-[:CONTAINS_CHANGE]->(:Git:Change)-[:UPDATES]->(git_file_global:Git:File)
 WHERE git_file_global.deletedAt IS NULL
+  AND git_commit_global.isManualCommit
  WITH git_commit_global, count(DISTINCT git_file_global) AS commitFileCount
  WITH percentileDisc(commitFileCount, 0.95) AS globalFileCountThreshold
      ,count(git_commit_global)              AS globalUpdateCommitCount
 // Main section
 MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change)-[:UPDATES]->(git_file:Git:File)
+WHERE git_commit.isManualCommit
 MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file)
 WHERE git_file.deletedAt IS NULL
 // Order files to assure, that pairs of distinct files are grouped together (fileA, fileB) without (fileB, fileA)
diff --git a/cypher/GitLog/List_pairwise_changed_files.cypher b/cypher/GitLog/List_pairwise_changed_files.cypher
@@ -1,7 +1,8 @@
 // List pairs of files that were changed together. Requires Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher to run first.
 
 MATCH (firstFile:Git:File)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile:Git:File)
-WHERE elementId(firstFile) < elementId(secondFile)
+WHERE firstFile.extension < secondFile.extension
+   OR (firstFile.extension = secondFile.extension AND elementId(firstFile) < elementId(secondFile))
  WITH *
      ,coalesce(firstFile.relativePath, firstFile.fileName)    AS firstFileName
      ,coalesce(secondFile.relativePath, secondFile.fileName)  AS secondFileName
diff --git a/cypher/GitLog/Set_commit_classification_properties.cypher b/cypher/GitLog/Set_commit_classification_properties.cypher
@@ -0,0 +1,25 @@
+// Classify git commits and set the boolean properties isMergeCommit, isBotAuthor, isMavenCommit, isAutomatedCommit (= isBotAuthor or isMavenCommit) and isManualCommit (= not isAutomatedCommit).
+// A missing (null) author or message counts as "not automated". Otherwise the null would propagate through OR/NOT and leave isManualCommit unset.
+MATCH (git_commit:Git:Commit)
+WITH git_commit,
+     COUNT { (git_commit)-[:HAS_PARENT]->(:Git:Commit) }      AS parentCount
+WITH git_commit,
+     parentCount >= 2                                         AS isMergeCommit,
+     coalesce(git_commit.author    CONTAINS '[bot]', false)   AS isBotAuthor,
+     coalesce(git_commit.message STARTS WITH '[maven', false) AS isMavenCommit
+WITH git_commit,
+     isMergeCommit,
+     isBotAuthor,
+     isMavenCommit,
+     (isBotAuthor OR isMavenCommit)                           AS isAutomatedCommit
+SET git_commit.isMergeCommit     = isMergeCommit,
+    git_commit.isBotAuthor       = isBotAuthor,
+    git_commit.isMavenCommit     = isMavenCommit,
+    git_commit.isAutomatedCommit = isAutomatedCommit,
+    git_commit.isManualCommit    = NOT isAutomatedCommit
+RETURN count(git_commit) AS classifiedCommits
+// For Debugging:
+//      ,isMergeCommit
+//      ,isBotAuthor
+//      ,isMavenCommit
+//      ,isAutomatedCommit
diff --git a/jupyter/GitHistoryGeneral.ipynb b/jupyter/GitHistoryGeneral.ipynb
@@ -1426,19 +1426,22 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "8f874da0",
+   "id": "0da821b1",
    "metadata": {},
    "outputs": [],
    "source": [
-    "def find_top_pairwise_changed_file_extensions(input_data: pd.DataFrame, top_n: int = 10) -> pd.Series:\n",
+    "def find_top_pairwise_changed_file_extensions(input_data: pd.DataFrame, top_n: int = 10) -> pd.DataFrame:\n",
     "    \"\"\"\n",
     "    Finds the top N pairwise changed file extensions based on commit count.\n",
     "    input_data : pd.DataFrame : DataFrame containing pairwise changed files with their pair counts and extensions\n",
     "    top_n : int : The number of top extensions to return\n",
-    "    return : pd.Series : Series with top N file extension pairs sorted by their pair count descending\n",
+    "    return : pd.DataFrame : DataFrame with the top N file extension pairs (fileExtensionPair) and their pair count (fileExtensionPairCount) sorted by pair count descending\n",
     "    \"\"\"\n",
-    "    top_extensions = input_data.groupby('fileExtensionPair').aggregate({'filePairWithRelativePath': 'count'}).reset_index()\n",
-    "    return top_extensions.sort_values(by='filePairWithRelativePath', ascending=False).reset_index(drop=True).head(top_n)['fileExtensionPair']"
+    "    top_extensions = input_data.groupby('fileExtensionPair', observed=False).aggregate(\n",
+    "        fileExtensionPairCount=pd.NamedAgg(column=\"filePairWithRelativePath\", aggfunc=\"count\")\n",
+    "    ).reset_index()\n",
+    "    \n",
+    "    return top_extensions.sort_values(by='fileExtensionPairCount', ascending=False).reset_index(drop=True).head(top_n)"
    ]
   },
   {
@@ -1449,7 +1452,11 @@
    "outputs": [],
    "source": [
     "top_pairwise_changed_file_extensions = find_top_pairwise_changed_file_extensions(pairwise_changed_git_files, top_n=4)\n",
-    "# Only keep the pairwise change files with the top file extensions\n",
+    "display(top_pairwise_changed_file_extensions)\n",
+    "\n",
+    "pairwise_changed_git_files = pairwise_changed_git_files.merge(top_pairwise_changed_file_extensions, on='fileExtensionPair')\n",
+    "\n",
+    "top_pairwise_changed_file_extensions = top_pairwise_changed_file_extensions['fileExtensionPair']\n",
     "pairwise_changed_git_files = pairwise_changed_git_files[pairwise_changed_git_files['fileExtensionPair'].isin(top_pairwise_changed_file_extensions)]"
    ]
   },
@@ -1471,7 +1478,7 @@
     "        return data_frame # Column already exists\n",
     "    \n",
     "    # Create a new rank column based on the specified column and group by the group column\n",
-    "    data_frame[f\"{column_name}ExtensionRank\"] = data_frame.groupby('fileExtensionPair')[column_name].rank(ascending=False, method='dense').astype(int)\n",
+    "    data_frame[f\"{column_name}ExtensionRank\"] = data_frame.groupby('fileExtensionPair', observed=False)[column_name].rank(ascending=False, method='dense').astype(int)\n",
     "    return data_frame"
    ]
   },
@@ -1511,58 +1518,18 @@
     "    # Group by the file extensions and the metric and its rank.\n",
     "    # Since some entries might have the same metric value, we aggregate by the first file pair with relative path and the first file pair.\n",
     "    # This way we can pick the top n entries for each file extension pair.\n",
-    "    grouping_columns = [\"fileExtensionPair\", metric_column,  metric_column + \"ExtensionRank\"]\n",
+    "    grouping_columns = [\"fileExtensionPairCount\", \"fileExtensionPair\", metric_column,  metric_column + \"ExtensionRank\"]\n",
     "    grouped_data = filtered_data.groupby(grouping_columns).aggregate(\n",
     "        filePair=pd.NamedAgg(column=\"filePair\", aggfunc=\"first\"),\n",
     "        filePairWithRelativePath=pd.NamedAgg(column=\"filePairWithRelativePath\", aggfunc=\"first\"),\n",
     "    ).reset_index()\n",
     "    \n",
-    "    return grouped_data.sort_values(by=grouping_columns, ascending=[True, False, True]).reset_index(drop=True).rename(columns={metric_column + \"ExtensionRank\": \"GroupRank\"})"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3c34ceea",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# TODO delete if not needed anymore\n",
-    "\n",
-    "def display_table_for_top_pairwise_changed_file_extensions_deprecated(\n",
-    "        data_to_display: pd.DataFrame, \n",
-    "        top_pairwise_changed_file_extensions: pd.Series,\n",
-    "        sort_column: str,\n",
-    "        top_n: int = 10\n",
-    "    ):\n",
-    "    \"\"\"\n",
-    "    Displays a table for each top pairwise changed file extension.\n",
-    "    data_to_plot : pd.DataFrame : DataFrame containing pairwise changed files with their commit counts\n",
-    "    top_pairwise_changed_file_extensions : pd.Series : Series with top N file extension pairs sorted by their pair count descending\n",
-    "    sort_column : str : The column to sort the data by (default is \"pairwiseChangeCommitCount\")\n",
-    "    top_n : int : The number of top entries to display for each extension (default is 10)\n",
-    "    \"\"\"\n",
-    "    \n",
-    "    if data_to_display.empty:\n",
-    "        print(\"No data to display\")\n",
-    "        return\n",
-    "    \n",
-    "    if top_pairwise_changed_file_extensions.empty:\n",
-    "        print(\"No top pairwise changed file extensions to display\")\n",
-    "        return\n",
-    "\n",
-    "    # Display each top pairwise changed file extension with its corresponding data\n",
-    "    selected_columns = [\"fileExtensionPair\", \"filePair\", sort_column, \"filePairWithRelativePath\"]\n",
-    "    data_to_display = data_to_display[selected_columns]\n",
-    "    \n",
-    "    combined_data_for_top_extensions = pd.DataFrame().reindex_like(data_to_display.head(0))  # Create an empty DataFrame with the same columns as data_to_display\n",
-    "    \n",
-    "    for _, extension in enumerate(top_pairwise_changed_file_extensions, start=1):\n",
-    "        filtered_data = data_to_display[data_to_display[\"fileExtensionPair\"] == extension]\n",
-    "        sorted_data = filtered_data.sort_values(by=sort_column, ascending=False).head(top_n).reset_index()\n",
-    "        combined_data_for_top_extensions = pd.concat([combined_data_for_top_extensions, sorted_data], ignore_index=True)\n",
-    "    \n",
-    "    display(combined_data_for_top_extensions)"
+    "    return (grouped_data\n",
+    "            .sort_values(by=grouping_columns, ascending=[False, True, False, True])\n",
+    "            .reset_index(drop=True)\n",
+    "            .rename(columns={metric_column + \"ExtensionRank\": \"GroupRank\"})\n",
+    "            .drop(columns=['fileExtensionPairCount'])\n",
+    "        )"
    ]
   },
   {
@@ -1598,7 +1565,7 @@
     "        rows=sub_plot_rows, \n",
     "        cols=sub_plot_columns, \n",
     "        subplot_titles=top_pairwise_changed_file_extensions,\n",
-    "        vertical_spacing=0.04,   \n",
+    "        vertical_spacing=0.06,   \n",
     "        horizontal_spacing=0.04\n",
     "    )\n",
     "\n",
diff --git a/requirements.txt b/requirements.txt
@@ -17,15 +17,15 @@ typing-extensions==4.12.* # Needed for opentsne and Python >= 3.12
 wordcloud==1.9.*
 monotonic==1.*
 plotly[kaleido]==6.2.*
-seaborn==0.13 # To visualize clustering results
+seaborn==0.13.* # To visualize clustering results
 
 # --- Machine Learning / Optimization ---
 scikit-learn==1.6.*
-optuna==4.3.*
+optuna==4.5.*
 umap-learn==0.5.* # Dimensionality reduction to visualize node embeddings in 2D
 
 # --- Database connector ---
-neo4j==5.23.*
+neo4j==5.28.*
 
 # --- Native/scientific packages (may require compilation) ---
 # These are included but may cause install errors in pip/venv
diff --git a/scripts/importGit.sh b/scripts/importGit.sh
@@ -141,7 +141,10 @@ commonPostGitImport() {
 postGitLogImport() {
   echo "importGit: Add numberOfGitCommits property to nodes with matching file names..."
   execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_number_of_git_log_commits.cypher"
-
+  
+  echo "importGit: Classify git commits (e.g. isMergeCommit, isAutomatedCommit)..."
+  execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_commit_classification_properties.cypher"
+  
   commonPostGitImport
 }
 
@@ -157,6 +160,9 @@ postGitPluginImport() {
   execute_cypher "${GIT_LOG_CYPHER_DIR}/Index_file_relative_path.cypher"
   execute_cypher "${GIT_LOG_CYPHER_DIR}/Index_absolute_file_name.cypher"
 
+  echo "importGit: Classify git commits (e.g. isMergeCommit, isAutomatedCommit)..."
+  execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_commit_classification_properties.cypher"
+
   echo "importGit: Add numberOfGitCommits property to nodes with matching file names..."
   execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_number_of_git_plugin_commits.cypher"
   echo "importGit: Add updateCommitCount property to file nodes and code nodes with matching file names..."