From 98d9ce41a761ca1c4bc86db70942ed9cab601ff4 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Sun, 24 Aug 2025 13:07:19 +0200 Subject: [PATCH 1/3] Classify git commits --- ...et_commit_classification_properties.cypher | 25 +++++++++++++++++++ scripts/importGit.sh | 8 +++++- 2 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 cypher/GitLog/Set_commit_classification_properties.cypher diff --git a/cypher/GitLog/Set_commit_classification_properties.cypher b/cypher/GitLog/Set_commit_classification_properties.cypher new file mode 100644 index 000000000..988eac727 --- /dev/null +++ b/cypher/GitLog/Set_commit_classification_properties.cypher @@ -0,0 +1,25 @@ +// Classify git commits and set properties like isMergeCommit, isAutomatedCommit (= isBotAuthor OR isMavenCommit). + +MATCH (git_commit:Git:Commit) +WITH git_commit, + COUNT { (git_commit)-[:HAS_PARENT]->(:Git:Commit) } AS parentCount +WITH git_commit, + parentCount >= 2 AS isMergeCommit, + git_commit.author CONTAINS '[bot]' AS isBotAuthor, + git_commit.message STARTS WITH '[maven' AS isMavenCommit +WITH git_commit, + isMergeCommit, + isBotAuthor, + isMavenCommit, + (isBotAuthor OR isMavenCommit) AS isAutomatedCommit +SET git_commit.isMergeCommit = isMergeCommit, + git_commit.isBotAuthor = isBotAuthor, + git_commit.isMavenCommit = isMavenCommit, + git_commit.isAutomatedCommit = isAutomatedCommit, + git_commit.isManualCommit = NOT isAutomatedCommit +RETURN count(git_commit) AS classifiedCommits +// For Debugging: +// ,isMergeCommit +// ,isBotAuthor +// ,isMavenCommit +// ,isAutomatedCommit \ No newline at end of file diff --git a/scripts/importGit.sh b/scripts/importGit.sh index 708a5b496..812b14e5a 100755 --- a/scripts/importGit.sh +++ b/scripts/importGit.sh @@ -141,7 +141,10 @@ commonPostGitImport() { postGitLogImport() { echo "importGit: Add numberOfGitCommits property to nodes with matching file names..." 
execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_number_of_git_log_commits.cypher" - + + echo "importGit: Classify git commits (e.g. isMergeCommit, isAutomatedCommit)..." + execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_commit_classification_properties.cypher" + commonPostGitImport } @@ -157,6 +160,9 @@ postGitPluginImport() { execute_cypher "${GIT_LOG_CYPHER_DIR}/Index_file_relative_path.cypher" execute_cypher "${GIT_LOG_CYPHER_DIR}/Index_absolute_file_name.cypher" + echo "importGit: Classify git commits (e.g. isMergeCommit, isAutomatedCommit)..." + execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_commit_classification_properties.cypher" + echo "importGit: Add numberOfGitCommits property to nodes with matching file names..." execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_number_of_git_plugin_commits.cypher" echo "importGit: Add updateCommitCount property to file nodes and code nodes with matching file names..." From d371a6d0d98f69caaefd68d90ef8f8411d0379de Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Sun, 24 Aug 2025 13:07:53 +0200 Subject: [PATCH 2/3] Use only manual commits for co-change analysis --- ...HER_WITH_relationships_to_git_files.cypher | 2 + .../GitLog/List_pairwise_changed_files.cypher | 3 +- jupyter/GitHistoryGeneral.ipynb | 75 ++++++------------- 3 files changed, 25 insertions(+), 55 deletions(-) diff --git a/cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher b/cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher index 57f1a5ac4..79cad2774 100644 --- a/cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher +++ b/cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher @@ -3,11 +3,13 @@ // Determine global file count, global file count threshold (filter out refactoring commits) and global update commits MATCH (git_commit_global:Git:Commit)-[:CONTAINS_CHANGE]->(:Git:Change)-[:UPDATES]->(git_file_global:Git:File) WHERE git_file_global.deletedAt IS NULL + AND 
git_commit_global.isManualCommit WITH git_commit_global, count(DISTINCT git_file_global) AS commitFileCount WITH percentileDisc(commitFileCount, 0.95) AS globalFileCountThreshold ,count(git_commit_global) AS globalUpdateCommitCount // Main section MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change)-[:UPDATES]->(git_file:Git:File) +WHERE git_commit.isManualCommit MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file) WHERE git_file.deletedAt IS NULL // Order files to assure, that pairs of distinct files are grouped together (fileA, fileB) without (fileB, fileA) diff --git a/cypher/GitLog/List_pairwise_changed_files.cypher b/cypher/GitLog/List_pairwise_changed_files.cypher index 048c07b7b..43737358d 100644 --- a/cypher/GitLog/List_pairwise_changed_files.cypher +++ b/cypher/GitLog/List_pairwise_changed_files.cypher @@ -1,7 +1,8 @@ // List pairs of files that were changed together. Requires Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher to run first. 
MATCH (firstFile:Git:File)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile:Git:File) -WHERE elementId(firstFile) < elementId(secondFile) +WHERE firstFile.extension < secondFile.extension + OR (firstFile.extension = secondFile.extension AND elementId(firstFile) < elementId(secondFile)) WITH * ,coalesce(firstFile.relativePath, firstFile.fileName) AS firstFileName ,coalesce(secondFile.relativePath, secondFile.fileName) AS secondFileName diff --git a/jupyter/GitHistoryGeneral.ipynb b/jupyter/GitHistoryGeneral.ipynb index 117c03746..f4b7fd451 100644 --- a/jupyter/GitHistoryGeneral.ipynb +++ b/jupyter/GitHistoryGeneral.ipynb @@ -1426,19 +1426,22 @@ { "cell_type": "code", "execution_count": null, - "id": "8f874da0", + "id": "0da821b1", "metadata": {}, "outputs": [], "source": [ - "def find_top_pairwise_changed_file_extensions(input_data: pd.DataFrame, top_n: int = 10) -> pd.Series:\n", + "def find_top_pairwise_changed_file_extensions(input_data: pd.DataFrame, top_n: int = 10) -> pd.DataFrame:\n", " \"\"\"\n", " Finds the top N pairwise changed file extensions based on commit count.\n", " input_data : pd.DataFrame : DataFrame containing pairwise changed files with their pair counts and extensions\n", " top_n : int : The number of top extensions to return\n", " return : pd.Series : Series with top N file extension pairs sorted by their pair count descending\n", " \"\"\"\n", - " top_extensions = input_data.groupby('fileExtensionPair').aggregate({'filePairWithRelativePath': 'count'}).reset_index()\n", - " return top_extensions.sort_values(by='filePairWithRelativePath', ascending=False).reset_index(drop=True).head(top_n)['fileExtensionPair']" + " top_extensions = input_data.groupby('fileExtensionPair', observed=False).aggregate(\n", + " fileExtensionPairCount=pd.NamedAgg(column=\"filePairWithRelativePath\", aggfunc=\"count\")\n", + " ).reset_index()\n", + " \n", + " return top_extensions.sort_values(by='fileExtensionPairCount', 
ascending=False).reset_index(drop=True).head(top_n)" ] }, { @@ -1449,7 +1452,11 @@ "outputs": [], "source": [ "top_pairwise_changed_file_extensions = find_top_pairwise_changed_file_extensions(pairwise_changed_git_files, top_n=4)\n", - "# Only keep the pairwise change files with the top file extensions\n", + "display(top_pairwise_changed_file_extensions)\n", + "\n", + "pairwise_changed_git_files = pairwise_changed_git_files.merge(top_pairwise_changed_file_extensions, on='fileExtensionPair')\n", + "\n", + "top_pairwise_changed_file_extensions = top_pairwise_changed_file_extensions['fileExtensionPair']\n", "pairwise_changed_git_files = pairwise_changed_git_files[pairwise_changed_git_files['fileExtensionPair'].isin(top_pairwise_changed_file_extensions)]" ] }, @@ -1471,7 +1478,7 @@ " return data_frame # Column already exists\n", " \n", " # Create a new rank column based on the specified column and group by the group column\n", - " data_frame[f\"{column_name}ExtensionRank\"] = data_frame.groupby('fileExtensionPair')[column_name].rank(ascending=False, method='dense').astype(int)\n", + " data_frame[f\"{column_name}ExtensionRank\"] = data_frame.groupby('fileExtensionPair', observed=False)[column_name].rank(ascending=False, method='dense').astype(int)\n", " return data_frame" ] }, @@ -1511,58 +1518,18 @@ " # Group by the file extensions and the metric and its rank.\n", " # Since some entries might have the same metric value, we aggregate by the first file pair with relative path and the first file pair.\n", " # This way we can pick the top n entries for each file extension pair.\n", - " grouping_columns = [\"fileExtensionPair\", metric_column, metric_column + \"ExtensionRank\"]\n", + " grouping_columns = [\"fileExtensionPairCount\", \"fileExtensionPair\", metric_column, metric_column + \"ExtensionRank\"]\n", " grouped_data = filtered_data.groupby(grouping_columns).aggregate(\n", " filePair=pd.NamedAgg(column=\"filePair\", aggfunc=\"first\"),\n", " 
filePairWithRelativePath=pd.NamedAgg(column=\"filePairWithRelativePath\", aggfunc=\"first\"),\n", " ).reset_index()\n", " \n", - " return grouped_data.sort_values(by=grouping_columns, ascending=[True, False, True]).reset_index(drop=True).rename(columns={metric_column + \"ExtensionRank\": \"GroupRank\"})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3c34ceea", - "metadata": {}, - "outputs": [], - "source": [ - "# TODO delete if not needed anymore\n", - "\n", - "def display_table_for_top_pairwise_changed_file_extensions_deprecated(\n", - " data_to_display: pd.DataFrame, \n", - " top_pairwise_changed_file_extensions: pd.Series,\n", - " sort_column: str,\n", - " top_n: int = 10\n", - " ):\n", - " \"\"\"\n", - " Displays a table for each top pairwise changed file extension.\n", - " data_to_plot : pd.DataFrame : DataFrame containing pairwise changed files with their commit counts\n", - " top_pairwise_changed_file_extensions : pd.Series : Series with top N file extension pairs sorted by their pair count descending\n", - " sort_column : str : The column to sort the data by (default is \"pairwiseChangeCommitCount\")\n", - " top_n : int : The number of top entries to display for each extension (default is 10)\n", - " \"\"\"\n", - " \n", - " if data_to_display.empty:\n", - " print(\"No data to display\")\n", - " return\n", - " \n", - " if top_pairwise_changed_file_extensions.empty:\n", - " print(\"No top pairwise changed file extensions to display\")\n", - " return\n", - "\n", - " # Display each top pairwise changed file extension with its corresponding data\n", - " selected_columns = [\"fileExtensionPair\", \"filePair\", sort_column, \"filePairWithRelativePath\"]\n", - " data_to_display = data_to_display[selected_columns]\n", - " \n", - " combined_data_for_top_extensions = pd.DataFrame().reindex_like(data_to_display.head(0)) # Create an empty DataFrame with the same columns as data_to_display\n", - " \n", - " for _, extension in 
enumerate(top_pairwise_changed_file_extensions, start=1):\n", - " filtered_data = data_to_display[data_to_display[\"fileExtensionPair\"] == extension]\n", - " sorted_data = filtered_data.sort_values(by=sort_column, ascending=False).head(top_n).reset_index()\n", - " combined_data_for_top_extensions = pd.concat([combined_data_for_top_extensions, sorted_data], ignore_index=True)\n", - " \n", - " display(combined_data_for_top_extensions)" + " return (grouped_data\n", + " .sort_values(by=grouping_columns, ascending=[False, True, False, True])\n", + " .reset_index(drop=True)\n", + " .rename(columns={metric_column + \"ExtensionRank\": \"GroupRank\"})\n", + " .drop(columns=['fileExtensionPairCount'])\n", + " )" ] }, { @@ -1598,7 +1565,7 @@ " rows=sub_plot_rows, \n", " cols=sub_plot_columns, \n", " subplot_titles=top_pairwise_changed_file_extensions,\n", - " vertical_spacing=0.04, \n", + " vertical_spacing=0.06, \n", " horizontal_spacing=0.04\n", " )\n", "\n", From bc72c914bfad29bfc203e65ddfd81fd6b7c84b70 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Tue, 26 Aug 2025 07:52:18 +0200 Subject: [PATCH 3/3] Update python dependencies seaborn, optuna, neo4j --- conda-environment.yml | 6 +++--- requirements.txt | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/conda-environment.yml b/conda-environment.yml index bc0dcea86..e3c9cfaf2 100644 --- a/conda-environment.yml +++ b/conda-environment.yml @@ -22,9 +22,9 @@ dependencies: - plotly=6.0.* - python-kaleido=0.2.* # To render plotly plots. Static image export for web-based visualization libraries. 
- scikit-learn=1.6.* # To try out this HDBSCAN implementation - - seaborn=0.13 # To visualize clustering results - - optuna=4.3.* + - seaborn=0.13.* # To visualize clustering results + - optuna=4.5.* - umap-learn=0.5.* # to visualize node embeddings in 2D (UMAP dimensionality reduction) - shap=0.48.* - pip: - - neo4j==5.23.* \ No newline at end of file + - neo4j==5.28.* \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 6d912e39a..5e12b5ce7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,15 +17,15 @@ typing-extensions==4.12.* # Needed for opentsne and Python >= 3.12 wordcloud==1.9.* monotonic==1.* plotly[kaleido]==6.2.* -seaborn==0.13 # To visualize clustering results +seaborn==0.13.* # To visualize clustering results # --- Machine Learning / Optimization --- scikit-learn==1.6.* -optuna==4.3.* +optuna==4.5.* umap-learn==0.5.* # Dimensionality reduction to visualize node embeddings in 2D # --- Database connector --- -neo4j==5.23.* +neo4j==5.28.* # --- Native/scientific packages (may require compilation) --- # These are included but may cause install errors in pip/venv