Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions conda-environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ dependencies:
- plotly=6.0.*
- python-kaleido=0.2.* # To render plotly plots. Static image export for web-based visualization libraries.
- scikit-learn=1.6.* # To try out this HDBSCAN implementation
- seaborn=0.13 # To visualize clustering results
- optuna=4.3.*
- seaborn=0.13.* # To visualize clustering results
- optuna=4.5.*
- umap-learn=0.5.* # to visualize node embeddings in 2D (UMAP dimensionality reduction)
- shap=0.48.*
- pip:
- neo4j==5.23.*
- neo4j==5.28.*
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@
// Determine global file count, global file count threshold (filter out refactoring commits) and global update commits
MATCH (git_commit_global:Git:Commit)-[:CONTAINS_CHANGE]->(:Git:Change)-[:UPDATES]->(git_file_global:Git:File)
WHERE git_file_global.deletedAt IS NULL
AND git_commit_global.isManualCommit
WITH git_commit_global, count(DISTINCT git_file_global) AS commitFileCount
WITH percentileDisc(commitFileCount, 0.95) AS globalFileCountThreshold
,count(git_commit_global) AS globalUpdateCommitCount
// Main section
MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change)-[:UPDATES]->(git_file:Git:File)
WHERE git_commit.isManualCommit
MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file)
WHERE git_file.deletedAt IS NULL
// Order files to ensure that pairs of distinct files are grouped together as (fileA, fileB) without the mirrored (fileB, fileA)
Expand Down
3 changes: 2 additions & 1 deletion cypher/GitLog/List_pairwise_changed_files.cypher
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
// List pairs of files that were changed together. Requires Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher to run first.

MATCH (firstFile:Git:File)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile:Git:File)
WHERE elementId(firstFile) < elementId(secondFile)
WHERE firstFile.extension < secondFile.extension
OR (firstFile.extension = secondFile.extension AND elementId(firstFile) < elementId(secondFile))
WITH *
,coalesce(firstFile.relativePath, firstFile.fileName) AS firstFileName
,coalesce(secondFile.relativePath, secondFile.fileName) AS secondFileName
Expand Down
25 changes: 25 additions & 0 deletions cypher/GitLog/Set_commit_classification_properties.cypher
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// Classify git commits and set the boolean properties isMergeCommit, isBotAuthor, isMavenCommit,
// isAutomatedCommit (= isBotAuthor OR isMavenCommit) and isManualCommit (= NOT isAutomatedCommit).
//
// Null handling: CONTAINS and STARTS WITH evaluate to null when the inspected property is missing,
// and setting a property to null removes it. The coalesce(..., false) fallbacks make sure that every
// commit gets all classification properties. Otherwise a commit without author or message could end up
// without isManualCommit and would be silently dropped by filters like "WHERE git_commit.isManualCommit".

MATCH (git_commit:Git:Commit)
 WITH git_commit,
      // A commit with two or more parents is the result of a merge.
      COUNT { (git_commit)-[:HAS_PARENT]->(:Git:Commit) } AS parentCount
 WITH git_commit,
      parentCount >= 2                                           AS isMergeCommit,
      coalesce(git_commit.author CONTAINS '[bot]', false)        AS isBotAuthor,
      coalesce(git_commit.message STARTS WITH '[maven', false)   AS isMavenCommit
 WITH git_commit,
      isMergeCommit,
      isBotAuthor,
      isMavenCommit,
      (isBotAuthor OR isMavenCommit) AS isAutomatedCommit
  SET git_commit.isMergeCommit     = isMergeCommit,
      git_commit.isBotAuthor       = isBotAuthor,
      git_commit.isMavenCommit     = isMavenCommit,
      git_commit.isAutomatedCommit = isAutomatedCommit,
      git_commit.isManualCommit    = NOT isAutomatedCommit
RETURN count(git_commit) AS classifiedCommits
// For Debugging:
// ,isMergeCommit
// ,isBotAuthor
// ,isMavenCommit
// ,isAutomatedCommit
75 changes: 21 additions & 54 deletions jupyter/GitHistoryGeneral.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1426,19 +1426,22 @@
{
"cell_type": "code",
"execution_count": null,
"id": "8f874da0",
"id": "0da821b1",
"metadata": {},
"outputs": [],
"source": [
"def find_top_pairwise_changed_file_extensions(input_data: pd.DataFrame, top_n: int = 10) -> pd.Series:\n",
"def find_top_pairwise_changed_file_extensions(input_data: pd.DataFrame, top_n: int = 10) -> pd.DataFrame:\n",
" \"\"\"\n",
" Finds the top N pairwise changed file extensions based on commit count.\n",
" input_data : pd.DataFrame : DataFrame containing pairwise changed files with their pair counts and extensions\n",
" top_n : int : The number of top extensions to return\n",
" return : pd.Series : Series with top N file extension pairs sorted by their pair count descending\n",
" \"\"\"\n",
" top_extensions = input_data.groupby('fileExtensionPair').aggregate({'filePairWithRelativePath': 'count'}).reset_index()\n",
" return top_extensions.sort_values(by='filePairWithRelativePath', ascending=False).reset_index(drop=True).head(top_n)['fileExtensionPair']"
" top_extensions = input_data.groupby('fileExtensionPair', observed=False).aggregate(\n",
" fileExtensionPairCount=pd.NamedAgg(column=\"filePairWithRelativePath\", aggfunc=\"count\")\n",
" ).reset_index()\n",
" \n",
" return top_extensions.sort_values(by='fileExtensionPairCount', ascending=False).reset_index(drop=True).head(top_n)"
]
},
{
Expand All @@ -1449,7 +1452,11 @@
"outputs": [],
"source": [
"top_pairwise_changed_file_extensions = find_top_pairwise_changed_file_extensions(pairwise_changed_git_files, top_n=4)\n",
"# Only keep the pairwise change files with the top file extensions\n",
"display(top_pairwise_changed_file_extensions)\n",
"\n",
"pairwise_changed_git_files = pairwise_changed_git_files.merge(top_pairwise_changed_file_extensions, on='fileExtensionPair')\n",
"\n",
"top_pairwise_changed_file_extensions = top_pairwise_changed_file_extensions['fileExtensionPair']\n",
"pairwise_changed_git_files = pairwise_changed_git_files[pairwise_changed_git_files['fileExtensionPair'].isin(top_pairwise_changed_file_extensions)]"
]
},
Expand All @@ -1471,7 +1478,7 @@
" return data_frame # Column already exists\n",
" \n",
" # Create a new rank column based on the specified column and group by the group column\n",
" data_frame[f\"{column_name}ExtensionRank\"] = data_frame.groupby('fileExtensionPair')[column_name].rank(ascending=False, method='dense').astype(int)\n",
" data_frame[f\"{column_name}ExtensionRank\"] = data_frame.groupby('fileExtensionPair', observed=False)[column_name].rank(ascending=False, method='dense').astype(int)\n",
" return data_frame"
]
},
Expand Down Expand Up @@ -1511,58 +1518,18 @@
" # Group by the file extensions and the metric and its rank.\n",
" # Since some entries might have the same metric value, we aggregate by the first file pair with relative path and the first file pair.\n",
" # This way we can pick the top n entries for each file extension pair.\n",
" grouping_columns = [\"fileExtensionPair\", metric_column, metric_column + \"ExtensionRank\"]\n",
" grouping_columns = [\"fileExtensionPairCount\", \"fileExtensionPair\", metric_column, metric_column + \"ExtensionRank\"]\n",
" grouped_data = filtered_data.groupby(grouping_columns).aggregate(\n",
" filePair=pd.NamedAgg(column=\"filePair\", aggfunc=\"first\"),\n",
" filePairWithRelativePath=pd.NamedAgg(column=\"filePairWithRelativePath\", aggfunc=\"first\"),\n",
" ).reset_index()\n",
" \n",
" return grouped_data.sort_values(by=grouping_columns, ascending=[True, False, True]).reset_index(drop=True).rename(columns={metric_column + \"ExtensionRank\": \"GroupRank\"})"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3c34ceea",
"metadata": {},
"outputs": [],
"source": [
"# TODO delete if not needed anymore\n",
"\n",
"def display_table_for_top_pairwise_changed_file_extensions_deprecated(\n",
" data_to_display: pd.DataFrame, \n",
" top_pairwise_changed_file_extensions: pd.Series,\n",
" sort_column: str,\n",
" top_n: int = 10\n",
" ):\n",
" \"\"\"\n",
" Displays a table for each top pairwise changed file extension.\n",
" data_to_plot : pd.DataFrame : DataFrame containing pairwise changed files with their commit counts\n",
" top_pairwise_changed_file_extensions : pd.Series : Series with top N file extension pairs sorted by their pair count descending\n",
" sort_column : str : The column to sort the data by (default is \"pairwiseChangeCommitCount\")\n",
" top_n : int : The number of top entries to display for each extension (default is 10)\n",
" \"\"\"\n",
" \n",
" if data_to_display.empty:\n",
" print(\"No data to display\")\n",
" return\n",
" \n",
" if top_pairwise_changed_file_extensions.empty:\n",
" print(\"No top pairwise changed file extensions to display\")\n",
" return\n",
"\n",
" # Display each top pairwise changed file extension with its corresponding data\n",
" selected_columns = [\"fileExtensionPair\", \"filePair\", sort_column, \"filePairWithRelativePath\"]\n",
" data_to_display = data_to_display[selected_columns]\n",
" \n",
" combined_data_for_top_extensions = pd.DataFrame().reindex_like(data_to_display.head(0)) # Create an empty DataFrame with the same columns as data_to_display\n",
" \n",
" for _, extension in enumerate(top_pairwise_changed_file_extensions, start=1):\n",
" filtered_data = data_to_display[data_to_display[\"fileExtensionPair\"] == extension]\n",
" sorted_data = filtered_data.sort_values(by=sort_column, ascending=False).head(top_n).reset_index()\n",
" combined_data_for_top_extensions = pd.concat([combined_data_for_top_extensions, sorted_data], ignore_index=True)\n",
" \n",
" display(combined_data_for_top_extensions)"
" return (grouped_data\n",
" .sort_values(by=grouping_columns, ascending=[False, True, False, True])\n",
" .reset_index(drop=True)\n",
" .rename(columns={metric_column + \"ExtensionRank\": \"GroupRank\"})\n",
" .drop(columns=['fileExtensionPairCount'])\n",
" )"
]
},
{
Expand Down Expand Up @@ -1598,7 +1565,7 @@
" rows=sub_plot_rows, \n",
" cols=sub_plot_columns, \n",
" subplot_titles=top_pairwise_changed_file_extensions,\n",
" vertical_spacing=0.04, \n",
" vertical_spacing=0.06, \n",
" horizontal_spacing=0.04\n",
" )\n",
"\n",
Expand Down
6 changes: 3 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,15 @@ typing-extensions==4.12.* # Needed for opentsne and Python >= 3.12
wordcloud==1.9.*
monotonic==1.*
plotly[kaleido]==6.2.*
seaborn==0.13 # To visualize clustering results
seaborn==0.13.* # To visualize clustering results

# --- Machine Learning / Optimization ---
scikit-learn==1.6.*
optuna==4.3.*
optuna==4.5.*
umap-learn==0.5.* # Dimensionality reduction to visualize node embeddings in 2D

# --- Database connector ---
neo4j==5.23.*
neo4j==5.28.*

# --- Native/scientific packages (may require compilation) ---
# These are included but may cause install errors in pip/venv
Expand Down
8 changes: 7 additions & 1 deletion scripts/importGit.sh
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,10 @@ commonPostGitImport() {
postGitLogImport() {
echo "importGit: Add numberOfGitCommits property to nodes with matching file names..."
execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_number_of_git_log_commits.cypher"


echo "importGit: Classify git commits (e.g. isMergeCommit, isAutomatedCommit)..."
execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_commit_classification_properties.cypher"

commonPostGitImport
}

Expand All @@ -157,6 +160,9 @@ postGitPluginImport() {
execute_cypher "${GIT_LOG_CYPHER_DIR}/Index_file_relative_path.cypher"
execute_cypher "${GIT_LOG_CYPHER_DIR}/Index_absolute_file_name.cypher"

echo "importGit: Classify git commits (e.g. isMergeCommit, isAutomatedCommit)..."
execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_commit_classification_properties.cypher"

echo "importGit: Add numberOfGitCommits property to nodes with matching file names..."
execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_number_of_git_plugin_commits.cypher"
echo "importGit: Add updateCommitCount property to file nodes and code nodes with matching file names..."
Expand Down
Loading