|
1426 | 1426 | { |
1427 | 1427 | "cell_type": "code", |
1428 | 1428 | "execution_count": null, |
1429 | | - "id": "8f874da0", |
| 1429 | + "id": "0da821b1", |
1430 | 1430 | "metadata": {}, |
1431 | 1431 | "outputs": [], |
1432 | 1432 | "source": [ |
1433 | | - "def find_top_pairwise_changed_file_extensions(input_data: pd.DataFrame, top_n: int = 10) -> pd.Series:\n", |
| 1433 | + "def find_top_pairwise_changed_file_extensions(input_data: pd.DataFrame, top_n: int = 10) -> pd.DataFrame:\n", |
1434 | 1434 | " \"\"\"\n", |
1435 | 1435 | " Finds the top N pairwise changed file extensions based on the number of file pairs per file extension pair.\n", |
1436 | 1436 | " input_data : pd.DataFrame : DataFrame containing pairwise changed files with their pair counts and extensions\n", |
1437 | 1437 | " top_n : int : The number of top extensions to return\n", |
1438 | 1438 | " return : pd.DataFrame : DataFrame with the top N file extension pairs (column fileExtensionPair) and their pair counts (column fileExtensionPairCount), sorted by pair count descending\n", |
1439 | 1439 | " \"\"\"\n", |
1440 | | - " top_extensions = input_data.groupby('fileExtensionPair').aggregate({'filePairWithRelativePath': 'count'}).reset_index()\n", |
1441 | | - " return top_extensions.sort_values(by='filePairWithRelativePath', ascending=False).reset_index(drop=True).head(top_n)['fileExtensionPair']" |
| 1440 | + " top_extensions = input_data.groupby('fileExtensionPair', observed=False).aggregate(\n", |
| 1441 | + " fileExtensionPairCount=pd.NamedAgg(column=\"filePairWithRelativePath\", aggfunc=\"count\")\n", |
| 1442 | + " ).reset_index()\n", |
| 1443 | + " \n", |
| 1444 | + " return top_extensions.sort_values(by='fileExtensionPairCount', ascending=False).reset_index(drop=True).head(top_n)" |
1442 | 1445 | ] |
1443 | 1446 | }, |
1444 | 1447 | { |
|
1449 | 1452 | "outputs": [], |
1450 | 1453 | "source": [ |
1451 | 1454 | "top_pairwise_changed_file_extensions = find_top_pairwise_changed_file_extensions(pairwise_changed_git_files, top_n=4)\n", |
1452 | | - "# Only keep the pairwise change files with the top file extensions\n", |
| 1455 | + "display(top_pairwise_changed_file_extensions)\n", |
| 1456 | + "\n", |
| 1457 | + "pairwise_changed_git_files = pairwise_changed_git_files.merge(top_pairwise_changed_file_extensions, on='fileExtensionPair')\n", |
| 1458 | + "\n", |
| 1459 | + "top_pairwise_changed_file_extensions = top_pairwise_changed_file_extensions['fileExtensionPair']\n", |
1453 | 1460 | "pairwise_changed_git_files = pairwise_changed_git_files[pairwise_changed_git_files['fileExtensionPair'].isin(top_pairwise_changed_file_extensions)]" |
1454 | 1461 | ] |
1455 | 1462 | }, |
|
1471 | 1478 | " return data_frame # Column already exists\n", |
1472 | 1479 | " \n", |
1473 | 1480 | " # Create a new rank column based on the specified column and group by the group column\n", |
1474 | | - " data_frame[f\"{column_name}ExtensionRank\"] = data_frame.groupby('fileExtensionPair')[column_name].rank(ascending=False, method='dense').astype(int)\n", |
| 1481 | + " data_frame[f\"{column_name}ExtensionRank\"] = data_frame.groupby('fileExtensionPair', observed=False)[column_name].rank(ascending=False, method='dense').astype(int)\n", |
1475 | 1482 | " return data_frame" |
1476 | 1483 | ] |
1477 | 1484 | }, |
|
1511 | 1518 | " # Group by the file extensions and the metric and its rank.\n", |
1512 | 1519 | " # Since some entries might have the same metric value, we aggregate by the first file pair with relative path and the first file pair.\n", |
1513 | 1520 | " # This way we can pick the top n entries for each file extension pair.\n", |
1514 | | - " grouping_columns = [\"fileExtensionPair\", metric_column, metric_column + \"ExtensionRank\"]\n", |
| 1521 | + " grouping_columns = [\"fileExtensionPairCount\", \"fileExtensionPair\", metric_column, metric_column + \"ExtensionRank\"]\n", |
1515 | 1522 | " grouped_data = filtered_data.groupby(grouping_columns).aggregate(\n", |
1516 | 1523 | " filePair=pd.NamedAgg(column=\"filePair\", aggfunc=\"first\"),\n", |
1517 | 1524 | " filePairWithRelativePath=pd.NamedAgg(column=\"filePairWithRelativePath\", aggfunc=\"first\"),\n", |
1518 | 1525 | " ).reset_index()\n", |
1519 | 1526 | " \n", |
1520 | | - " return grouped_data.sort_values(by=grouping_columns, ascending=[True, False, True]).reset_index(drop=True).rename(columns={metric_column + \"ExtensionRank\": \"GroupRank\"})" |
1521 | | - ] |
1522 | | - }, |
1523 | | - { |
1524 | | - "cell_type": "code", |
1525 | | - "execution_count": null, |
1526 | | - "id": "3c34ceea", |
1527 | | - "metadata": {}, |
1528 | | - "outputs": [], |
1529 | | - "source": [ |
1530 | | - "# TODO delete if not needed anymore\n", |
1531 | | - "\n", |
1532 | | - "def display_table_for_top_pairwise_changed_file_extensions_deprecated(\n", |
1533 | | - " data_to_display: pd.DataFrame, \n", |
1534 | | - " top_pairwise_changed_file_extensions: pd.Series,\n", |
1535 | | - " sort_column: str,\n", |
1536 | | - " top_n: int = 10\n", |
1537 | | - " ):\n", |
1538 | | - " \"\"\"\n", |
1539 | | - " Displays a table for each top pairwise changed file extension.\n", |
1540 | | - " data_to_plot : pd.DataFrame : DataFrame containing pairwise changed files with their commit counts\n", |
1541 | | - " top_pairwise_changed_file_extensions : pd.Series : Series with top N file extension pairs sorted by their pair count descending\n", |
1542 | | - " sort_column : str : The column to sort the data by (default is \"pairwiseChangeCommitCount\")\n", |
1543 | | - " top_n : int : The number of top entries to display for each extension (default is 10)\n", |
1544 | | - " \"\"\"\n", |
1545 | | - " \n", |
1546 | | - " if data_to_display.empty:\n", |
1547 | | - " print(\"No data to display\")\n", |
1548 | | - " return\n", |
1549 | | - " \n", |
1550 | | - " if top_pairwise_changed_file_extensions.empty:\n", |
1551 | | - " print(\"No top pairwise changed file extensions to display\")\n", |
1552 | | - " return\n", |
1553 | | - "\n", |
1554 | | - " # Display each top pairwise changed file extension with its corresponding data\n", |
1555 | | - " selected_columns = [\"fileExtensionPair\", \"filePair\", sort_column, \"filePairWithRelativePath\"]\n", |
1556 | | - " data_to_display = data_to_display[selected_columns]\n", |
1557 | | - " \n", |
1558 | | - " combined_data_for_top_extensions = pd.DataFrame().reindex_like(data_to_display.head(0)) # Create an empty DataFrame with the same columns as data_to_display\n", |
1559 | | - " \n", |
1560 | | - " for _, extension in enumerate(top_pairwise_changed_file_extensions, start=1):\n", |
1561 | | - " filtered_data = data_to_display[data_to_display[\"fileExtensionPair\"] == extension]\n", |
1562 | | - " sorted_data = filtered_data.sort_values(by=sort_column, ascending=False).head(top_n).reset_index()\n", |
1563 | | - " combined_data_for_top_extensions = pd.concat([combined_data_for_top_extensions, sorted_data], ignore_index=True)\n", |
1564 | | - " \n", |
1565 | | - " display(combined_data_for_top_extensions)" |
| 1527 | + " return (grouped_data\n", |
| 1528 | + " .sort_values(by=grouping_columns, ascending=[False, True, False, True])\n", |
| 1529 | + " .reset_index(drop=True)\n", |
| 1530 | + " .rename(columns={metric_column + \"ExtensionRank\": \"GroupRank\"})\n", |
| 1531 | + " .drop(columns=['fileExtensionPairCount'])\n", |
| 1532 | + " )" |
1566 | 1533 | ] |
1567 | 1534 | }, |
1568 | 1535 | { |
|
1598 | 1565 | " rows=sub_plot_rows, \n", |
1599 | 1566 | " cols=sub_plot_columns, \n", |
1600 | 1567 | " subplot_titles=top_pairwise_changed_file_extensions,\n", |
1601 | | - " vertical_spacing=0.04, \n", |
| 1568 | + " vertical_spacing=0.06, \n", |
1602 | 1569 | " horizontal_spacing=0.04\n", |
1603 | 1570 | " )\n", |
1604 | 1571 | "\n", |
|
0 commit comments