diff --git a/README.md b/README.md index d6b8e9efb..069a5fba5 100644 --- a/README.md +++ b/README.md @@ -219,6 +219,13 @@ The [Code Structure Analysis Pipeline](./.github/workflows/java-code-analysis.ym 👉 The custom Jupyter Notebook metadata property `code_graph_analysis_pipeline_data_validation` can be set to choose a query from [cypher/Validation](./cypher/Validation) that will be executed preliminary to the notebook. If the query leads to at least one result, the validation succeeds and the notebook will be run. If the query leads to no result, the notebook will be skipped. For more details see [Data Availability Validation](./COMMANDS.md#data-availability-validation). +- How can I increase the heap memory when scanning large Typescript projects? + 👉 Use the environment variable `TYPESCRIPT_SCAN_HEAP_MEMORY` to set the heap size in megabytes (default = 4096): + + ```shell + TYPESCRIPT_SCAN_HEAP_MEMORY=16384 ./../../scripts/analysis/analyze.sh + ``` + ## 🕸 Web References - [Graph Data Science 101: Understanding Graphs and Graph Data Science](https://techfirst.medium.com/graph-data-science-101-understanding-graphs-and-graph-data-science-c25055a9db01) diff --git a/cypher/External_Dependencies/Package_json_dependencies_by_package.cypher b/cypher/External_Dependencies/Package_json_dependencies_by_package.cypher new file mode 100644 index 000000000..be05a127e --- /dev/null +++ b/cypher/External_Dependencies/Package_json_dependencies_by_package.cypher @@ -0,0 +1,10 @@ +// List package.json dependencies by package + + MATCH (package:NPM:Package)-[:DECLARES_DEPENDENCY]->(dependency:NPM:Dependency) + OPTIONAL MATCH (package)-[:CONTAINS]->(:Json:Object)-[:HAS_KEY]->(:Json:Key{name:'name'})-[:HAS_VALUE]->(packageName:Json:Scalar:Value) +RETURN replace(replace(package.fileName, '/npm-package-json/', ''), '/package.json', '') + AS packageDirectory + ,packageName.value AS packageName + ,dependency.name AS dependencyName + ,dependency.dependency AS dependencyVersion +ORDER BY packageName, 
dependencyName \ No newline at end of file diff --git a/cypher/External_Dependencies/Package_json_dependencies_combinations.cypher b/cypher/External_Dependencies/Package_json_dependencies_combinations.cypher new file mode 100644 index 000000000..02a192c0b --- /dev/null +++ b/cypher/External_Dependencies/Package_json_dependencies_combinations.cypher @@ -0,0 +1,18 @@ +// List most used combination of 2 and 3 dependencies + + MATCH (package:NPM:Package)-[:DECLARES_DEPENDENCY]->(dependency:NPM:Dependency) + OPTIONAL MATCH (package)-[:CONTAINS]->(:Json:Object)-[:HAS_KEY]->(:Json:Key{name:'name'})-[:HAS_VALUE]->(packageName:Json:Scalar:Value) + WITH package.fileName AS packageFileName + ,dependency.name AS dependencyName + ORDER BY packageFileName, dependencyName + WITH packageFileName + ,apoc.coll.combinations(collect(dependencyName), 2, 3) AS dependencyCombinations +UNWIND dependencyCombinations AS dependencyCombination + WITH dependencyCombination + ,count(*) as occurrences + ,collect(packageFileName) AS packages + WHERE occurrences > 1 +RETURN dependencyCombination + ,occurrences + ,packages[0..10] AS firstTenPackages // The upper bound of a list slice is exclusive +ORDER BY occurrences DESC \ No newline at end of file diff --git a/cypher/External_Dependencies/Package_json_dependencies_combinations_with_versions.cypher b/cypher/External_Dependencies/Package_json_dependencies_combinations_with_versions.cypher new file mode 100644 index 000000000..49ac90aad --- /dev/null +++ b/cypher/External_Dependencies/Package_json_dependencies_combinations_with_versions.cypher @@ -0,0 +1,19 @@ +// List most used combination of 2 and 3 dependencies including version specifier + + MATCH (package:NPM:Package)-[:DECLARES_DEPENDENCY]->(dependency:NPM:Dependency) + OPTIONAL MATCH (package)-[:CONTAINS]->(:Json:Object)-[:HAS_KEY]->(:Json:Key{name:'name'})-[:HAS_VALUE]->(packageName:Json:Scalar:Value) + WITH package.fileName AS packageFileName + ,dependency.name AS dependencyName + ,dependency.dependency AS dependencyVersion + ORDER BY 
packageFileName, dependencyName + WITH packageFileName + ,apoc.coll.combinations(collect(dependencyName + ' ' + dependencyVersion), 2, 3) AS dependencyCombinations +UNWIND dependencyCombinations AS dependencyCombination + WITH dependencyCombination + ,count(*) as occurrences + ,collect(packageFileName) AS packages + WHERE occurrences > 1 +RETURN dependencyCombination + ,occurrences + ,packages[0..10] AS firstTenPackages // The upper bound of a list slice is exclusive +ORDER BY occurrences DESC \ No newline at end of file diff --git a/cypher/External_Dependencies/Package_json_dependencies_occurrence.cypher b/cypher/External_Dependencies/Package_json_dependencies_occurrence.cypher new file mode 100644 index 000000000..0207460cb --- /dev/null +++ b/cypher/External_Dependencies/Package_json_dependencies_occurrence.cypher @@ -0,0 +1,16 @@ +// List package.json dependencies by the number they are used by all packages + + MATCH (package:NPM:Package)-[:DECLARES_DEPENDENCY]->(dependency:NPM:Dependency) + OPTIONAL MATCH (package)-[:CONTAINS]->(:Json:Object)-[:HAS_KEY]->(:Json:Key{name:'name'})-[:HAS_VALUE]->(packageName:Json:Scalar:Value) + WITH replace(replace(package.fileName, '/npm-package-json/', ''), '/package.json', '') + AS packageDirectory + ,packageName.value AS packageName + ,dependency.name AS dependencyName + ,dependency.dependency AS dependencyVersion +RETURN dependencyName + ,count(*) AS usingPackageCount + ,count(DISTINCT dependencyVersion) AS dependencyVersionCount + ,collect(packageName)[0..9] AS packageNameExamples + ,collect(dependencyVersion)[0..4] AS dependencyVersionExamples + ,collect(packageDirectory)[0..4] AS packageDirectory +ORDER BY usingPackageCount DESC \ No newline at end of file diff --git a/cypher/Overview/Words_for_git_author_Wordcloud_with_frequency.cypher b/cypher/Overview/Words_for_git_author_Wordcloud_with_frequency.cypher new file mode 100644 index 000000000..691f328c1 --- /dev/null +++ b/cypher/Overview/Words_for_git_author_Wordcloud_with_frequency.cypher @@ -0,0 +1,6 @@ +// 
Wordcloud of git authors and their commit count + + MATCH (author:Git:Author)-[:COMMITTED]-(commit:Git:Commit) + WHERE NOT author.name CONTAINS '[bot]' + AND size(author.name) > 1 +RETURN author.name AS word, count(commit) AS frequency \ No newline at end of file diff --git a/cypher/Overview/Words_for_universal_Wordcloud.cypher b/cypher/Overview/Words_for_universal_Wordcloud.cypher index 914635829..0f51bd6d7 100644 --- a/cypher/Overview/Words_for_universal_Wordcloud.cypher +++ b/cypher/Overview/Words_for_universal_Wordcloud.cypher @@ -1,6 +1,6 @@ // Words for universal Wordcloud -MATCH (named:!Key&!Primitive&!PrimitiveType&!Void&!JavaType&!ResolvedDuplicateType&!ExternalType) +MATCH (named:!Key&!Primitive&!PrimitiveType&!Void&!JavaType&!ResolvedDuplicateType&!ExternalType&!Git) WHERE named.name > '' AND named.name <> 'package-info' AND named.name <> '' diff --git a/cypher/Path_Finding/Path_Finding_5_All_pairs_shortest_path_distribution_per_project.cypher b/cypher/Path_Finding/Path_Finding_5_All_pairs_shortest_path_distribution_per_project.cypher index f7f430dad..4649c772a 100644 --- a/cypher/Path_Finding/Path_Finding_5_All_pairs_shortest_path_distribution_per_project.cypher +++ b/cypher/Path_Finding/Path_Finding_5_All_pairs_shortest_path_distribution_per_project.cypher @@ -26,9 +26,14 @@ UNWIND sourcesAndTargets AS sourceAndTarget // Optionally get the project (e.g. Java Artifact, Typescript Project) the source and target belong to OPTIONAL MATCH (sourceProject:Artifact|Project)-[:CONTAINS]->(source) OPTIONAL MATCH (targetProject:Artifact|Project)-[:CONTAINS]->(target) +// Optionally get the name of the scan that contained that project +OPTIONAL MATCH (sourceScan:TS:Scan)-[:CONTAINS_PROJECT]->(sourceProject) +OPTIONAL MATCH (targetScan:TS:Scan)-[:CONTAINS_PROJECT]->(targetProject) // Group by project name, if the target project is the same and the distance. Return those as result. 
RETURN sourceProject.name AS sourceProject + ,sourceScan.name AS sourceScan ,(targetProject <> sourceProject) AS isDifferentTargetProject + ,(targetScan <> sourceScan) AS isDifferentTargetScan ,distance ,distanceTotalPairCount ,distanceTotalSourceCount diff --git a/cypher/Path_Finding/Path_Finding_5_All_pairs_shortest_path_examples.cypher b/cypher/Path_Finding/Path_Finding_5_All_pairs_shortest_path_examples.cypher new file mode 100644 index 000000000..4aba1d9fa --- /dev/null +++ b/cypher/Path_Finding/Path_Finding_5_All_pairs_shortest_path_examples.cypher @@ -0,0 +1,23 @@ +// Path Finding - All pairs shortest path algorithm - Stream - Longest paths as examples + + CALL gds.allShortestPaths.stream($dependencies_projection + '-cleaned') + YIELD sourceNodeId, targetNodeId, distance +// Filter out all pairs that have no connection (infinite distance) + WHERE gds.util.isFinite(distance) = true + AND sourceNodeId <> targetNodeId // Filter out cyclic dependencies + WITH toInteger(distance) AS distance + ,sourceNodeId + ,targetNodeId + ,gds.util.asNode(sourceNodeId) AS source + ,gds.util.asNode(targetNodeId) AS target +// Optionally get the project (e.g. 
Java Artifact, Typescript Project) the source and target belong to +OPTIONAL MATCH (sourceProject:Artifact|Project)-[:CONTAINS]->(source) +// Optionally get the name of the scan that contained that project +OPTIONAL MATCH (sourceScan:TS:Scan)-[:CONTAINS_PROJECT]->(sourceProject) + WITH *, coalesce(sourceScan, sourceProject).name AS sourceContainerName + ORDER BY distance DESC, sourceContainerName ASC +// Only output the top 10 entries + LIMIT 10 +// Get the shortest path for the source and target node + MATCH path = SHORTEST 1 (source)-[:DEPENDS_ON]->+(target) +RETURN distance, sourceContainerName, sourceProject, sourceScan, path \ No newline at end of file diff --git a/cypher/Path_Finding/Path_Finding_6_Longest_paths_distribution_per_project.cypher b/cypher/Path_Finding/Path_Finding_6_Longest_paths_distribution_per_project.cypher index f2f607bb2..3c2f21bc9 100644 --- a/cypher/Path_Finding/Path_Finding_6_Longest_paths_distribution_per_project.cypher +++ b/cypher/Path_Finding/Path_Finding_6_Longest_paths_distribution_per_project.cypher @@ -24,9 +24,14 @@ UNWIND sourcesAndTargets AS sourceAndTarget // Optionally get the project (e.g. Java Artifact, Typescript Project) the source and target belong to OPTIONAL MATCH (sourceProject:Artifact|Project)-[:CONTAINS]->(source) OPTIONAL MATCH (targetProject:Artifact|Project)-[:CONTAINS]->(target) +// Optionally get the name of the scan that contained that project +OPTIONAL MATCH (sourceScan:TS:Scan)-[:CONTAINS_PROJECT]->(sourceProject) +OPTIONAL MATCH (targetScan:TS:Scan)-[:CONTAINS_PROJECT]->(targetProject) // Group by project name, if the target project is the same and the distance. Return those as result. 
RETURN sourceProject.name AS sourceProject + ,sourceScan.name AS sourceScan ,(targetProject <> sourceProject) AS isDifferentTargetProject + ,(targetScan <> sourceScan) AS isDifferentTargetScan ,distance ,distanceTotalPairCount ,distanceTotalSourceCount diff --git a/cypher/Path_Finding/Path_Finding_6_Longest_paths_examples.cypher b/cypher/Path_Finding/Path_Finding_6_Longest_paths_examples.cypher new file mode 100644 index 000000000..6d12571bc --- /dev/null +++ b/cypher/Path_Finding/Path_Finding_6_Longest_paths_examples.cypher @@ -0,0 +1,25 @@ +// Path Finding - Longest path - Stream - Max. paths as examples + + CALL gds.dag.longestPath.stream($dependencies_projection + '-cleaned') + YIELD index, sourceNode, targetNode, totalCost, path + WITH index + ,path + ,toInteger(totalCost) AS distance + ,sourceNode AS sourceNodeId + ,targetNode AS targetNodeId + WHERE sourceNodeId <> targetNodeId // Filter out cyclic dependencies + WITH * + ,gds.util.asNode(sourceNodeId) AS source + ,gds.util.asNode(targetNodeId) AS target +// Optionally get the project (e.g. Java Artifact, Typescript Project) the source and target belong to +OPTIONAL MATCH (sourceProject:Artifact|Project)-[:CONTAINS]->(source) +OPTIONAL MATCH (targetProject:Artifact|Project)-[:CONTAINS]->(target) +// Optionally get the name of the scan that contained that project +OPTIONAL MATCH (sourceScan:TS:Scan)-[:CONTAINS_PROJECT]->(sourceProject) +OPTIONAL MATCH (targetScan:TS:Scan)-[:CONTAINS_PROJECT]->(targetProject) + WITH *, coalesce(sourceScan, sourceProject).name AS sourceContainerName + ORDER BY distance DESC, sourceContainerName ASC +// Only output the top 10 entries + LIMIT 10 +// Return the selected example paths with their distance and the project or scan that contains their source. 
+RETURN distance, index, sourceContainerName, sourceProject, sourceScan, path \ No newline at end of file diff --git a/cypher/Typescript_Enrichment/Add_name_to_property_on_scan_nodes.cypher b/cypher/Typescript_Enrichment/Add_name_to_property_on_scan_nodes.cypher index 91fba0296..b2b1e5ea8 100644 --- a/cypher/Typescript_Enrichment/Add_name_to_property_on_scan_nodes.cypher +++ b/cypher/Typescript_Enrichment/Add_name_to_property_on_scan_nodes.cypher @@ -2,6 +2,8 @@ MATCH (typescriptScan:TS:Scan) WITH typescriptScan - ,replace(reverse(split(reverse(typescriptScan.fileName), '/')[0]), '.json', '') AS scanName + ,reverse(split(reverse(split(typescriptScan.fileName, '/.reports/')[0]), '/')[0]) AS scanName SET typescriptScan.name = scanName -RETURN count(*) AS numberOfNamesScans \ No newline at end of file +RETURN count(*) AS numberOfNamesScans +// Debugging +//RETURN scanName, scanNameOld, typescriptScan.fileName \ No newline at end of file diff --git a/jupyter/ExternalDependenciesJava.ipynb b/jupyter/ExternalDependenciesJava.ipynb index e68572be1..b35645241 100644 --- a/jupyter/ExternalDependenciesJava.ipynb +++ b/jupyter/ExternalDependenciesJava.ipynb @@ -6,7 +6,7 @@ "id": "2f0eabc4", "metadata": {}, "source": [ - "# External Dependencies\n", + "# External Dependencies for Java\n", "
\n", "\n", "### References\n", @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 233, "id": "4191f259", "metadata": {}, "outputs": [], @@ -29,7 +29,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 234, "id": "1c5dab37", "metadata": {}, "outputs": [], @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 235, "id": "c1db254b", "metadata": {}, "outputs": [], @@ -56,7 +56,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 236, "id": "59310f6f", "metadata": {}, "outputs": [], @@ -68,7 +68,151 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 237, + "id": "f02d9944", + "metadata": {}, + "outputs": [], + "source": [ + "def group_to_others_below_threshold(data_frame : pd.DataFrame, value_column : str, name_column: str, threshold: float) -> pd.DataFrame: \n", + " \"\"\"\n", + " Adds a new percentage column for the value column and \n", + " groups all values below the given threshold to \"others\" in the name column.\n", + "\n", + " Parameters:\n", + " - data_frame (pd.DataFrame): Input pandas DataFrame\n", + " - value_column (str): Name of the column that contains the numeric value\n", + " - name_column (str): Name of the column that contains the group name that will be replaced by \"others\" for small values\n", + " - threshold (float): Threshold in % that is used to group values below it into the \"others\" group\n", + "\n", + " Returns:\n", + " pd.DataFrame: New DataFrame grouped by the name column (including \"others\") with the summed value and percentage columns, sorted by percentage descending\n", + "\n", + " \"\"\"\n", + " result_data_frame = data_frame[[name_column, value_column]].copy();\n", + "\n", + " percent_column_name = value_column + 'Percent';\n", + "\n", + " # Add column with the name given in \"percent_column_name\" with the percentage of the value column.\n", + " result_data_frame[percent_column_name] = result_data_frame[value_column] / result_data_frame[value_column].sum() * 100.0;\n", + "\n", + " # Convert name column to 
string values if it wasn't of that type before\n", + " result_data_frame[name_column] = result_data_frame[name_column].astype(str)\n", + "\n", + " # Change the group name to \"others\" if it is called less than the specified threshold\n", + " result_data_frame.loc[result_data_frame[percent_column_name] < threshold, name_column] = 'others';\n", + "\n", + " # Group by name column (foremost the new \"others\" entries) and sum their percentage\n", + " #result_data_frame = result_data_frame.groupby(name_column)[percent_column_name].sum();\n", + " result_data_frame = result_data_frame.groupby(name_column).sum();\n", + " # Sort by values descending\n", + " #return result_data_frame.sort_values(ascending=False).to_frame();\n", + " return result_data_frame.sort_values(by=percent_column_name, ascending=False);" + ] + }, + { + "cell_type": "code", + "execution_count": 238, + "id": "47cc11b0", + "metadata": {}, + "outputs": [], + "source": [ + "def filter_values_below_threshold(data_frame : pd.DataFrame, value_column : str, upper_limit: float = 100.0) -> pd.DataFrame: \n", + " \"\"\"\n", + " Adds a new percentage column for the value column and \n", + " keeps only the entries whose percentage does not exceed the given upper limit.\n", + "\n", + " Parameters:\n", + " - data_frame (pd.DataFrame): Input pandas DataFrame\n", + " - value_column (str): Name of the column that contains the numeric value\n", + " - upper_limit (float): Defaults to 100%. Filters out all entries exceeding this limit. 
Intended to drill down \"others\" in a second chart/table.\n", + "\n", + " Returns:\n", + " pd.DataFrame: Copy of the input with the additional percentage column, limited to the entries within the upper limit and sorted by percentage descending\n", + "\n", + " \"\"\"\n", + " result_data_frame = data_frame.copy();\n", + "\n", + " percent_column_name = value_column + 'Percent';\n", + "\n", + " # Add column with the name given in \"percent_column_name\" with the percentage of the value column.\n", + " result_data_frame[percent_column_name] = result_data_frame[value_column] / result_data_frame[value_column].sum() * 100.0;\n", + "\n", + " # Limit entries to meet the optional upper limit (in percentage)\n", + " result_data_frame = result_data_frame.query(\"`\" + percent_column_name + \"` <= \" + str(upper_limit))\n", + "\n", + " result_data_frame = result_data_frame.reset_index(drop=True)\n", + " return result_data_frame.sort_values(by=percent_column_name, ascending=False);" + ] + }, + { + "cell_type": "code", + "execution_count": 239, + "id": "89a12ec4", + "metadata": {}, + "outputs": [], + "source": [ + "def explode_index_value(input_data_frame: pd.DataFrame, index_value_to_emphasize: str = 'others', base_value: float = 0.02, emphasize_value: float = 0.2):\n", + " \"\"\"\n", + " \"Explode\" offsets slices in a pie chart plot by a given value.\n", + " The specified index value will be emphasized with a larger value to make it stand out in the pie chart plot.\n", + "\n", + " Parameters:\n", + " - input_data_frame (pd.DataFrame): Input pandas DataFrame with the data that will be plotted. (Required)\n", + " - index_value_to_emphasize (str): Value of the index that will be emphasized. (Default= 'others')\n", + " - base_value (float): Base value for all pies in the chart. (Default=0.02)\n", + " - emphasize_value (float): Value for the emphasized pie in the chart. 
(Default=0.2)\n", + "\n", + " Returns:\n", + " Array with the same size as the number of rows/pies to plot containing the \"explode\" value for each of them\n", + "\n", + " \"\"\"\n", + " # Each entry in the list corresponds to an x value\n", + " # The comparison with the index_value_to_emphasize produces an array of booleans where nth entry with the emphasized value is \"true\"\n", + " # Multiplying it leads to 1 for True and 0 for False therefore \"exploding\" the emphasized entry whilst all other entries only get the base value\n", + " return (input_data_frame.index == index_value_to_emphasize) * emphasize_value + base_value " + ] + }, + { + "cell_type": "code", + "execution_count": 240, + "id": "e9b1ccad", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_pie_chart(input_data_frame: pd.DataFrame, title: str):\n", + " if input_data_frame.empty:\n", + " print(\"No data to plot for title '\" + title + \"'.\")\n", + " return\n", + " \n", + " name_of_the_first_column_containing_the_values=input_data_frame.columns[0]\n", + " total_sum = input_data_frame[name_of_the_first_column_containing_the_values].sum()\n", + " \n", + " def custom_auto_percentage_format(percentage):\n", + " return '{:1.2f}% ({:.0f})'.format(percentage, total_sum * percentage / 100.0)\n", + " \n", + " plot.figure();\n", + "\n", + " axis = input_data_frame.plot(\n", + " kind='pie',\n", + " y=name_of_the_first_column_containing_the_values + 'Percent',\n", + " ylabel='',\n", + " legend=True,\n", + " labeldistance=None,\n", + " autopct=custom_auto_percentage_format,\n", + " textprops={'fontsize': 6},\n", + " pctdistance=1.15,\n", + " cmap=main_color_map,\n", + " figsize=(9,9),\n", + " explode=explode_index_value(input_data_frame, index_value_to_emphasize='others')\n", + " )\n", + " plot.title(title, pad=15)\n", + " axis.legend(bbox_to_anchor=(1.08, 1), loc='upper left')\n", + " plot.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 241, "id": "da9e8edb", "metadata": {}, "outputs": [], @@ -102,7 +246,7 @@ }, { 
"cell_type": "code", - "execution_count": null, + "execution_count": 243, "id": "c2496caf", "metadata": {}, "outputs": [], @@ -172,50 +316,12 @@ "id": "1143afcb", "metadata": {}, "source": [ - "#### Table 1 Chart 1 - Most called external packages in % by types\n", + "#### Table 1 Chart 1a - Most called external packages in % by types (more than 0.7% overall)\n", "\n", "External packages that are used less than 0.7% are grouped into the name \"others\" to get a cleaner chart\n", "with the most significant external packages and how ofter they are called in percent." ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "44a11aec", - "metadata": {}, - "outputs": [], - "source": [ - "def group_to_others_below_threshold(data_frame : pd.DataFrame, value_column : str, name_column: str, threshold: float) -> pd.DataFrame: \n", - " \"\"\"Adds a new percentage column for the value column and \n", - " groups all values below the given threshold to \"others\" in the name column.\n", - "\n", - " Parameters:\n", - " - data_frame (pd.DataFrame): Input pandas DataFrame\n", - " - value_column (str): Name of the column that contains the numeric value\n", - " - name_column (str): Name of the column that contains the group name that will be replaced by \"others\" for small values\n", - " - threshold (float): Threshold in % that is used to group values below it into the \"others\" group\n", - "\n", - " Returns:\n", - " int:Returning value\n", - "\n", - " \"\"\"\n", - " result_data_frame = data_frame.copy();\n", - "\n", - " percent_column_name = value_column + 'Percent';\n", - "\n", - " # Add column with the name given in \"percent_column_name\" with the percentage of the value column.\n", - " result_data_frame[percent_column_name] = result_data_frame[value_column] / result_data_frame[value_column].sum() * 100.0;\n", - "\n", - " # Change the external package name to \"others\" if it is called less than the specified threshold\n", - " 
result_data_frame.loc[result_data_frame[percent_column_name] < threshold, name_column] = 'others';\n", - "\n", - " # Group external package name (foremost the new \"others\" entries) and sum their percentage\n", - " result_data_frame = result_data_frame.groupby(name_column)[percent_column_name].sum();\n", - "\n", - " # Sort by values descending\n", - " return result_data_frame.sort_values(ascending=False);" - ] - }, { "cell_type": "code", "execution_count": null, @@ -228,33 +334,42 @@ " value_column='numberOfExternalCallerTypes',\n", " name_column='externalPackageName',\n", " threshold= 0.7\n", - ");" + ")\n", + "plot_pie_chart(\n", + " input_data_frame=external_package_by_type_usage_significant,\n", + " title='Top external package usage [%] by type (more than 0.7% overall)'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "61565f22", + "metadata": {}, + "source": [ + "#### Table 1 Chart 1b - Most called external packages in % by types (less than 0.7% overall \"others\" drill-down)\n", + "\n", + "Shows the lowest (less than 0.7% overall) most called external package. Therefore, this plot breaks down the \"others\" slice of the pie chart above. Values under 0.3% from that will be grouped into \"others\" to get a cleaner plot." 
] }, { "cell_type": "code", "execution_count": null, - "id": "688b6d56", + "id": "d15ba749", "metadata": {}, "outputs": [], "source": [ - "if external_package_by_type_usage_significant.empty:\n", - " print(\"No data to plot\")\n", - "else:\n", - " plot.figure();\n", + "external_package_by_type_usage_drill_down_others=filter_values_below_threshold(external_package_usage, 'numberOfExternalCallerTypes', 0.7)\n", "\n", - " axis = external_package_by_type_usage_significant.plot(\n", - " kind='pie',\n", - " title='Top external package usage [%] by type',\n", - " legend=True,\n", - " labeldistance=None,\n", - " autopct='%1.2f%%',\n", - " textprops={'fontsize': 5},\n", - " pctdistance=1.2,\n", - " cmap=main_color_map\n", - " )\n", - " axis.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", - " plot.show()" + "external_package_by_type_usage_significant_drill_down_others = group_to_others_below_threshold(\n", + " data_frame=external_package_by_type_usage_drill_down_others,\n", + " value_column='numberOfExternalCallerTypes',\n", + " name_column='externalPackageName',\n", + " threshold= 0.3\n", + ")\n", + "plot_pie_chart(\n", + " input_data_frame=external_package_by_type_usage_significant_drill_down_others,\n", + " title='Top external package usage [%] by type (less than 0.7% overall \"others\" drill-down)'\n", + ")" ] }, { @@ -262,7 +377,7 @@ "id": "84c123dc", "metadata": {}, "source": [ - "#### Table 1 Chart 2 - Most called external packages in % by packages\n", + "#### Table 1 Chart 2a - Most called external packages in % by packages (more than 0.7% overall)\n", "\n", "External packages that are used less than 0.7% are grouped into the name \"others\" to get a cleaner chart\n", "with the most significant external packages and how ofter they are called in percent." 
@@ -280,33 +395,42 @@ " value_column='numberOfExternalCallerPackages',\n", " name_column='externalPackageName',\n", " threshold= 0.7\n", - ");" + ")\n", + "plot_pie_chart(\n", + " input_data_frame=external_package_by_package_usage_significant,\n", + " title='Top external package usage [%] by package (more than 0.7% overall)'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "db98344d", + "metadata": {}, + "source": [ + "#### Table 1 Chart 2b - Most called external packages in % by packages (less than 0.7% overall \"others\" drill-down)\n", + "\n", + "Shows the lowest (less than 0.7% overall) most called external package. Therefore, this plot breaks down the \"others\" slice of the pie chart above. Values under 0.3% from that will be grouped into \"others\" to get a cleaner plot." ] }, { "cell_type": "code", "execution_count": null, - "id": "c165f403", + "id": "62ffff85", "metadata": {}, "outputs": [], "source": [ - "if external_package_by_package_usage_significant.empty:\n", - " print(\"No data to plot\")\n", - "else:\n", - " plot.figure();\n", + "external_package_usage_by_package_drill_down_others=filter_values_below_threshold(external_package_usage, 'numberOfExternalCallerPackages', 0.7)\n", "\n", - " axis = external_package_by_package_usage_significant.plot(\n", - " kind='pie',\n", - " title='Top external package usage [%] by package',\n", - " legend=True,\n", - " labeldistance=None,\n", - " autopct='%1.2f%%',\n", - " textprops={'fontsize': 5},\n", - " pctdistance=1.2,\n", - " cmap=main_color_map\n", - " )\n", - " axis.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", - " plot.show()" + "external_package_by_package_usage_significant_drill_down_others = group_to_others_below_threshold(\n", + " data_frame=external_package_usage_by_package_drill_down_others,\n", + " value_column='numberOfExternalCallerPackages',\n", + " name_column='externalPackageName',\n", + " threshold= 0.3\n", + ")\n", + "plot_pie_chart(\n", + " 
input_data_frame=external_package_by_package_usage_significant_drill_down_others,\n", + " title='Top external package usage [%] by package (less than 0.7% overall \"others\" drill-down)'\n", + ")" ] }, { @@ -350,7 +474,7 @@ "id": "279932a6", "metadata": {}, "source": [ - "#### Table 2 Chart 1 - Most called second level external packages in % by type\n", + "#### Table 2 Chart 1a - Most called second level external packages in % by type (more than 0.7% overall)\n", "\n", "External package groups that are used less than 0.7% are grouped into the name \"others\" to get a cleaner chart\n", "with the most significant external packages and how ofter they are called in percent." @@ -369,24 +493,41 @@ " name_column='externalSecondLevelPackageName',\n", " threshold= 0.7\n", ");\n", + "plot_pie_chart(\n", + " input_data_frame=external_grouped_package_by_type_usage_significant,\n", + " title='Top external package (grouped by first 2 layers) usage [%] by type (more than 0.7% overall)'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "378b9eef", + "metadata": {}, + "source": [ + "#### Table 2 Chart 1b - Most called second level external packages in % by type (less than 0.7% overall \"others\" drill-down)\n", "\n", - "if external_grouped_package_by_type_usage_significant.empty:\n", - " print(\"No data to plot\")\n", - "else:\n", - " plot.figure();\n", + "Shows the lowest (less than 0.7% overall) most called external package. Therefore, this plot breaks down the \"others\" slice of the pie chart above. Values under 0.3% from that will be grouped into \"others\" to get a cleaner plot." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f8a467e", + "metadata": {}, + "outputs": [], + "source": [ + "external_grouped_package_by_type_usage_drill_down_others=filter_values_below_threshold(external_grouped_package_usage, 'numberOfExternalCallerTypes', 0.7)\n", "\n", - " axis = external_grouped_package_by_type_usage_significant.plot(\n", - " kind='pie',\n", - " title='Top external package (grouped by first 2 layers) usage [%] by type',\n", - " legend=True,\n", - " labeldistance=None,\n", - " autopct='%1.2f%%',\n", - " textprops={'fontsize': 5},\n", - " pctdistance=1.2,\n", - " cmap=main_color_map\n", - " )\n", - " axis.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", - " plot.show()" + "external_grouped_package_by_type_usage_significant_drill_down_others = group_to_others_below_threshold(\n", + " data_frame=external_grouped_package_by_type_usage_drill_down_others,\n", + " value_column='numberOfExternalCallerTypes',\n", + " name_column='externalSecondLevelPackageName',\n", + " threshold= 0.3\n", + ")\n", + "plot_pie_chart(\n", + " input_data_frame=external_grouped_package_by_type_usage_significant_drill_down_others,\n", + " title='Top external package (grouped by first 2 layers) usage [%] by type (less than 0.7% overall \"others\" drill-down)'\n", + ")" ] }, { @@ -394,7 +535,7 @@ "id": "43c3e1a3", "metadata": {}, "source": [ - "#### Table 2 Chart 2 - Most called second level external packages in % by package\n", + "#### Table 2 Chart 2a - Most called second level external packages in % by package (more than 0.7% overall)\n", "\n", "External package groups that are used less than 0.7% are grouped into the name \"others\" to get a cleaner chart\n", "with the most significant external packages and how ofter they are called in percent." 
@@ -413,23 +554,41 @@ " name_column='externalSecondLevelPackageName',\n", " threshold= 0.7\n", ");\n", + "plot_pie_chart(\n", + " input_data_frame=external_grouped_package_by_package_usage_significant,\n", + " title='Top external package (grouped by first 2 layers) usage [%] by package (more than 0.7% overall)'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "aa3592f0", + "metadata": {}, + "source": [ + "#### Table 2 Chart 2b - Most called second level external packages in % by package (less than 0.7% overall \"others\" drill-down)\n", "\n", - "if external_grouped_package_by_package_usage_significant.empty:\n", - " print(\"No data to plot\")\n", - "else:\n", - " plot.figure();\n", - " axis = external_grouped_package_by_package_usage_significant.plot(\n", - " kind='pie',\n", - " title='Top external package (grouped by first 2 layers) usage [%] by package',\n", - " legend=True,\n", - " labeldistance=None,\n", - " autopct='%1.2f%%',\n", - " textprops={'fontsize': 5},\n", - " pctdistance=1.2,\n", - " cmap=main_color_map\n", - " )\n", - " axis.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", - " plot.show()" + "Shows the lowest (less than 0.7% overall) most called external package. Therefore, this plot breaks down the \"others\" slice of the pie chart above. Values under 0.3% from that will be grouped into \"others\" to get a cleaner plot." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09eae481", + "metadata": {}, + "outputs": [], + "source": [ + "external_grouped_package_by_package_usage_drill_down_others=filter_values_below_threshold(external_grouped_package_usage, 'numberOfExternalCallerPackages', 0.7)\n", + "\n", + "external_grouped_package_by_package_usage_significant_drill_down_others = group_to_others_below_threshold(\n", + " data_frame=external_grouped_package_by_package_usage_drill_down_others,\n", + " value_column='numberOfExternalCallerPackages',\n", + " name_column='externalSecondLevelPackageName',\n", + " threshold= 0.3\n", + ")\n", + "plot_pie_chart(\n", + " input_data_frame=external_grouped_package_by_package_usage_significant_drill_down_others,\n", + " title='Top external package (grouped by first 2 layers) usage [%] by package (less than 0.7% overall \"others\" drill-down)'\n", + ")" ] }, { @@ -561,7 +720,7 @@ "id": "b210eea0", "metadata": {}, "source": [ - "#### Table 3 Chart 1 - Most widely spread external packages in % by types\n", + "#### Table 3 Chart 1a - Most widely spread external packages in % by types (more than 0.5% overall)\n", "\n", "External packages that are used less than 0.5% are grouped into the name \"others\" to get a cleaner chart with the most significant external packages." 
] @@ -579,23 +738,41 @@ " name_column='externalPackageName',\n", " threshold= 0.5\n", ");\n", + "plot_pie_chart(\n", + " input_data_frame=external_package_type_usage_spread_significant,\n", + " title='Top external package usage spread [%] by type (more than 0.5% overall)'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "d380fd56", + "metadata": {}, + "source": [ + "#### Table 3 Chart 1b - Most widely spread external packages in % by types (less than 0.5% overall \"others\" drill-down)\n", "\n", - "if external_package_type_usage_spread_significant.empty:\n", - " print(\"No data to plot\")\n", - "else:\n", - " plot.figure();\n", - " axis = external_package_type_usage_spread_significant.plot(\n", - " kind='pie',\n", - " title='Top external package usage spread [%] by type',\n", - " legend=True,\n", - " labeldistance=None,\n", - " autopct='%1.2f%%',\n", - " textprops={'fontsize': 5},\n", - " pctdistance=1.2,\n", - " cmap=main_color_map\n", - " )\n", - " axis.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", - " plot.show()" + "Shows the lowest (less than 0.5% overall) most spread external packages. Therefore, this plot breaks down the \"others\" slice of the pie chart above. Values under 0.3% from that will be grouped into \"others\" to get a cleaner plot." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b2eb0e8f", + "metadata": {}, + "outputs": [], + "source": [ + "external_package_type_usage_spread_drill_down_others=filter_values_below_threshold(external_package_usage_spread, 'sumNumberOfTypes', 0.5)\n", + "\n", + "external_package_type_usage_spread_significant_drill_down_others = group_to_others_below_threshold(\n", + " data_frame=external_package_type_usage_spread_drill_down_others,\n", + " value_column='sumNumberOfTypes',\n", + " name_column='externalPackageName',\n", + " threshold= 0.3\n", + ")\n", + "plot_pie_chart(\n", + " input_data_frame=external_package_type_usage_spread_significant_drill_down_others,\n", + " title='Top external package usage spread [%] by type (less than 0.5% overall \"others\" drill-down)'\n", + ")" ] }, { @@ -603,7 +780,7 @@ "id": "c48740e3", "metadata": {}, "source": [ - "#### Table 3 Chart 2 - Most widely spread external packages in % by packages\n", + "#### Table 3 Chart 2a - Most widely spread external packages in % by packages (more than 0.5% overall)\n", "\n", "External packages that are used less than 0.5% are grouped into the name \"others\" to get a cleaner chart with the most significant external packages." 
] @@ -621,23 +798,41 @@ " name_column='externalPackageName',\n", " threshold= 0.5\n", ");\n", + "plot_pie_chart(\n", + " input_data_frame=external_package_usage_package_spread_significant,\n", + " title='Top external package usage spread [%] by package (more than 0.5% overall)'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f14995ee", + "metadata": {}, + "source": [ + "#### Table 3 Chart 2b - Most widely spread external packages in % by packages (less than 0.5% overall \"others\" drill-down)\n", "\n", - "if external_package_usage_package_spread_significant.empty:\n", - " print(\"No data to plot\")\n", - "else:\n", - " plot.figure();\n", - " axis = external_package_usage_package_spread_significant.plot(\n", - " kind='pie',\n", - " title='Top external package usage spread [%] by package',\n", - " legend=True,\n", - " labeldistance=None,\n", - " autopct='%1.2f%%',\n", - " textprops={'fontsize': 5},\n", - " pctdistance=1.2,\n", - " cmap=main_color_map\n", - " )\n", - " axis.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", - " plot.show()" + "Shows the lowest (less than 0.5% overall) most spread external packages. Therefore, this plot breaks down the \"others\" slice of the pie chart above. Values under 0.3% from that will be grouped into \"others\" to get a cleaner plot." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08c09016", + "metadata": {}, + "outputs": [], + "source": [ + "external_package_usage_package_spread_drill_down_others=filter_values_below_threshold(external_package_usage_spread, 'sumNumberOfPackages', 0.5)\n", + "\n", + "external_grouped_package_type_usage_spread_significant_drill_down_others = group_to_others_below_threshold(\n", + " data_frame=external_package_usage_package_spread_drill_down_others,\n", + " value_column='sumNumberOfPackages',\n", + " name_column='externalPackageName',\n", + " threshold= 0.3\n", + ")\n", + "plot_pie_chart(\n", + " input_data_frame=external_grouped_package_type_usage_spread_significant_drill_down_others,\n", + " title='Top external package usage spread [%] by package (less than 0.5% overall \"others\" drill-down)'\n", + ")" ] }, { @@ -683,7 +878,7 @@ "id": "e6f098e6", "metadata": {}, "source": [ - "#### Table 4 Chart 1 - Most widely spread second level external packages in % by type\n", + "#### Table 4 Chart 1a - Most widely spread second level external packages in % by type (more than 0.5% overall)\n", "\n", "External package groups that are used less than 0.5% are grouped into the name \"others\" to get a cleaner chart\n", "with the most significant external packages and how ofter they are called in percent." 
@@ -702,24 +897,41 @@ " name_column='externalSecondLevelPackageName',\n", " threshold= 0.5\n", ");\n", + "plot_pie_chart(\n", + " input_data_frame=external_grouped_package_type_usage_spread_significant,\n", + " title='Top external package (grouped by first 2 layers) usage spread [%] by type'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "681b2597", + "metadata": {}, + "source": [ + "#### Table 4 Chart 1b - Most widely spread second level external packages in % by type (less than 0.5% overall \"others\" drill-down)\n", "\n", - "if external_grouped_package_type_usage_spread_significant.empty:\n", - " print(\"No data to plot\")\n", - "else:\n", - " plot.figure();\n", + "External packages that are used less than 0.5% are grouped into the name \"others\" to get a cleaner chart with the most significant external packages." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "062a297b", + "metadata": {}, + "outputs": [], + "source": [ + "external_grouped_package_type_usage_spread_drill_down_others=filter_values_below_threshold(external_grouped_package_usage_spread, 'sumNumberOfTypes', 0.5)\n", "\n", - " axis = external_grouped_package_type_usage_spread_significant.plot(\n", - " kind='pie',\n", - " title='Top external package (grouped by first 2 layers) usage spread [%] by type',\n", - " legend=True,\n", - " labeldistance=None,\n", - " autopct='%1.2f%%',\n", - " textprops={'fontsize': 5},\n", - " pctdistance=1.2,\n", - " cmap=main_color_map\n", - " )\n", - " axis.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", - " plot.show()" + "external_package_usage_package_spread_significant_drill_down_others = group_to_others_below_threshold(\n", + " data_frame=external_grouped_package_type_usage_spread_drill_down_others,\n", + " value_column='sumNumberOfTypes',\n", + " name_column='externalSecondLevelPackageName',\n", + " threshold= 0.3\n", + ")\n", + "plot_pie_chart(\n", + " 
input_data_frame=external_package_usage_package_spread_significant_drill_down_others,\n", + " title='Top external package (grouped by first 2 layers) usage spread [%] by type (less than 0.5% overall \"others\" drill-down)'\n", + ")" ] }, { @@ -727,7 +939,7 @@ "id": "0b91082e", "metadata": {}, "source": [ - "#### Table 4 Chart 2 - Most widely spread second level external packages in % by package\n", + "#### Table 4 Chart 2a - Most widely spread second level external packages in % by package (more than 0.5% overall)\n", "\n", "External package groups that are used less than 0.5% are grouped into the name \"others\" to get a cleaner chart\n", "with the most significant external packages and how ofter they are called in percent." @@ -746,24 +958,41 @@ " name_column='externalSecondLevelPackageName',\n", " threshold= 0.5\n", ");\n", + "plot_pie_chart(\n", + " input_data_frame=external_grouped_package_package_usage_spread_significant,\n", + " title='Top external package (grouped by first 2 layers) usage spread [%] by package (more than 0.5% overall)'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "c6234de8", + "metadata": {}, + "source": [ + "#### Table 4 Chart 2b - Most widely spread second level external packages in % by package (less than 0.5% overall \"others\" drill-down)\n", "\n", - "if external_grouped_package_package_usage_spread_significant.empty:\n", - " print(\"No data to plot\")\n", - "else:\n", - " plot.figure();\n", + "Shows the lowest (less than 0.5% overall) most spread second level external packages. Therefore, this plot breaks down the \"others\" slice of the pie chart above. Values under 0.3% from that will be grouped into \"others\" to get a cleaner plot." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a03be97", + "metadata": {}, + "outputs": [], + "source": [ + "external_grouped_package_package_usage_spread_drill_down_others=filter_values_below_threshold(external_grouped_package_usage_spread, 'sumNumberOfPackages', 0.5)\n", "\n", - " axis = external_grouped_package_package_usage_spread_significant.plot(\n", - " kind='pie',\n", - " title='Top external package (grouped by first 2 layers) usage spread [%] by package',\n", - " legend=True,\n", - " labeldistance=None,\n", - " autopct='%1.2f%%',\n", - " textprops={'fontsize': 5},\n", - " pctdistance=1.2,\n", - " cmap=main_color_map\n", - " )\n", - " axis.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", - " plot.show()" + "external_grouped_package_package_usage_spread_significant_drill_down_others = group_to_others_below_threshold(\n", + " data_frame=external_grouped_package_package_usage_spread_drill_down_others,\n", + " value_column='sumNumberOfPackages',\n", + " name_column='externalSecondLevelPackageName',\n", + " threshold= 0.3\n", + ")\n", + "plot_pie_chart(\n", + " input_data_frame=external_grouped_package_package_usage_spread_significant_drill_down_others,\n", + " title='Top external package (grouped by first 2 layers) usage spread [%] by package (less than 0.5% overall \"others\" drill-down)'\n", + ")" ] }, { @@ -911,7 +1140,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 210, "id": "fd9667a9", "metadata": {}, "outputs": [], @@ -1351,7 +1580,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 224, "id": "ad1db8af", "metadata": {}, "outputs": [], @@ -1488,7 +1717,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 228, "id": "46baa3c1", "metadata": {}, "outputs": [], @@ -1503,8 +1732,8 @@ "name": "JohT" } ], - "code_graph_analysis_pipeline_data_validation": "ValidateJavaExternalDependencies", "celltoolbar": "Tags", + 
"code_graph_analysis_pipeline_data_validation": "ValidateJavaExternalDependencies", 
"kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -1520,9 +1749,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.11.9" }, - "title": "Object Oriented Design Quality Metrics for Java with Neo4j" + "title": "External Dependencies for Java" }, "nbformat": 4, "nbformat_minor": 5 diff --git a/jupyter/ExternalDependenciesTypescript.ipynb b/jupyter/ExternalDependenciesTypescript.ipynb index 0c79edbdf..a6d80e822 100644 --- a/jupyter/ExternalDependenciesTypescript.ipynb +++ b/jupyter/ExternalDependenciesTypescript.ipynb @@ -6,7 +6,7 @@ "id": "2f0eabc4", "metadata": {}, "source": [ - "# External Dependencies\n", + "# External Dependencies for Typescript\n", "
 \n", "\n", "### References\n", @@ -66,6 +66,154 @@ " return pd.DataFrame([r.values() for r in records], columns=keys)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "da9e8edb", + "metadata": {}, + "outputs": [], + "source": [ + "def group_to_others_below_threshold(data_frame : pd.DataFrame, value_column : str, name_column: str, threshold: float) -> pd.DataFrame: \n", + " \"\"\"\n", + " Adds a new percentage column for the value column and \n", + " groups all values below the given threshold to \"others\" in the name column.\n", + "\n", + " Parameters:\n", + " - data_frame (pd.DataFrame): Input pandas DataFrame\n", + " - value_column (str): Name of the column that contains the numeric value\n", + " - name_column (str): Name of the column that contains the group name that will be replaced by \"others\" for small values\n", + " - threshold (float): Threshold in % that is used to group values below it into the \"others\" group\n", + "\n", + " Returns:\n", + " pd.DataFrame: Rows grouped by the name column (small entries merged into \"others\") with the summed value and percentage columns, sorted by percentage descending\n", + "\n", + " \"\"\"\n", + " result_data_frame = data_frame[[name_column, value_column]].copy();\n", + "\n", + " percent_column_name = value_column + 'Percent';\n", + "\n", + " # Add column with the name given in \"percent_column_name\" with the percentage of the value column.\n", + " result_data_frame[percent_column_name] = result_data_frame[value_column] / result_data_frame[value_column].sum() * 100.0;\n", + "\n", + " # Convert name column to string values if it wasn't of that type before\n", + " result_data_frame[name_column] = result_data_frame[name_column].astype(str)\n", + "\n", + " # Change the group name to \"others\" if it is called less than the specified threshold\n", + " result_data_frame.loc[result_data_frame[percent_column_name] < threshold, name_column] = 'others';\n", + "\n", + " # Group by name column (foremost the 
new \"others\" entries) and sum their percentage\n", + " #result_data_frame = 
result_data_frame.groupby(name_column)[percent_column_name].sum();\n", + " result_data_frame = result_data_frame.groupby(name_column).sum();\n", + " # Sort by values descending\n", + " #return result_data_frame.sort_values(ascending=False).to_frame();\n", + " return result_data_frame.sort_values(by=percent_column_name, ascending=False);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe7da2e1", + "metadata": {}, + "outputs": [], + "source": [ + "def filter_values_below_threshold(data_frame : pd.DataFrame, value_column : str, upper_limit: float = 100.0) -> pd.DataFrame: \n", + " \"\"\"\n", + " Adds a new percentage column for the value column and \n", + " keeps only the entries whose percentage does not exceed the given upper limit.\n", + "\n", + " Parameters:\n", + " - data_frame (pd.DataFrame): Input pandas DataFrame\n", + " - value_column (str): Name of the column that contains the numeric value\n", + " - upper_limit (float): Defaults to 100%. Filters out all entries exceeding this limit. 
Intended to drill down \"others\" in a second chart/table.\n", + "\n", + " Returns:\n", + " pd.DataFrame: Copy of the input with an added percentage column, limited to the entries that do not exceed the upper limit, sorted by percentage descending\n", + "\n", + " \"\"\"\n", + " result_data_frame = data_frame.copy();\n", + "\n", + " percent_column_name = value_column + 'Percent';\n", + "\n", + " # Add column with the name given in \"percent_column_name\" with the percentage of the value column.\n", + " result_data_frame[percent_column_name] = result_data_frame[value_column] / result_data_frame[value_column].sum() * 100.0;\n", + "\n", + " # Limit entries to meet the optional upper limit (in percentage)\n", + " result_data_frame = result_data_frame.query(\"`\" + percent_column_name + \"` <= \" + str(upper_limit))\n", + "\n", + " result_data_frame = result_data_frame.reset_index(drop=True)\n", + " return result_data_frame.sort_values(by=percent_column_name, ascending=False);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9deaabce", + "metadata": { + "tags": [ + "table-css" + ] + }, + "outputs": [], + "source": [ + "def explode_index_value(input_data_frame: pd.DataFrame, index_value_to_emphasize: str = 'others', base_value: float = 0.02, emphasize_value: float = 0.2):\n", + " \"\"\"\n", + " \"Explode\" offsets slices in a pie chart plot by a given value.\n", + " The specified index value will be emphasized with a larger value to make it stand out in the pie chart plot.\n", + "\n", + " Parameters:\n", + " - input_data_frame (pd.DataFrame): Input pandas DataFrame with the data that will be plotted. (Required)\n", + " - index_value_to_emphasize (str): Value of the index that will be emphasized. (Default= 'others')\n", + " - base_value (float): Base value for all pies in the chart. (Default=0.02)\n", + " - emphasize_value (float): Value for the emphasized pie in the chart. 
(Default=0.2)\n", + "\n", + " Returns:\n", + " Array with the same size as the number of rows/pies to plot containing the \"explode\" value for each of them\n", + "\n", + " \"\"\"\n", + " # Each entry in the list corresponds to an x value\n", + " # The comparison with the index_value_to_emphasize produces an array of booleans where nth entry with the emphasized value is \"true\"\n", + " # Multiplying it leads to 1 for True and 0 for False therefore \"exploding\" the emphasized entry whilst all other entries only get the base value\n", + " return (input_data_frame.index == index_value_to_emphasize) * emphasize_value + base_value " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2496caf", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_pie_chart(input_data_frame: pd.DataFrame, title: str):\n", + " if input_data_frame.empty:\n", + " print(\"No data to plot for title '\" + title + \"'.\")\n", + " return\n", + " \n", + " name_of_the_first_column_containing_the_values=input_data_frame.columns[0]\n", + " total_sum = input_data_frame[name_of_the_first_column_containing_the_values].sum()\n", + " \n", + " def custom_auto_percentage_format(percentage):\n", + " return '{:1.2f}% ({:.0f})'.format(percentage, total_sum * percentage / 100.0)\n", + " \n", + " plot.figure();\n", + "\n", + " axis = input_data_frame.plot(\n", + " kind='pie',\n", + " y=name_of_the_first_column_containing_the_values + 'Percent',\n", + " ylabel='',\n", + " legend=True,\n", + " labeldistance=None,\n", + " autopct=custom_auto_percentage_format,\n", + " textprops={'fontsize': 6},\n", + " pctdistance=1.15,\n", + " cmap=main_color_map,\n", + " figsize=(9,9),\n", + " explode=explode_index_value(input_data_frame, index_value_to_emphasize='others')\n", + " )\n", + " plot.title(title, pad=15)\n", + " axis.legend(bbox_to_anchor=(1.08, 1), loc='upper left')\n", + " plot.show()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -170,7 +318,7 @@ "id": "1143afcb", "metadata": {}, 
"source": [ - "#### Table 1 
Chart 1 - Most called external modules in % by internal elements\n", + "#### Table 1 Chart 1a - Most called external modules in % by internal elements (more than 0.7% overall)\n", "\n", "External modules that are used less than 0.7% are grouped into \"others\" to get a cleaner chart\n", "containing the most significant external modules and how ofter they are called by internal elements in percent." @@ -179,45 +327,7 @@ { "cell_type": "code", "execution_count": null, - "id": "44a11aec", - "metadata": {}, - "outputs": [], - "source": [ - "def group_to_others_below_threshold(data_frame : pd.DataFrame, value_column : str, name_column: str, threshold: float) -> pd.DataFrame: \n", - " \"\"\"Adds a new percentage column for the value column and \n", - " groups all values below the given threshold to \"others\" in the name column.\n", - "\n", - " Parameters:\n", - " - data_frame (pd.DataFrame): Input pandas DataFrame\n", - " - value_column (str): Name of the column that contains the numeric value\n", - " - name_column (str): Name of the column that contains the group name that will be replaced by \"others\" for small values\n", - " - threshold (float): Threshold in % that is used to group values below it into the \"others\" group\n", - "\n", - " Returns:\n", - " int:Returning value\n", - "\n", - " \"\"\"\n", - " result_data_frame = data_frame.copy();\n", - "\n", - " percent_column_name = value_column + 'Percent';\n", - "\n", - " # Add column with the name given in \"percent_column_name\" with the percentage of the value column.\n", - " result_data_frame[percent_column_name] = result_data_frame[value_column] / result_data_frame[value_column].sum() * 100.0;\n", - "\n", - " # Change the external module name to \"others\" if it is called less than the specified threshold\n", - " result_data_frame.loc[result_data_frame[percent_column_name] < threshold, name_column] = 'others';\n", - "\n", - " # Group external module name (foremost the new \"others\" entries) and sum their 
percentage\n", - " result_data_frame = result_data_frame.groupby(name_column)[percent_column_name].sum();\n", - "\n", - " # Sort by values descending\n", - " return result_data_frame.sort_values(ascending=False);" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "99ef3fad", + "id": "243f2908", "metadata": {}, "outputs": [], "source": [ @@ -226,33 +336,42 @@ " value_column='numberOfExternalCallerElements',\n", " name_column='externalModuleName',\n", " threshold= 0.7\n", - ");" + ");\n", + "plot_pie_chart(\n", + " input_data_frame=external_module_by_internal_element_usage_significant,\n", + " title='Top external module usage [%] by internal elements (more than 0.7% overall)'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "d59d7924", + "metadata": {}, + "source": [ + "#### Table 1 Chart 1b - Most called external modules in % by internal elements (less than 0.7% overall \"others\" drill-down)\n", + "\n", + "Shows the lowest (less than 0.7% overall) most called external modules. Therefore, this plot breaks down the \"others\" slice of the pie chart above. Values under 0.3% from that will be grouped into \"others\" to get a cleaner plot." 
] }, { "cell_type": "code", "execution_count": null, - "id": "688b6d56", + "id": "81be06d7", "metadata": {}, "outputs": [], "source": [ - "if external_module_by_internal_element_usage_significant.empty:\n", - " print(\"No data to plot\")\n", - "else:\n", - " plot.figure();\n", + "external_module_by_internal_element_usage_drill_down_others=filter_values_below_threshold(external_module_usage, 'numberOfExternalCallerElements', 0.7)\n", "\n", - " axis = external_module_by_internal_element_usage_significant.plot(\n", - " kind='pie',\n", - " title='Top external module usage [%] by internal elements',\n", - " legend=True,\n", - " labeldistance=None,\n", - " autopct='%1.2f%%',\n", - " textprops={'fontsize': 5},\n", - " pctdistance=1.2,\n", - " cmap=main_color_map\n", - " )\n", - " axis.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", - " plot.show()" + "external_module_by_internal_element_usage_significant_drill_down_others = group_to_others_below_threshold(\n", + " data_frame=external_module_by_internal_element_usage_drill_down_others,\n", + " value_column='numberOfExternalCallerElements',\n", + " name_column='externalModuleName',\n", + " threshold= 0.3\n", + ")\n", + "plot_pie_chart(\n", + " input_data_frame=external_module_by_internal_element_usage_significant_drill_down_others,\n", + " title='Top external module usage [%] by internal elements (less than 0.7% overall \"others\" drill-down)'\n", + ")" ] }, { @@ -260,7 +379,7 @@ "id": "84c123dc", "metadata": {}, "source": [ - "#### Table 1 Chart 2 - Most called external modules in % by internal modules\n", + "#### Table 1 Chart 2a - Most called external modules in % by internal modules (more than 0.7% overall)\n", "\n", "External modules that are used less than 0.7% are grouped into \"others\" to get a cleaner chart\n", "containing the most significant external modules and how ofter they are called by internal modules in percent." 
@@ -269,7 +388,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c10499a5", + "id": "c165f403", "metadata": {}, "outputs": [], "source": [ @@ -278,33 +397,42 @@ " value_column='numberOfExternalCallerModules',\n", " name_column='externalModuleName',\n", " threshold= 0.7\n", - ");" + ");\n", + "plot_pie_chart(\n", + " input_data_frame=external_module_used_by_internal_modules_significant,\n", + " title='Top external module usage [%] by internal modules (more than 0.7% overall)'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "a5a33ad1", + "metadata": {}, + "source": [ + "#### Table 1 Chart 2b - Most called external modules in % by internal modules (less than 0.7% overall \"others\" drill-down)\n", + "\n", + "Shows the lowest (less than 0.7% overall) most called external modules. Therefore, this plot breaks down the \"others\" slice of the pie chart above. Values under 0.3% from that will be grouped into \"others\" to get a cleaner plot." ] }, { "cell_type": "code", "execution_count": null, - "id": "c165f403", + "id": "72104f22", "metadata": {}, "outputs": [], "source": [ - "if external_module_used_by_internal_modules_significant.empty:\n", - " print(\"No data to plot\")\n", - "else:\n", - " plot.figure();\n", + "external_module_used_by_internal_modules_drill_down_others=filter_values_below_threshold(external_module_usage, 'numberOfExternalCallerModules', 0.7)\n", "\n", - " axis = external_module_used_by_internal_modules_significant.plot(\n", - " kind='pie',\n", - " title='Top external module usage [%] by internal modules',\n", - " legend=True,\n", - " labeldistance=None,\n", - " autopct='%1.2f%%',\n", - " textprops={'fontsize': 5},\n", - " pctdistance=1.2,\n", - " cmap=main_color_map\n", - " )\n", - " axis.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", - " plot.show()" + "external_module_used_by_internal_modules_significant_drill_down_others = group_to_others_below_threshold(\n", + " 
data_frame=external_module_used_by_internal_modules_drill_down_others,\n", + " value_column='numberOfExternalCallerModules',\n", + " name_column='externalModuleName',\n", + " threshold= 0.3\n", + ")\n", + "plot_pie_chart(\n", + " input_data_frame=external_module_used_by_internal_modules_significant_drill_down_others,\n", + " title='Top external module usage [%] by internal modules (less than 0.7% overall \"others\" drill-down)'\n", + ")" ] }, { @@ -348,7 +476,7 @@ "id": "279932a6", "metadata": {}, "source": [ - "#### Table 2 Chart 1 - Most called external namespaces in % by internal element\n", + "#### Table 2 Chart 1a - Most called external namespaces in % by internal element (more than 0.7% overall)\n", "\n", "External namespaces that are used less than 0.7% are grouped into \"others\" to get a cleaner chart\n", "containing the most significant external namespaces and how ofter they are called by internal elements in percent." @@ -367,24 +495,41 @@ " name_column='externalNamespaceName',\n", " threshold= 0.7\n", ");\n", + "plot_pie_chart(\n", + " input_data_frame=external_namespace_use_by_internal_elements_significantly,\n", + " title='Top external namespace usage [%] by internal elements (more than 0.7% overall)'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "1372eff1", + "metadata": {}, + "source": [ + "#### Table 2 Chart 1b - Most called external namespaces in % by internal element (less than 0.7% overall \"others\" drill-down)\n", "\n", - "if external_namespace_use_by_internal_elements_significantly.empty:\n", - " print(\"No data to plot\")\n", - "else:\n", - " plot.figure();\n", + "Shows the lowest (less than 0.7% overall) most called external namespaces. Therefore, this plot breaks down the \"others\" slice of the pie chart above. Values under 0.3% from that will be grouped into \"others\" to get a cleaner plot." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ef25c8e", + "metadata": {}, + "outputs": [], + "source": [ + "external_namespace_use_by_internal_elements_drill_down_others=filter_values_below_threshold(external_namespace_usage, 'numberOfExternalCallerElements', 0.7)\n", "\n", - " axis = external_namespace_use_by_internal_elements_significantly.plot(\n", - " kind='pie',\n", - " title='Top external namespace usage [%] by internal elements',\n", - " legend=True,\n", - " labeldistance=None,\n", - " autopct='%1.2f%%',\n", - " textprops={'fontsize': 5},\n", - " pctdistance=1.2,\n", - " cmap=main_color_map\n", - " )\n", - " axis.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", - " plot.show()" + "external_namespace_use_by_internal_elements_significant_drill_down_others = group_to_others_below_threshold(\n", + " data_frame=external_namespace_use_by_internal_elements_drill_down_others,\n", + " value_column='numberOfExternalCallerElements',\n", + " name_column='externalNamespaceName',\n", + " threshold= 0.3\n", + ")\n", + "plot_pie_chart(\n", + " input_data_frame=external_namespace_use_by_internal_elements_significant_drill_down_others,\n", + " title='Top external namespace usage [%] by internal elements (less than 0.7% overall \"others\" drill-down)'\n", + ")" ] }, { @@ -392,7 +537,7 @@ "id": "43c3e1a3", "metadata": {}, "source": [ - "#### Table 2 Chart 2 - Most called external namespaces in % by internal modules\n", + "#### Table 2 Chart 2a - Most called external namespaces in % by internal modules (more than 0.7% overall)\n", "\n", "External namespaces that are used less than 0.7% are grouped into \"others\" to get a cleaner chart\n", "containing the most significant external namespaces and how ofter they are called by internal modules in percent." 
@@ -411,23 +556,41 @@ " name_column='externalNamespaceName',\n", " threshold= 0.7\n", ");\n", + "plot_pie_chart(\n", + " input_data_frame=external_namespace_used_by_internal_modules_significantly,\n", + " title='Top external namespace usage [%] by internal modules (more than 0.7% overall)'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "665ba73f", + "metadata": {}, + "source": [ + "#### Table 2 Chart 2b - Most called external namespaces in % by internal modules (less than 0.7% overall \"others\" drill-down)\n", "\n", - "if external_namespace_used_by_internal_modules_significantly.empty:\n", - " print(\"No data to plot\")\n", - "else:\n", - " plot.figure();\n", - " axis = external_namespace_used_by_internal_modules_significantly.plot(\n", - " kind='pie',\n", - " title='Top external namespace usage [%] by internal modules',\n", - " legend=True,\n", - " labeldistance=None,\n", - " autopct='%1.2f%%',\n", - " textprops={'fontsize': 5},\n", - " pctdistance=1.2,\n", - " cmap=main_color_map\n", - " )\n", - " axis.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", - " plot.show()" + "Shows the lowest (less than 0.7% overall) most called external namespaces. Therefore, this plot breaks down the \"others\" slice of the pie chart above. Values under 0.3% from that will be grouped into \"others\" to get a cleaner plot." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8107fd74", + "metadata": {}, + "outputs": [], + "source": [ + "external_namespace_used_by_internal_modules_drill_down_others=filter_values_below_threshold(external_namespace_usage, 'numberOfExternalCallerModules', 0.7)\n", + "\n", + "external_namespace_used_by_internal_modules_significant_drill_down_others = group_to_others_below_threshold(\n", + " data_frame=external_namespace_used_by_internal_modules_drill_down_others,\n", + " value_column='numberOfExternalCallerModules',\n", + " name_column='externalNamespaceName',\n", + " threshold= 0.3\n", + ")\n", + "plot_pie_chart(\n", + " input_data_frame=external_namespace_used_by_internal_modules_significant_drill_down_others,\n", + " title='Top external namespace usage [%] by internal modules (less than 0.7% overall \"others\" drill-down)'\n", + ")" ] }, { @@ -562,9 +725,9 @@ "id": "b210eea0", "metadata": {}, "source": [ - "#### Table 3 Chart 1 - Most widely spread external packages in % by types\n", + "#### Table 3 Chart 1a - Most widely spread external module in % by internal elements (more than 0.5% overall)\n", "\n", - "External packages that are used less than 0.5% are grouped into the name \"others\" to get a cleaner chart with the most significant external packages." + "External modules that are used less than 0.5% are grouped into the name \"others\" to get a cleaner chart with the most significant external module." 
] }, { @@ -580,23 +743,41 @@ " name_column='externalModuleName',\n", " threshold= 0.5\n", ");\n", + "plot_pie_chart(\n", + " input_data_frame=external_module_by_internal_element_usage_spread_significant,\n", + " title='Top external module usage spread [%] by internal elements (more than 0.5% overall)'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "02b7f814", + "metadata": {}, + "source": [ + "#### Table 3 Chart 1b - Most widely spread external modules in % by internal elements (less than 0.5% overall \"others\" drill-down)\n", "\n", - "if external_module_by_internal_element_usage_spread_significant.empty:\n", - " print(\"No data to plot\")\n", - "else:\n", - " plot.figure();\n", - " axis = external_module_by_internal_element_usage_spread_significant.plot(\n", - " kind='pie',\n", - " title='Top external module usage spread [%] by internal elements',\n", - " legend=True,\n", - " labeldistance=None,\n", - " autopct='%1.2f%%',\n", - " textprops={'fontsize': 5},\n", - " pctdistance=1.2,\n", - " cmap=main_color_map\n", - " )\n", - " axis.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", - " plot.show()" + "Shows the lowest (less than 0.5% overall) most widely spread external modules. Therefore, this plot breaks down the \"others\" slice of the pie chart above. Values under 0.3% from that will be grouped into \"others\" to get a cleaner plot."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a63c9a4", + "metadata": {}, + "outputs": [], + "source": [ + "external_module_by_internal_element_usage_spread_drill_down_others=filter_values_below_threshold(external_module_usage_spread, 'sumNumberOfInternalElements', 0.5)\n", + "\n", + "external_module_by_internal_element_usage_spread_significant_drill_down_others = group_to_others_below_threshold(\n", + " data_frame=external_module_by_internal_element_usage_spread_drill_down_others,\n", + " value_column='sumNumberOfInternalElements',\n", + " name_column='externalModuleName',\n", + " threshold= 0.3\n", + ")\n", + "plot_pie_chart(\n", + " input_data_frame=external_module_by_internal_element_usage_spread_significant_drill_down_others,\n", + " title='Top external module usage spread [%] by internal elements (less than 0.7% overall \"others\" drill-down)'\n", + ")" ] }, { @@ -604,7 +785,7 @@ "id": "c48740e3", "metadata": {}, "source": [ - "#### Table 3 Chart 2 - Most widely spread external modules in % by internal modules\n", + "#### Table 3 Chart 2a - Most widely spread external modules in % by internal modules (more than 0.5% overall)\n", "\n", "External modules that are used less than 0.5% are grouped into \"others\" to get a cleaner chart containing the most significant external modules." 
] @@ -622,23 +803,41 @@ " name_column='externalModuleName',\n", " threshold= 0.5\n", ");\n", + "plot_pie_chart(\n", + " input_data_frame=external_modules_used_by_internal_modules_spread_significant,\n", + " title='Top external module usage spread [%] by internal modules (more than 0.5% overall)'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "4aa7569b", + "metadata": {}, + "source": [ + "#### Table 3 Chart 2b - Most widely spread external modules in % by internal modules (less than 0.5% overall \"others\" drill-down)\n", "\n", - "if external_modules_used_by_internal_modules_spread_significant.empty:\n", - " print(\"No data to plot\")\n", - "else:\n", - " plot.figure();\n", - " axis = external_modules_used_by_internal_modules_spread_significant.plot(\n", - " kind='pie',\n", - " title='Top external module usage spread [%] by internal modules',\n", - " legend=True,\n", - " labeldistance=None,\n", - " autopct='%1.2f%%',\n", - " textprops={'fontsize': 5},\n", - " pctdistance=1.2,\n", - " cmap=main_color_map\n", - " )\n", - " axis.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", - " plot.show()" + "Shows the lowest (less than 0.5% overall) most widely spread external modules. Therefore, this plot breaks down the \"others\" slice of the pie chart above. Values under 0.3% from that will be grouped into \"others\" to get a cleaner plot." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ce91cf2", + "metadata": {}, + "outputs": [], + "source": [ + "external_modules_used_by_internal_modules_spread_drill_down_others=filter_values_below_threshold(external_module_usage_spread, 'numberOfInternalModules', 0.5)\n", + "\n", + "external_modules_used_by_internal_modules_spread_significant_drill_down_others = group_to_others_below_threshold(\n", + " data_frame=external_modules_used_by_internal_modules_spread_drill_down_others,\n", + " value_column='numberOfInternalModules',\n", + " name_column='externalModuleName',\n", + " threshold= 0.3\n", + ")\n", + "plot_pie_chart(\n", + " input_data_frame=external_modules_used_by_internal_modules_spread_significant_drill_down_others,\n", + " title='Top external module usage spread [%] by internal modules (less than 0.7% overall \"others\" drill-down)'\n", + ")" ] }, { @@ -715,7 +914,7 @@ "id": "04840973", "metadata": {}, "source": [ - "#### Table 4 Chart 1 - Most widely spread external namespaces in % by internal element\n", + "#### Table 4 Chart 1a - Most widely spread external namespaces in % by internal element (less than 0.5% overall)\n", "\n", "External namespaces that are used less than 0.5% are grouped into \"others\" to get a cleaner chart\n", "containing the most significant external namespaces and how ofter they are called in percent." 
@@ -734,24 +933,41 @@ " name_column='externalModuleNamespace',\n", " threshold= 0.5\n", ");\n", + "plot_pie_chart(\n", + " input_data_frame=external_namespace_usage_significant,\n", + " title='Top external namespace usage spread [%] by internal elements (more than 0.5% overall)'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f9d4eadb", + "metadata": {}, + "source": [ + "#### Table 4 Chart 1b - Most widely spread external namespaces in % by internal element (less than 0.5% overall \"others\" drill-down)\n", "\n", - "if external_namespace_usage_significant.empty:\n", - " print(\"No data to plot\")\n", - "else:\n", - " plot.figure();\n", + "Shows the lowest (less than 0.5% overall) most widely spread external namespaces. Therefore, this plot breaks down the \"others\" slice of the pie chart above. Values under 0.3% from that will be grouped into \"others\" to get a cleaner plot." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3cdd53bf", + "metadata": {}, + "outputs": [], + "source": [ + "external_namespace_usage_spread_drill_down_others=filter_values_below_threshold(external_namespace_usage_spread, 'sumNumberOfInternalElements', 0.5)\n", "\n", - " axis = external_namespace_usage_significant.plot(\n", - " kind='pie',\n", - " title='Top external namespace usage spread [%] by internal elements',\n", - " legend=True,\n", - " labeldistance=None,\n", - " autopct='%1.2f%%',\n", - " textprops={'fontsize': 5},\n", - " pctdistance=1.2,\n", - " cmap=main_color_map\n", - " )\n", - " axis.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", - " plot.show()" + "external_namespace_usage_spread_significant_drill_down_others = group_to_others_below_threshold(\n", + " data_frame=external_namespace_usage_spread_drill_down_others,\n", + " value_column='sumNumberOfInternalElements',\n", + " name_column='externalModuleNamespace',\n", + " threshold= 0.3\n", + ")\n", + "plot_pie_chart(\n", + "
input_data_frame=external_namespace_usage_spread_significant_drill_down_others,\n", + " title='Top external namespace usage spread [%] by internal elements (less than 0.5% overall \"others\" drill-down)'\n", + ")" ] }, { @@ -759,7 +975,7 @@ "id": "fb678b02", "metadata": {}, "source": [ - "#### Table 4 Chart 2 - Most widely spread external namespace in % by internal modules\n", + "#### Table 4 Chart 2a - Most widely spread external namespace in % by internal modules (more than 0.5% overall)\n", "\n", "External namespaces that are used less than 0.5% are grouped into \"others\" to get a cleaner chart\n", "containing the most significant external namespaces and how ofter they are called in percent." @@ -778,24 +994,41 @@ " name_column='externalModuleNamespace',\n", " threshold= 0.5\n", ");\n", + "plot_pie_chart(\n", + " input_data_frame=external_namespace_internal_module_usage_spread_significant,\n", + " title='Top external namespace usage spread [%] by internal modules (more than 0.5% overall)'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "e766c689", + "metadata": {}, + "source": [ + "#### Table 4 Chart 2b - Most widely spread external namespace in % by internal modules (less than 0.5% overall \"others\" drill-down)\n", "\n", - "if external_namespace_internal_module_usage_spread_significant.empty:\n", - " print(\"No data to plot\")\n", - "else:\n", - " plot.figure();\n", + "Shows the lowest (less than 0.5% overall) most widely spread external namespaces. Therefore, this plot breaks down the \"others\" slice of the pie chart above. Values under 0.3% from that will be grouped into \"others\" to get a cleaner plot."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99dc8d15", + "metadata": {}, + "outputs": [], + "source": [ + "external_namespace_internal_module_usage_spread_drill_down_others=filter_values_below_threshold(external_namespace_usage_spread, 'numberOfInternalModules', 0.5)\n", "\n", - " axis = external_namespace_internal_module_usage_spread_significant.plot(\n", - " kind='pie',\n", - " title='Top external namespace usage spread [%] by internal modules',\n", - " legend=True,\n", - " labeldistance=None,\n", - " autopct='%1.2f%%',\n", - " textprops={'fontsize': 5},\n", - " pctdistance=1.2,\n", - " cmap=main_color_map\n", - " )\n", - " axis.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", - " plot.show()" + "external_namespace_internal_module_usage_spread_significant_drill_down_others = group_to_others_below_threshold(\n", + " data_frame=external_namespace_internal_module_usage_spread_drill_down_others,\n", + " value_column='numberOfInternalModules',\n", + " name_column='externalModuleNamespace',\n", + " threshold= 0.3\n", + ")\n", + "plot_pie_chart(\n", + " input_data_frame=external_namespace_internal_module_usage_spread_significant_drill_down_others,\n", + " title='Top external namespace usage spread [%] by internal modules (less than 0.7% overall \"others\" drill-down)'\n", + ")" ] }, { @@ -803,7 +1036,7 @@ "id": "14e4da22", "metadata": {}, "source": [ - "#### Table 4 Chart 3 - External namespaces with the most used declarations in %\n", + "#### Table 4 Chart 3a - External namespaces with the most used declarations in % (more than 0.5% overall)\n", "\n", "External namespaces that are used less than 0.5% are grouped into \"others\" to get a cleaner chart\n", "containing the most significant external namespaces and how ofter they are called in percent." 
@@ -822,24 +1055,41 @@ " name_column='externalModuleNamespace',\n", " threshold= 0.5\n", ");\n", + "plot_pie_chart(\n", + " input_data_frame=external_namespace_declaration_usage_significant,\n", + " title='Top external namespace declaration usage [%] (more than 0.5% overall)'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "1b527d90", + "metadata": {}, + "source": [ + "#### Table 4 Chart 3b - External namespaces with the most used declarations in % (less than 0.5% overall \"others\" drill-down)\n", "\n", - "if external_namespace_declaration_usage_significant.empty:\n", - " print(\"No data to plot\")\n", - "else:\n", - " plot.figure();\n", + "Shows the lowest (less than 0.5% overall) external namespaces with the most used declarations. Therefore, this plot breaks down the \"others\" slice of the pie chart above. Values under 0.3% from that will be grouped into \"others\" to get a cleaner plot." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f3008e7", + "metadata": {}, + "outputs": [], + "source": [ + "external_namespace_declaration_usage_drill_down_others=filter_values_below_threshold(external_namespace_usage_spread, 'sumNumberOfUsedExternalDeclarations', 0.5)\n", "\n", - " axis = external_namespace_declaration_usage_significant.plot(\n", - " kind='pie',\n", - " title='Top external namespace declaration usage [%]',\n", - " legend=True,\n", - " labeldistance=None,\n", - " autopct='%1.2f%%',\n", - " textprops={'fontsize': 5},\n", - " pctdistance=1.2,\n", - " cmap=main_color_map\n", - " )\n", - " axis.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", - " plot.show()" + "external_namespace_declaration_usage_significant_drill_down_others = group_to_others_below_threshold(\n", + " data_frame=external_namespace_declaration_usage_drill_down_others,\n", + " value_column='sumNumberOfUsedExternalDeclarations',\n", + " name_column='externalModuleNamespace',\n", + " threshold= 0.3\n", + ")\n", + "plot_pie_chart(\n", + " 
input_data_frame=external_namespace_declaration_usage_significant_drill_down_others,\n", + " title='Top external namespace declaration usage (less than 0.5% overall \"others\" drill-down)'\n", + ")" ] }, { @@ -1151,14 +1401,6 @@ "external_package_usage_per_artifact_distribution_truncated" ] }, - { - "cell_type": "markdown", - "id": "f90caf44", - "metadata": {}, - "source": [ - "# TODO" - ] - }, { "cell_type": "markdown", "id": "055e5a36", @@ -1279,8 +1521,16 @@ " ,size=6\n", " ,bbox=label_box\n", " ,arrowprops=dict(arrowstyle=\"-|>\", mutation_scale=10, color=\"black\")\n", - " )\n", - "\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66811915", + "metadata": {}, + "outputs": [], + "source": [ "def index_of_sorted(data_frame: pd.DataFrame, highest: list[str] = []):\n", " \"\"\"\n", " Sorts the \"data_frame\" by columns 'numberOfExternalModules','maxNumberOfInternalElementsPercentage','internalModuleElementsCount', 'internalModuleName'\n", @@ -1404,7 +1654,7 @@ "pygments_lexer": "ipython3", "version": "3.11.9" }, - "title": "Object Oriented Design Quality Metrics for Java with Neo4j" + "title": "External Dependencies for Typescript" }, "nbformat": 4, "nbformat_minor": 5 diff --git a/jupyter/OverviewGeneral.ipynb b/jupyter/OverviewGeneral.ipynb index 92f8437e9..92657b2c4 100644 --- a/jupyter/OverviewGeneral.ipynb +++ b/jupyter/OverviewGeneral.ipynb @@ -71,6 +71,115 @@ " return pd.DataFrame([r.values() for r in records], columns=keys)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "1cff8d0e", + "metadata": {}, + "outputs": [], + "source": [ + "def group_to_others_below_threshold(data_frame : pd.DataFrame, value_column : str, name_column: str, threshold: float) -> pd.DataFrame: \n", + " \"\"\"\n", + " Adds a new percentage column for the value column and \n", + " groups all values below the given threshold to \"others\" in the name column.\n", + "\n", + " Parameters:\n", + " - data_frame (pd.DataFrame): Input
pandas DataFrame\n", + " - value_column (str): Name of the column that contains the numeric value\n", + " - name_column (str): Name of the column that contains the group name that will be replaced by \"others\" for small values\n", + " - threshold (float): Threshold in % that is used to group values below it into the \"others\" group\n", + "\n", + " Returns:\n", + " pd.DataFrame: Grouped by the name column (index) with the summed values and percentages, sorted by percentage descending\n", + "\n", + " \"\"\"\n", + " result_data_frame = data_frame[[name_column, value_column]].copy();\n", + "\n", + " percent_column_name = value_column + 'Percent';\n", + "\n", + " # Add column with the name given in \"percent_column_name\" with the percentage of the value column.\n", + " result_data_frame[percent_column_name] = result_data_frame[value_column] / result_data_frame[value_column].sum() * 100.0;\n", + "\n", + " # Convert name column to string values if it wasn't of that type before\n", + " result_data_frame[name_column] = result_data_frame[name_column].astype(str)\n", + "\n", + " # Change the group name to \"others\" if it is called less than the specified threshold\n", + " result_data_frame.loc[result_data_frame[percent_column_name] < threshold, name_column] = 'others';\n", + "\n", + " # Group by name column (foremost the new \"others\" entries) and sum their percentage\n", + " #result_data_frame = result_data_frame.groupby(name_column)[percent_column_name].sum();\n", + " result_data_frame = result_data_frame.groupby(name_column).sum();\n", + " # Sort by values descending\n", + " #return result_data_frame.sort_values(ascending=False).to_frame();\n", + " return result_data_frame.sort_values(by=percent_column_name, ascending=False);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47d035a0", + "metadata": {}, + "outputs": [], + "source": [ + "def explode_index_value(input: pd.DataFrame, index_value_to_emphasize: str = 'others', base_value: float = 0.02, emphasize_value: float = 0.2):\n", + " \"\"\"\n", + " \"Explode\" offsets slices in a pie chart
plot by a given value.\n", + " The specified index value will be emphasized with a larger value to make it stand out in the pie chart plot.\n", + "\n", + " Parameters:\n", + " - input (pd.DataFrame): Input pandas DataFrame with the data that will be plotted. (Required)\n", + " - index_value_to_emphasize (str): Value of the index that will be emphasized. (Default= 'others')\n", + " - base_value (float): Base value for all pies in the chart. (Default=0.02)\n", + " - emphasize_value (float): Value for the emphasized pie in the chart. (Default=0.2)\n", + "\n", + " Returns:\n", + " Array with the same size as the number of rows/pies to plot containing the \"explode\" value for each of them\n", + "\n", + " \"\"\"\n", + " # Each entry in the list corresponds to an x value\n", + " # The comparison with the index_value_to_emphasize produces an array of booleans where nth entry with the emphasized value is \"true\"\n", + " # Multiplying it leads to 1 for True and 0 for False therefore \"exploding\" the emphasized entry whilst all other entries only get the base value\n", + " return (input.index == index_value_to_emphasize) * emphasize_value + base_value " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae2e12bd", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_pie_chart(input_data_frame: pd.DataFrame, title: str):\n", + " if input_data_frame.empty:\n", + " print(\"No data to plot for title '\" + title + \"'.\")\n", + " return\n", + "\n", + " plot.figure();\n", + " \n", + " name_of_the_first_column_containing_the_values=input_data_frame.columns[0]\n", + " total_sum = input_data_frame[name_of_the_first_column_containing_the_values].sum()\n", + " \n", + " def custom_auto_percentage_format(percentage):\n", + " return '{:1.2f}% ({:.0f})'.format(percentage, total_sum * percentage / 100.0)\n", + "\n", + " axis = input_data_frame.plot(\n", + " kind='pie',\n", + " y=name_of_the_first_column_containing_the_values + 'Percent',\n", + " ylabel='',\n", + " legend=True,\n", + "
labeldistance=None,\n", + " autopct=custom_auto_percentage_format,\n", + " textprops={'fontsize': 6},\n", + " pctdistance=1.15,\n", + " cmap=main_color_map,\n", + " figsize=(9,9),\n", + " explode=explode_index_value(input_data_frame, index_value_to_emphasize='others')\n", + " )\n", + " plot.title(title, pad=15)\n", + " axis.legend(bbox_to_anchor=(1.08, 1), loc='upper left')\n", + " plot.show()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -112,51 +221,6 @@ "main_color_map = 'nipy_spectral'" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "a0b83a8a", - "metadata": {}, - "outputs": [], - "source": [ - "def group_to_others_below_threshold(data_frame : pd.DataFrame, value_column : str, name_column: str, threshold: float) -> pd.DataFrame: \n", - " \"\"\"Adds a new percentage column for the value column and \n", - " groups all values below the given threshold to \"others\" in the name column.\n", - "\n", - " Parameters:\n", - " - data_frame (pd.DataFrame): Input pandas DataFrame\n", - " - value_column (str): Name of the column that contains the numeric value\n", - " - name_column (str): Name of the column that contains the group name that will be replaced by \"others\" for small values\n", - " - threshold (float): Threshold in % that is used to group values below it into the \"others\" group\n", - "\n", - " Returns:\n", - " int:Returning value\n", - "\n", - " \"\"\"\n", - " result_data_frame = data_frame.copy();\n", - "\n", - " percent_column_name = value_column + 'Percent';\n", - "\n", - " percent_column_already_exists = percent_column_name in result_data_frame.columns\n", - "\n", - " if not percent_column_already_exists:\n", - " # Add column with the name given in \"percent_column_name\" with the percentage of the value column.\n", - " result_data_frame[percent_column_name] = result_data_frame[value_column] / result_data_frame[value_column].sum() * 100.0;\n", - "\n", - " # Convert name column to string values if it wasn't of that type 
before\n", - " result_data_frame[name_column] = result_data_frame[name_column].astype(str)\n", - "\n", - " # Change the group name to \"others\" if it is called less than the specified threshold\n", - " result_data_frame.loc[result_data_frame[percent_column_name] < threshold, name_column] = 'others';\n", - "\n", - " # Group by name column (foremost the new \"others\" entries) and sum their percentage\n", - " #result_data_frame = result_data_frame.groupby(name_column)[percent_column_name].sum();\n", - " result_data_frame = result_data_frame.groupby(name_column).sum();\n", - " # Sort by values descending\n", - " #return result_data_frame.sort_values(ascending=False).to_frame();\n", - " return result_data_frame.sort_values(by=percent_column_name, ascending=False);" - ] - }, { "cell_type": "markdown", "id": "0c68aa20", @@ -186,9 +250,6 @@ "outputs": [], "source": [ "node_count_by_label_combination = query_cypher_to_data_frame(\"../cypher/Overview/Node_label_combination_count.cypher\")\n", - "total_number_of_nodes = node_count_by_label_combination['nodesWithThatLabels'].sum()\n", - "print(\"Total number of nodes:\", total_number_of_nodes)\n", - "\n", "node_count_by_label_combination.head(30)" ] }, @@ -215,38 +276,10 @@ " name_column='nodeLabels',\n", " threshold= 0.5\n", ");\n", - "\n", - "if node_count_by_label_combination_significant.empty:\n", - " print(\"No data to plot\")\n", - "else:\n", - " plot.figure();\n", - " \n", - " # Explode offsets slices of the pie chart by the given value\n", - " # Each entry in the list corresponds to an x value\n", - " # The comparison with \"others\" produces an array of booleans where nth entry with the \"others\" value is \"true\"\n", - " # Multiplying it leads to 1 for True and 0 for False therefore \"exploding\" the \"others\" entry a bit more than the rest\n", - " explode=(node_count_by_label_combination_significant.index == 'others') * 0.2 + 0.02\n", - " \n", - " def custom_auto_percentage_format(percentage):\n", - " return 
'{:1.2f}% ({:.0f})'.format(percentage, total_number_of_nodes*percentage/100)\n", - "\n", - " axis = node_count_by_label_combination_significant.plot(\n", - " kind='pie',\n", - " title='Nodes per label combination (more than 0.5% overall)',\n", - " legend=True,\n", - " labeldistance=None,\n", - " autopct=custom_auto_percentage_format,\n", - " textprops={'fontsize': 6},\n", - " pctdistance=1.15,\n", - " cmap=main_color_map,\n", - " figsize=(9,9),\n", - " x='nodeLabels',\n", - " y='nodesWithThatLabelsPercent',\n", - " ylabel='',\n", - " explode=explode\n", - " )\n", - " axis.legend(bbox_to_anchor=(1.08, 1), loc='upper left', )\n", - " plot.show()" + "plot_pie_chart(\n", + " input_data_frame=node_count_by_label_combination_significant,\n", + " title='Nodes per label combination (more than 0.5% overall)'\n", + ")" ] }, { @@ -295,40 +328,12 @@ " data_frame=node_count_by_label_combination_lowest_first,\n", " value_column='nodesWithThatLabels',\n", " name_column='nodeLabels',\n", - " threshold= 0.01\n", + " threshold= 0.3\n", ");\n", - "\n", - "if node_count_by_label_combination_lowest_first_significant.empty:\n", - " print(\"No data to plot\")\n", - "else:\n", - " plot.figure();\n", - " \n", - " # Explode offsets slices of the pie chart by the given value\n", - " # Each entry in the list corresponds to an x value\n", - " # The comparison with \"others\" produces an array of booleans where nth entry with the \"others\" value is \"true\"\n", - " # Multiplying it leads to 1 for True and 0 for False therefore \"exploding\" the \"others\" entry a bit more than the rest\n", - " explode=(node_count_by_label_combination_lowest_first_significant.index == 'others') * 0.2 + 0.02\n", - "\n", - " def custom_auto_percentage_format(percentage):\n", - " return '{:1.2f}% ({:.0f})'.format(percentage, total_number_of_nodes*percentage/100)\n", - "\n", - " axis = node_count_by_label_combination_lowest_first_significant.plot(\n", - " kind='pie',\n", - " title='Nodes per label combination (less 
than 0.5% overall)',\n", - " legend=True,\n", - " labeldistance=None,\n", - " autopct=custom_auto_percentage_format,\n", - " textprops={'fontsize': 6},\n", - " pctdistance=1.15,\n", - " cmap=main_color_map,\n", - " figsize=(9,9),\n", - " x='nodeLabels',\n", - " y='nodesWithThatLabelsPercent',\n", - " ylabel='',\n", - " explode=explode\n", - " )\n", - " axis.legend(bbox_to_anchor=(1.08, 1), loc='upper left')\n", - " plot.show()" + "plot_pie_chart(\n", + " input_data_frame=node_count_by_label_combination_lowest_first_significant,\n", + " title='Nodes per label combination (less than 0.5% overall)'\n", + ")" ] }, { @@ -485,38 +490,10 @@ " name_column='relationshipType',\n", " threshold= 0.5\n", ");\n", - "\n", - "if relationship_count_by_type_significant.empty:\n", - " print(\"No data to plot\")\n", - "else:\n", - " plot.figure();\n", - " \n", - " # Explode offsets slices of the pie chart by the given value\n", - " # Each entry in the list corresponds to an x value\n", - " # The comparison with \"others\" produces an array of booleans where nth entry with the \"others\" value is \"true\"\n", - " # Multiplying it leads to 1 for True and 0 for False therefore \"exploding\" the \"others\" entry a bit more than the rest\n", - " explode=(relationship_count_by_type_significant.index == 'others') * 0.2 + 0.02\n", - " \n", - " def custom_auto_percentage_format(percentage):\n", - " return '{:1.2f}% ({:.0f})'.format(percentage, total_number_of_relationships*percentage/100)\n", - "\n", - " axis = relationship_count_by_type_significant.plot(\n", - " kind='pie',\n", - " title='Relationship types (more than 0.5% overall)',\n", - " legend=True,\n", - " labeldistance=None,\n", - " autopct=custom_auto_percentage_format,\n", - " textprops={'fontsize': 6},\n", - " pctdistance=1.15,\n", - " cmap=main_color_map,\n", - " figsize=(9,9),\n", - " x='relationshipType',\n", - " y='nodesWithThatRelationshipTypePercent',\n", - " ylabel='',\n", - " explode=explode\n", - " )\n", - " 
axis.legend(bbox_to_anchor=(1.08, 1), loc='upper left', )\n", - " plot.show()" + "plot_pie_chart(\n", + " input_data_frame=relationship_count_by_type_significant,\n", + " title='Relationship types (more than 0.5% overall)'\n", + ")" ] }, { @@ -560,45 +537,16 @@ "metadata": {}, "outputs": [], "source": [ - "\n", "relationship_count_by_type_lowest_first_significant = group_to_others_below_threshold(\n", " data_frame=relationship_count_by_type_lowest_first,\n", " value_column='nodesWithThatRelationshipType',\n", " name_column='relationshipType',\n", - " threshold= 0.01\n", + " threshold= 0.3\n", ");\n", - "\n", - "if relationship_count_by_type_lowest_first_significant.empty:\n", - " print(\"No data to plot\")\n", - "else:\n", - " plot.figure();\n", - " \n", - " # Explode offsets slices of the pie chart by the given value\n", - " # Each entry in the list corresponds to an x value\n", - " # The comparison with \"others\" produces an array of booleans where nth entry with the \"others\" value is \"true\"\n", - " # Multiplying it leads to 1 for True and 0 for False therefore \"exploding\" the \"others\" entry a bit more than the rest\n", - " explode=(relationship_count_by_type_lowest_first_significant.index == 'others') * 0.2 + 0.02\n", - "\n", - " def custom_auto_percentage_format(percentage):\n", - " return '{:1.2f}% ({:.0f})'.format(percentage, total_number_of_relationships*percentage/100)\n", - "\n", - " axis = relationship_count_by_type_lowest_first_significant.plot(\n", - " kind='pie',\n", - " title='Relationship types (less than 0.5% overall)',\n", - " legend=True,\n", - " labeldistance=None,\n", - " autopct=custom_auto_percentage_format,\n", - " textprops={'fontsize': 6},\n", - " pctdistance=1.15,\n", - " cmap=main_color_map,\n", - " figsize=(9,9),\n", - " x='relationshipType',\n", - " y='nodesWithThatRelationshipTypePercent',\n", - " ylabel='',\n", - " explode=explode\n", - " )\n", - " axis.legend(bbox_to_anchor=(1.08, 1), loc='upper left')\n", - " plot.show()" + 
"plot_pie_chart(\n", + " input_data_frame=relationship_count_by_type_lowest_first_significant,\n", + " title='Relationship types (less than 0.5% overall)'\n", + ")" ] }, { @@ -644,12 +592,14 @@ "metadata": {}, "outputs": [], "source": [ + "total_number_of_nodes = node_count_by_label_combination['nodesWithThatLabels'].sum()\n", + "\n", "print(\"total_number_of_nodes (vertices):\", total_number_of_nodes)\n", "print(\"total_number_of_relationships (edges):\", total_number_of_relationships)\n", "\n", "total_directed_graph_density=total_number_of_relationships / (total_number_of_nodes * (total_number_of_nodes - 1))\n", "print(\"-> total directed graph density:\", total_directed_graph_density)\n", - "print(\"-> total directed graph density in percent:\", total_directed_graph_density * 100)\n" + "print(\"-> total directed graph density in percent:\", total_directed_graph_density * 100)" ] } ], diff --git a/jupyter/OverviewJava.ipynb b/jupyter/OverviewJava.ipynb index ca7438718..cd1db0c00 100644 --- a/jupyter/OverviewJava.ipynb +++ b/jupyter/OverviewJava.ipynb @@ -66,6 +66,115 @@ " return pd.DataFrame([r.values() for r in records], columns=keys)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "93e0055c", + "metadata": {}, + "outputs": [], + "source": [ + "def group_to_others_below_threshold(data_frame : pd.DataFrame, value_column : str, name_column: str, threshold: float) -> pd.DataFrame: \n", + " \"\"\"\n", + " Adds a new percentage column for the value column and \n", + " groups all values below the given threshold to \"others\" in the name column.\n", + "\n", + " Parameters:\n", + " - data_frame (pd.DataFrame): Input pandas DataFrame\n", + " - value_column (str): Name of the column that contains the numeric value\n", + " - name_column (str): Name of the column that contains the group name that will be replaced by \"others\" for small values\n", + " - threshold (float): Threshold in % that is used to group values below it into the \"others\" group\n", 
+ "\n", + " Returns:\n", + " pd.DataFrame: Values and percentages summed per name (index), sorted by percentage in descending order\n", + "\n", + " \"\"\"\n", + " result_data_frame = data_frame[[name_column, value_column]].copy();\n", + "\n", + " percent_column_name = value_column + 'Percent';\n", + "\n", + " # Add column with the name given in \"percent_column_name\" with the percentage of the value column.\n", + " result_data_frame[percent_column_name] = result_data_frame[value_column] / result_data_frame[value_column].sum() * 100.0;\n", + "\n", + " # Convert name column to string values if it wasn't of that type before\n", + " result_data_frame[name_column] = result_data_frame[name_column].astype(str)\n", + "\n", + " # Change the group name to \"others\" if its percentage is below the specified threshold\n", + " result_data_frame.loc[result_data_frame[percent_column_name] < threshold, name_column] = 'others';\n", + "\n", + " # Group by name column (foremost the new \"others\" entries) and sum their percentage\n", + " #result_data_frame = result_data_frame.groupby(name_column)[percent_column_name].sum();\n", + " result_data_frame = result_data_frame.groupby(name_column).sum();\n", + " # Sort by values descending\n", + " #return result_data_frame.sort_values(ascending=False).to_frame();\n", + " return result_data_frame.sort_values(by=percent_column_name, ascending=False);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43214838", + "metadata": {}, + "outputs": [], + "source": [ + "def explode_index_value(input: pd.DataFrame, index_value_to_emphasize: str = 'others', base_value: float = 0.02, emphasize_value: float = 0.2):\n", + " \"\"\"\n", + " \"Explode\" offsets slices in a pie chart plot by a given value.\n", + " The specified index value will be emphasized with a larger value to make it stand out in the pie chart plot.\n", + "\n", + " Parameters:\n", + " - input (pd.DataFrame): Input pandas DataFrame with the data that will be plotted.
(Required)\n", + " - index_value_to_emphasize (str): Value of the index that will be emphasized. (Default= 'others')\n", + " - base_value (float): Base value for all pies in the chart. (Default=0.02)\n", + " - emphasize_value (float): Value for the emphasized pie in the chart. (Default=0.2)\n", + "\n", + " Returns:\n", + " Array with the same size as the number of rows/pies to plot containing the \"explode\" value for each of them\n", + "\n", + " \"\"\"\n", + " # Each entry in the list corresponds to an x value\n", + " # The comparison with the index_value_to_emphasize produces an array of booleans where nth entry with the emphasized value is \"true\"\n", + " # Multiplying it leads to 1 for True and 0 for False therefore \"exploding\" the emphasized entry whilst all other entries only get the base value\n", + " return (input.index == index_value_to_emphasize) * emphasize_value + base_value " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96bfb823", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_pie_chart(input_data_frame: pd.DataFrame, title: str):\n", + " if input_data_frame.empty:\n", + " print(\"No data to plot for title '\" + title + \"'.\")\n", + " return\n", + "\n", + " plot.figure();\n", + " \n", + " name_of_the_first_column_containing_the_values=input_data_frame.columns[0]\n", + " total_sum = input_data_frame[name_of_the_first_column_containing_the_values].sum()\n", + " \n", + " def custom_auto_percentage_format(percentage):\n", + " return '{:1.2f}% ({:.0f})'.format(percentage, total_sum * percentage / 100.0)\n", + "\n", + " axis = input_data_frame.plot(\n", + " kind='pie',\n", + " y=name_of_the_first_column_containing_the_values + 'Percent',\n", + " ylabel='',\n", + " legend=True,\n", + " labeldistance=None,\n", + " autopct=custom_auto_percentage_format,\n", + " textprops={'fontsize': 6},\n", + " pctdistance=1.15,\n", + " cmap=main_color_map,\n", + " figsize=(9,9),\n", + " explode=explode_index_value(input_data_frame, index_value_to_emphasize='others')\n", +
" )\n", + " plot.title(title, pad=15)\n", + " axis.legend(bbox_to_anchor=(1.08, 1), loc='upper left')\n", + " plot.show()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -116,6 +225,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "8333d13e", "metadata": {}, @@ -414,44 +524,6 @@ "The following chat shows artifacts with the largest package count in percentage. Artifacts with less than 0.7% package count are grouped into \"others\" to focus on the most significant artifacts regarding their package count." ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "102710a5", - "metadata": {}, - "outputs": [], - "source": [ - "def group_to_others_below_threshold(data_frame : pd.DataFrame, value_column : str, name_column: str, threshold: float) -> pd.DataFrame: \n", - " \"\"\"Adds a new percentage column for the value column and \n", - " groups all values below the given threshold to \"others\" in the name column.\n", - "\n", - " Parameters:\n", - " - data_frame (pd.DataFrame): Input pandas DataFrame\n", - " - value_column (str): Name of the column that contains the numeric value\n", - " - name_column (str): Name of the column that contains the group name that will be replaced by \"others\" for small values\n", - " - threshold (float): Threshold in % that is used to group values below it into the \"others\" group\n", - "\n", - " Returns:\n", - " int:Returning value\n", - "\n", - " \"\"\"\n", - " result_data_frame = data_frame.copy();\n", - "\n", - " percent_column_name = value_column + 'Percent';\n", - "\n", - " # Add column with the name given in \"percent_column_name\" with the percentage of the value column.\n", - " result_data_frame[percent_column_name] = result_data_frame[value_column] / result_data_frame[value_column].sum() * 100.0;\n", - "\n", - " # Change the external package name to \"others\" if it is called less than the specified threshold\n", - " result_data_frame.loc[result_data_frame[percent_column_name] < threshold, 
name_column] = 'others';\n", - "\n", - " # Group external package name (foremost the new \"others\" entries) and sum their percentage\n", - " result_data_frame = result_data_frame.groupby(name_column)[percent_column_name].sum();\n", - "\n", - " # Sort by values descending\n", - " return result_data_frame.sort_values(ascending=False);" - ] - }, { "cell_type": "code", "execution_count": null, @@ -465,24 +537,10 @@ " name_column='artifactName',\n", " threshold= 0.7\n", ");\n", - "\n", - "if types_per_artifact_sorted_significant.empty:\n", - " print(\"No data to plot\")\n", - "else:\n", - " plot.figure();\n", - " types_per_artifact_sorted_significant.plot(\n", - " y='numberOfPackages', \n", - " kind='pie', \n", - " title='Number of packages per artifact', \n", - " legend=True,\n", - " labeldistance=None,\n", - " autopct='%1.2f%%',\n", - " textprops={'fontsize': 5},\n", - " pctdistance=1.2,\n", - " cmap=main_color_map\n", - " )\n", - " plot.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", - " plot.show()" + "plot_pie_chart(\n", + " input_data_frame=types_per_artifact_sorted_significant,\n", + " title='Number of packages per artifact'\n", + ")" ] } ], diff --git a/jupyter/PathFindingJava.ipynb b/jupyter/PathFindingJava.ipynb index c4045f826..7b8993b61 100644 --- a/jupyter/PathFindingJava.ipynb +++ b/jupyter/PathFindingJava.ipynb @@ -12,7 +12,7 @@ "\n", "Path algorithms in Graphs are famous for e.g. finding the fastest way from one place to another. How can these be applied to static code analysis and how can the results be interpreted?\n", "\n", - "One promising algorithm is [All Pairs Shortest Path](https://neo4j.com/docs/graph-data-science/current/algorithms/all-pairs-shortest-path). It shows dependencies from a different perspective and provides an overview on how directly or indirectly dependencies are connected to each other. 
The longest shortest path has an additional meaning: It is also known as the [**Graph Diameter**](https://mathworld.wolfram.com/GraphDiameter.html) and is very useful as a metric for the complexity of the Graph (or Subgraphs). The same applies to the longest path (for directed acyclic graphs) that can uncover long dependency chains.\n", + "One promising algorithm is [All Pairs Shortest Path](https://neo4j.com/docs/graph-data-science/current/algorithms/all-pairs-shortest-path). It shows dependencies from a different perspective and provides an overview on how directly or indirectly dependencies are connected to each other. The longest shortest path has an additional meaning: It is also known as the [**Graph Diameter**](https://mathworld.wolfram.com/GraphDiameter.html) and is very useful as a metric for the complexity of the Graph (or Subgraphs). The longest path (for directed acyclic graphs) can uncover the longest existing (worst case) dependency chains as long as there are no cycles in the Graph.\n", "\n", "
\n", "\n", @@ -100,7 +100,7 @@ }, { "cell_type": "code", - "execution_count": 309, + "execution_count": 64, "id": "d19447e1", "metadata": {}, "outputs": [], @@ -116,7 +116,7 @@ }, { "cell_type": "code", - "execution_count": 310, + "execution_count": 65, "id": "807bba03", "metadata": {}, "outputs": [], @@ -146,7 +146,7 @@ }, { "cell_type": "code", - "execution_count": 312, + "execution_count": 67, "id": "648e2a5a", "metadata": {}, "outputs": [], @@ -157,7 +157,7 @@ }, { "cell_type": "code", - "execution_count": 313, + "execution_count": 68, "id": "e49ca888", "metadata": {}, "outputs": [], @@ -168,7 +168,7 @@ }, { "cell_type": "code", - "execution_count": 314, + "execution_count": 69, "id": "1c5dab37", "metadata": {}, "outputs": [], @@ -183,7 +183,7 @@ }, { "cell_type": "code", - "execution_count": 315, + "execution_count": 70, "id": "c1db254b", "metadata": {}, "outputs": [], @@ -195,7 +195,7 @@ }, { "cell_type": "code", - "execution_count": 316, + "execution_count": 71, "id": "59310f6f", "metadata": {}, "outputs": [], @@ -207,7 +207,7 @@ }, { "cell_type": "code", - "execution_count": 317, + "execution_count": 72, "id": "7d2e62d6", "metadata": {}, "outputs": [], @@ -242,7 +242,7 @@ }, { "cell_type": "code", - "execution_count": 318, + "execution_count": 73, "id": "3f2e905c", "metadata": {}, "outputs": [], @@ -258,7 +258,7 @@ }, { "cell_type": "code", - "execution_count": 319, + "execution_count": 74, "id": "d2d60597", "metadata": {}, "outputs": [], @@ -291,7 +291,7 @@ }, { "cell_type": "code", - "execution_count": 320, + "execution_count": 75, "id": "5ef848fd", "metadata": {}, "outputs": [], @@ -310,37 +310,52 @@ }, { "cell_type": "code", - "execution_count": 344, - "id": "a9211397", + "execution_count": 76, + "id": "a1c433f7", "metadata": {}, "outputs": [], "source": [ - "def get_longest_path_for_each_source_project(data_frame: pd.DataFrame) -> pd.DataFrame:\n", + "def get_longest_path_for_column(column : str, data_frame: pd.DataFrame) -> pd.DataFrame:\n", " 
\"\"\"\n", - " Returns the DataFrame grouped by the source project (e.g. Java artifact or Typescript project) containing their max/longest distance.\n", + " Returns the DataFrame grouped by the given column containing their max/longest distance.\n", + " \n", + " column\n", + " ----------\n", + " Name of the column to group by. Example: \"sourceProject\".\n", + " \n", " data_frame\n", " ----------\n", - " Contains the path algorithm result including the columns \"isDifferentTargetProject\", \"distance\" and \"sourceProject\"\n", + " Contains the path algorithm result including the columns \"distance\" and the chosen column parameter.\n", " \"\"\"\n", " \n", - " return data_frame.groupby('sourceProject')['distance'].max().sort_values(ascending=False)" + " return data_frame.groupby(column)['distance'].max().sort_values(ascending=False)" ] }, { "cell_type": "code", - "execution_count": 322, + "execution_count": 77, "id": "036264ca", "metadata": {}, "outputs": [], "source": [ - "def get_distance_distribution_for_each_source_project(data_frame: pd.DataFrame) -> pd.DataFrame:\n", + "def get_distance_distribution_for_each(column : str, data_frame: pd.DataFrame) -> pd.DataFrame:\n", + " \"\"\"\n", + " Returns the transposed pivot of the DataFrame grouped by the given column.\n", + " \n", + " column\n", + " ----------\n", + " Name of the column to group by. Example: \"sourceProject\".\n", + " \n", + " data_frame\n", + " ----------\n", + " Contains the path algorithm result including the columns \"distance\", \"pairCount\" and the chosen column parameter.\n", + " \"\"\"\n", "\n", " data_frame = data_frame.copy()\n", "\n", " # Rows contain the source project (e.g.
Java artifact or Typescript project) and its the pair count.\n", " # The columns contain the distances.\n", - " data_frame = data_frame.pivot(index='distance', columns='sourceProject', values='pairCount')\n", + " data_frame = data_frame.pivot(index='distance', columns=column, values='pairCount')\n", "\n", " # Sort by column sum and then take only the first 40 columns\n", " sum_of_pair_count = data_frame.sum()\n", @@ -357,12 +372,12 @@ }, { "cell_type": "code", - "execution_count": 323, + "execution_count": 78, "id": "de2e71ce", "metadata": {}, "outputs": [], "source": [ - "def normalize_distance_distribution_for_each_source_project(data_frame: pd.DataFrame) -> pd.DataFrame:\n", + "def normalize_distance_distribution_for_each_row(data_frame: pd.DataFrame) -> pd.DataFrame:\n", " \"\"\"\n", " Returns the normalized data in percentage of the DataFrame for each source project\n", " \n", @@ -375,7 +390,7 @@ }, { "cell_type": "code", - "execution_count": 324, + "execution_count": 79, "id": "27a583e9", "metadata": {}, "outputs": [], @@ -417,7 +432,7 @@ }, { "cell_type": "code", - "execution_count": 325, + "execution_count": 80, "id": "e63ddc97", "metadata": {}, "outputs": [], @@ -469,18 +484,18 @@ }, { "cell_type": "code", - "execution_count": 326, + "execution_count": 81, "id": "7243fbfd", "metadata": {}, "outputs": [], "source": [ - "def plot_longest_distance_per_source_project(data_frame: pd.DataFrame, title: str, xlabel: str, ylabel: str):\n", + "def plot_longest_distance_for_each_row(data_frame: pd.DataFrame, title: str, xlabel: str, ylabel: str):\n", " \"\"\"\n", " Plots the longest distance per source project \n", " \n", " data_frame\n", " ----------\n", - " Contains the path algorithm result including the columns \"distance\" and \"distanceTotalPairCount\"\n", + " Contains the path algorithm result pivot table including the columns \"distance\"\n", "\n", " title\n", " ----------\n", @@ -514,18 +529,18 @@ }, { "cell_type": "code", - "execution_count": 327, + 
"execution_count": 82, "id": "5262a4ea", "metadata": {}, "outputs": [], "source": [ - "def plot_source_projects_distances_stacked(data_frame: pd.DataFrame, title: str, xlabel: str, ylabel: str, logy: bool = False):\n", + "def plot_stacked_distances_for_each_row(data_frame: pd.DataFrame, title: str, xlabel: str, ylabel: str, logy: bool = False):\n", " \"\"\"\n", " Plots each source project (e.g. Java artifact or Typescript project) stacked by distance (number of dependency pairs)\n", " \n", " data_frame\n", " ----------\n", - " Contains the output of \"get_distance_distribution_for_each_source_project\" with the \n", + " Contains the output of \"get_distance_distribution_for_each\" with e.g. the \n", " source projects as rows, distances as columns and number of pairs as values.\n", "\n", " title\n", @@ -590,7 +605,7 @@ }, { "cell_type": "code", - "execution_count": 328, + "execution_count": 83, "id": "1ecc41b1", "metadata": {}, "outputs": [], @@ -604,7 +619,7 @@ }, { "cell_type": "code", - "execution_count": 329, + "execution_count": 84, "id": "0b637ce2", "metadata": {}, "outputs": [], @@ -645,7 +660,7 @@ }, { "cell_type": "code", - "execution_count": 331, + "execution_count": 86, "id": "62f50f28", "metadata": {}, "outputs": [], @@ -796,7 +811,7 @@ "metadata": {}, "outputs": [], "source": [ - "graph_diameter_per_artifact = get_longest_path_for_each_source_project(all_pairs_shortest_paths_distribution_per_artifact_isolated)\n", + "graph_diameter_per_artifact = get_longest_path_for_column('sourceProject', all_pairs_shortest_paths_distribution_per_artifact_isolated)\n", "graph_diameter_per_artifact.head(20)" ] }, @@ -807,7 +822,7 @@ "metadata": {}, "outputs": [], "source": [ - "plot_longest_distance_per_source_project(\n", + "plot_longest_distance_for_each_row(\n", " data_frame=graph_diameter_per_artifact,\n", " title='Longest shortest path (\"diameter\") for Java package dependencies per artifact',\n", " xlabel='Artifact',\n", @@ -825,12 +840,12 @@ }, { "cell_type": 
"code", - "execution_count": null, + "execution_count": 95, "id": "7b90cfbc", "metadata": {}, "outputs": [], "source": [ - "all_pairs_shortest_paths_distribution_per_artifact_isolated_pivot = get_distance_distribution_for_each_source_project(all_pairs_shortest_paths_distribution_per_artifact_isolated)" + "all_pairs_shortest_paths_distribution_per_artifact_isolated_pivot = get_distance_distribution_for_each('sourceProject', all_pairs_shortest_paths_distribution_per_artifact_isolated)" ] }, { @@ -840,7 +855,7 @@ "metadata": {}, "outputs": [], "source": [ - "plot_source_projects_distances_stacked(\n", + "plot_stacked_distances_for_each_row(\n", " data_frame=all_pairs_shortest_paths_distribution_per_artifact_isolated_pivot,\n", " title='All pairs shortest path for Java package dependencies stacked per artifact (absolute, logarithmic)',\n", " xlabel='Artifact',\n", @@ -867,7 +882,7 @@ "outputs": [], "source": [ "# Normalize data (percent of sum pairs)\n", - "all_pairs_shortest_paths_distribution_per_artifact_isolated_normalized_pivot=normalize_distance_distribution_for_each_source_project(all_pairs_shortest_paths_distribution_per_artifact_isolated_pivot)\n", + "all_pairs_shortest_paths_distribution_per_artifact_isolated_normalized_pivot=normalize_distance_distribution_for_each_row(all_pairs_shortest_paths_distribution_per_artifact_isolated_pivot)\n", "all_pairs_shortest_paths_distribution_per_artifact_isolated_normalized_pivot.head(50)" ] }, @@ -878,7 +893,7 @@ "metadata": {}, "outputs": [], "source": [ - "plot_source_projects_distances_stacked(\n", + "plot_stacked_distances_for_each_row(\n", " data_frame=all_pairs_shortest_paths_distribution_per_artifact_isolated_normalized_pivot.head(50),\n", " title='All pairs shortest path for Java package dependencies stacked per artifact (normalized in %)',\n", " xlabel='Artifact',\n", @@ -910,7 +925,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 99, "id": "8b43628b", "metadata": {}, "outputs": [], 
@@ -1062,7 +1077,7 @@ "metadata": {}, "outputs": [], "source": [ - "longest_path_per_artifact = get_longest_path_for_each_source_project(longest_paths_distribution_per_artifact_isolated)\n", + "longest_path_per_artifact = get_longest_path_for_column('sourceProject', longest_paths_distribution_per_artifact_isolated)\n", "longest_path_per_artifact.head(20)" ] }, @@ -1073,7 +1088,7 @@ "metadata": {}, "outputs": [], "source": [ - "plot_longest_distance_per_source_project(\n", + "plot_longest_distance_for_each_row(\n", " data_frame=longest_path_per_artifact,\n", " title='Max. longest path for Java package dependencies per artifact',\n", " xlabel='Artifact',\n", @@ -1091,12 +1106,12 @@ }, { "cell_type": "code", - "execution_count": 219, + "execution_count": 108, "id": "9765cec6", "metadata": {}, "outputs": [], "source": [ - "longest_paths_distribution_per_artifact_isolated_pivot = get_distance_distribution_for_each_source_project(longest_paths_distribution_per_artifact_isolated)" + "longest_paths_distribution_per_artifact_isolated_pivot = get_distance_distribution_for_each('sourceProject', longest_paths_distribution_per_artifact_isolated)" ] }, { @@ -1106,7 +1121,7 @@ "metadata": {}, "outputs": [], "source": [ - "plot_source_projects_distances_stacked(\n", + "plot_stacked_distances_for_each_row(\n", " data_frame=longest_paths_distribution_per_artifact_isolated_pivot,\n", " title='Longest path for Java package dependencies stacked per artifact (absolute, logarithmic)',\n", " xlabel='Artifact',\n", @@ -1133,7 +1148,7 @@ "outputs": [], "source": [ "# Normalize data (percent of sum pairs)\n", - "longest_paths_distribution_per_artifact_isolated_normalized_pivot=normalize_distance_distribution_for_each_source_project(longest_paths_distribution_per_artifact_isolated_pivot)\n", + "longest_paths_distribution_per_artifact_isolated_normalized_pivot=normalize_distance_distribution_for_each_row(longest_paths_distribution_per_artifact_isolated_pivot)\n", 
"longest_paths_distribution_per_artifact_isolated_normalized_pivot.head(50)" ] }, @@ -1144,7 +1159,7 @@ "metadata": {}, "outputs": [], "source": [ - "plot_source_projects_distances_stacked(\n", + "plot_stacked_distances_for_each_row(\n", " data_frame=longest_paths_distribution_per_artifact_isolated_normalized_pivot.head(50),\n", " title='Longest path for Java package dependencies stacked per artifact (normalized in %)',\n", " xlabel='Artifact',\n", @@ -1182,7 +1197,7 @@ }, { "cell_type": "code", - "execution_count": 223, + "execution_count": 112, "id": "cec6c79b", "metadata": {}, "outputs": [], @@ -1196,7 +1211,7 @@ }, { "cell_type": "code", - "execution_count": 224, + "execution_count": 113, "id": "deda506f", "metadata": {}, "outputs": [], @@ -1236,7 +1251,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 115, "id": "b0cdf8c5", "metadata": {}, "outputs": [], @@ -1344,7 +1359,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 120, "id": "29e4e8f2", "metadata": {}, "outputs": [], diff --git a/jupyter/PathFindingTypescript.ipynb b/jupyter/PathFindingTypescript.ipynb index bf60ab9ee..c0a7e0e86 100644 --- a/jupyter/PathFindingTypescript.ipynb +++ b/jupyter/PathFindingTypescript.ipynb @@ -12,7 +12,7 @@ "\n", "Path algorithms in Graphs are famous for e.g. finding the fastest way from one place to another. How can these be applied to static code analysis and how can the results be interpreted?\n", "\n", - "One promising algorithm is [All Pairs Shortest Path](https://neo4j.com/docs/graph-data-science/current/algorithms/all-pairs-shortest-path). It shows dependencies from a different perspective and provides an overview on how directly or indirectly dependencies are connected to each other. The longest shortest path has an additional meaning: It is also known as the [**Graph Diameter**](https://mathworld.wolfram.com/GraphDiameter.html) and is very useful as a metric for the complexity of the Graph (or Subgraphs). 
The same applies to the longest path (for directed acyclic graphs) that can uncover long dependency chains.\n", + "One promising algorithm is [All Pairs Shortest Path](https://neo4j.com/docs/graph-data-science/current/algorithms/all-pairs-shortest-path). It shows dependencies from a different perspective and provides an overview on how directly or indirectly dependencies are connected to each other. The longest shortest path has an additional meaning: It is also known as the [**Graph Diameter**](https://mathworld.wolfram.com/GraphDiameter.html) and is very useful as a metric for the complexity of the Graph (or Subgraphs). The longest path (for directed acyclic graphs) can uncover the longest existing (worst case) dependency chains as long as there are no cycles in the Graph.\n", "\n", "
\n", "\n", @@ -100,7 +100,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 1, "id": "d19447e1", "metadata": {}, "outputs": [], @@ -116,7 +116,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 2, "id": "807bba03", "metadata": {}, "outputs": [], @@ -146,7 +146,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 4, "id": "648e2a5a", "metadata": {}, "outputs": [], @@ -157,7 +157,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 5, "id": "e49ca888", "metadata": {}, "outputs": [], @@ -168,7 +168,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 6, "id": "1c5dab37", "metadata": {}, "outputs": [], @@ -183,7 +183,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 7, "id": "c1db254b", "metadata": {}, "outputs": [], @@ -195,7 +195,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 8, "id": "59310f6f", "metadata": {}, "outputs": [], @@ -207,7 +207,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 9, "id": "7d2e62d6", "metadata": {}, "outputs": [], @@ -242,7 +242,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 10, "id": "3f2e905c", "metadata": {}, "outputs": [], @@ -253,12 +253,12 @@ " \"\"\"\n", " \n", " print(\"No projected data for path finding available\")\n", - " return pd.DataFrame(columns=['totalCost', 'sourceProject', 'isDifferentTargetProject', 'distance', 'distanceTotalPairCount', 'distanceTotalSourceCount', 'distanceTotalTargetCount', 'nodeCount', 'pairCount'])" + " return pd.DataFrame(columns=['totalCost', 'sourceProject', 'sourceScan', 'isDifferentTargetProject', 'isDifferentTargetScan', 'distance', 'distanceTotalPairCount', 'distanceTotalSourceCount', 'distanceTotalTargetCount', 'nodeCount', 'pairCount'])" ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 11, "id": "d2d60597", "metadata": {}, "outputs": 
[], @@ -291,7 +291,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 12, "id": "5ef848fd", "metadata": {}, "outputs": [], @@ -310,37 +310,55 @@ }, { "cell_type": "code", - "execution_count": 41, - "id": "a9211397", + "execution_count": 13, + "id": "e84f2736", "metadata": {}, "outputs": [], "source": [ - "def get_longest_path_for_each_source_project(data_frame: pd.DataFrame) -> pd.DataFrame:\n", + "def get_longest_path_for_column(column : str, data_frame: pd.DataFrame) -> pd.DataFrame:\n", " \"\"\"\n", - " Returns the DataFrame grouped by the source project (e.g. Java artifact or Typescript project) containing their max/longest distance.\n", + " Returns the DataFrame grouped by the given column containing their max/longest distance.\n", + " \n", + " column\n", + " ----------\n", + " Name of the column to group by. Example: \"sourceProject\".\n", " \n", " data_frame\n", " ----------\n", - " Contains the path algorithm result including the columns \"isDifferentTargetProject\", \"distance\" and \"sourceProject\"\n", + " Contains the path algorithm result including the columns \"distance\" and the chosen column parameter.\n", " \"\"\"\n", " \n", - " return data_frame.groupby('sourceProject')['distance'].max().sort_values(ascending=False)" + " return data_frame.groupby(column)['distance'].max().sort_values(ascending=False)" ] }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 14, "id": "036264ca", "metadata": {}, "outputs": [], "source": [ - "def get_distance_distribution_for_each_source_project(data_frame: pd.DataFrame) -> pd.DataFrame:\n", + "def get_distance_distribution_for_each(column : str, data_frame: pd.DataFrame) -> pd.DataFrame:\n", + " \"\"\"\n", + " Returns the transposed pivot of the DataFrame grouped by the given column.\n", + " \n", + " column\n", + " ----------\n", + " Name of the column to group by. 
Example: \"sourceProject\".\n", + " \n", + " data_frame\n", + " ----------\n", + " Contains the path algorithm result including the columns \"distance\", \"pairCount\" and the chosen column parameter.\n", + " \"\"\"\n", "\n", " data_frame = data_frame.copy()\n", "\n", - " # Rows contain the source project (e.g. Java artifact or Typescript project) and its the pair count.\n", - " # The columns contain the distances.\n", - " data_frame = data_frame.pivot(index='distance', columns='sourceProject', values='pairCount')\n", + " # If not already grouped, group by the given column and the distance and sum up the pair count (=number of paths)\n", + " data_frame = data_frame.groupby([column, \"distance\"], as_index=False)[\"pairCount\"].apply(sum)\n", + "\n", + " # The rows of the parameter \"column\" contain the source project or scan (e.g. Java artifact or Typescript project) and their path count.\n", + " # The columns contain the distances (length of the paths).\n", + " data_frame = data_frame.pivot(index='distance', columns=column, values='pairCount')\n", "\n", " # Sort by column sum and then take only the first 40 columns\n", " sum_of_pair_count = data_frame.sum()\n", @@ -349,7 +367,7 @@ " # Fill missing values with zeroes\n", " data_frame = data_frame.fillna(0)\n", "\n", - " # Transpose the table (flip columns and rows) to have a row for every source project\n", + " # Transpose the table (flip columns and rows) to have a row for every column (e.g. 
\"sourceProject\")\n", " data_frame = data_frame.transpose()\n", "\n", " return data_frame" @@ -357,14 +375,14 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 15, "id": "de2e71ce", "metadata": {}, "outputs": [], "source": [ - "def normalize_distance_distribution_for_each_source_project(data_frame: pd.DataFrame) -> pd.DataFrame:\n", + "def normalize_distance_distribution_for_each_row(data_frame: pd.DataFrame) -> pd.DataFrame:\n", " \"\"\"\n", - " Returns the normalized data in percentage of the DataFrame for each source project\n", + " Returns the normalized data in percentage of the DataFrame for each row\n", " \n", " data_frame\n", " ----------\n", @@ -375,7 +393,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 16, "id": "27a583e9", "metadata": {}, "outputs": [], @@ -417,7 +435,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 17, "id": "e63ddc97", "metadata": {}, "outputs": [], @@ -469,18 +487,18 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 18, "id": "7243fbfd", "metadata": {}, "outputs": [], "source": [ - "def plot_longest_distance_per_source_project(data_frame: pd.DataFrame, title: str, xlabel: str, ylabel: str):\n", + "def plot_longest_distance_of_each_row(data_frame: pd.DataFrame, title: str, xlabel: str, ylabel: str):\n", " \"\"\"\n", " Plots the longest distance per source project \n", " \n", " data_frame\n", " ----------\n", - " Contains the path algorithm result including the columns \"distance\" and \"distanceTotalPairCount\"\n", + " Contains the path algorithm result pivot table including the columns \"distance\"\n", "\n", " title\n", " ----------\n", @@ -514,18 +532,18 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 19, "id": "5262a4ea", "metadata": {}, "outputs": [], "source": [ - "def plot_source_projects_distances_stacked(data_frame: pd.DataFrame, title: str, xlabel: str, ylabel: str, logy: bool = False):\n", + 
"def plot_stacked_distances_for_each_row(data_frame: pd.DataFrame, title: str, xlabel: str, ylabel: str, logy: bool = False):\n", " \"\"\"\n", " Plots each source project (e.g. Java artifact or Typescript project) stacked by distance (number of dependency pairs)\n", " \n", " data_frame\n", " ----------\n", - " Contains the output of \"get_distance_distribution_for_each_source_project\" with the \n", + " Contains the output of \"get_distance_distribution_for_each\" with e.g. the \n", " source projects as rows, distances as columns and number of pairs as values.\n", "\n", " title\n", @@ -562,7 +580,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 20, "id": "6afb912b", "metadata": {}, "outputs": [], @@ -601,7 +619,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 21, "id": "40653fe5", "metadata": {}, "outputs": [], @@ -667,7 +685,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 22, "id": "1ecc41b1", "metadata": {}, "outputs": [], @@ -681,7 +699,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 23, "id": "0b637ce2", "metadata": {}, "outputs": [], @@ -721,13 +739,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "id": "62f50f28", "metadata": {}, "outputs": [], "source": [ "# Execute algorithm \"All pairs shortest path\" and query overall and project specific results\n", - "all_pairs_shortest_paths_distribution_per_project=query_if_data_available(is_module_data_available, \"../cypher/Path_Finding/Path_Finding_5_All_pairs_shortest_path_distribution_per_project.cypher\", module_path_finding_parameters)" + "all_pairs_shortest_paths_distribution_per_project_and_scan=query_if_data_available(is_module_data_available, \"../cypher/Path_Finding/Path_Finding_5_All_pairs_shortest_path_distribution_per_project.cypher\", module_path_finding_parameters)" ] }, { @@ -745,7 +763,7 @@ "metadata": {}, "outputs": [], "source": [ - 
"module_dependencies_graph_diameter=all_pairs_shortest_paths_distribution_per_project['distance'].max()\n", + "module_dependencies_graph_diameter=all_pairs_shortest_paths_distribution_per_project_and_scan['distance'].max()\n", "print('The diameter (longest shortest path) of the projected module dependencies Graph is:', module_dependencies_graph_diameter)" ] }, @@ -764,7 +782,7 @@ "metadata": {}, "outputs": [], "source": [ - "all_pairs_shortest_paths_distribution_per_project_in_total=get_total_distance_distribution(all_pairs_shortest_paths_distribution_per_project)\n", + "all_pairs_shortest_paths_distribution_per_project_in_total=get_total_distance_distribution(all_pairs_shortest_paths_distribution_per_project_and_scan)\n", "all_pairs_shortest_paths_distribution_per_project_in_total.head(50)" ] }, @@ -798,22 +816,6 @@ "#### All pairs shortest path in total - Path count per length - Pie chart" ] }, - { - "cell_type": "code", - "execution_count": 57, - "id": "1d6fdf9a", - "metadata": {}, - "outputs": [], - "source": [ - "# all_pairs_shortest_paths_distribution_per_project_in_total_significant = group_to_others_below_threshold(\n", - "# data_frame=all_pairs_shortest_paths_distribution_per_project_in_total,\n", - "# value_column='distanceTotalPairCount',\n", - "# name_column='distance',\n", - "# threshold= 0.7\n", - "# );\n", - "# all_pairs_shortest_paths_distribution_per_project_in_total_significant" - ] - }, { "cell_type": "code", "execution_count": null, @@ -846,7 +848,7 @@ "metadata": {}, "outputs": [], "source": [ - "all_pairs_shortest_paths_distribution_per_project.head(10)" + "all_pairs_shortest_paths_distribution_per_project_and_scan.head(10)" ] }, { @@ -868,7 +870,7 @@ "metadata": {}, "outputs": [], "source": [ - "all_pairs_shortest_paths_distribution_per_project_isolated=all_pairs_shortest_paths_distribution_per_project.query('isDifferentTargetProject == False')\n", + 
"all_pairs_shortest_paths_distribution_per_project_isolated=all_pairs_shortest_paths_distribution_per_project_and_scan.query('isDifferentTargetProject == False')\n", "all_pairs_shortest_paths_distribution_per_project_isolated.head(10)" ] }, @@ -889,7 +891,7 @@ "metadata": {}, "outputs": [], "source": [ - "graph_diameter_per_project = get_longest_path_for_each_source_project(all_pairs_shortest_paths_distribution_per_project_isolated)\n", + "graph_diameter_per_project = get_longest_path_for_column('sourceProject', all_pairs_shortest_paths_distribution_per_project_isolated)\n", "graph_diameter_per_project.head(20)" ] }, @@ -900,7 +902,7 @@ "metadata": {}, "outputs": [], "source": [ - "plot_longest_distance_per_source_project(\n", + "plot_longest_distance_of_each_row(\n", " data_frame=graph_diameter_per_project,\n", " title='Longest shortest path (\"diameter\") for Typescript module dependencies per project',\n", " xlabel='Project',\n", @@ -918,12 +920,12 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 34, "id": "7b90cfbc", "metadata": {}, "outputs": [], "source": [ - "all_pairs_shortest_paths_distribution_per_project_isolated_pivot = get_distance_distribution_for_each_source_project(all_pairs_shortest_paths_distribution_per_project_isolated)" + "all_pairs_shortest_paths_distribution_per_project_isolated_pivot = get_distance_distribution_for_each('sourceProject', all_pairs_shortest_paths_distribution_per_project_isolated)" ] }, { @@ -933,7 +935,7 @@ "metadata": {}, "outputs": [], "source": [ - "plot_source_projects_distances_stacked(\n", + "plot_stacked_distances_for_each_row(\n", " data_frame=all_pairs_shortest_paths_distribution_per_project_isolated_pivot,\n", " title='All pairs shortest path for Typescript module dependencies stacked per project (absolute, logarithmic)',\n", " xlabel='Project',\n", @@ -960,7 +962,7 @@ "outputs": [], "source": [ "# Normalize data (percent of sum pairs)\n", - 
"all_pairs_shortest_paths_distribution_per_project_isolated_normalized_pivot=normalize_distance_distribution_for_each_source_project(all_pairs_shortest_paths_distribution_per_project_isolated_pivot)\n", + "all_pairs_shortest_paths_distribution_per_project_isolated_normalized_pivot=normalize_distance_distribution_for_each_row(all_pairs_shortest_paths_distribution_per_project_isolated_pivot)\n", "all_pairs_shortest_paths_distribution_per_project_isolated_normalized_pivot.head(50)" ] }, @@ -971,7 +973,7 @@ "metadata": {}, "outputs": [], "source": [ - "plot_source_projects_distances_stacked(\n", + "plot_stacked_distances_for_each_row(\n", " data_frame=all_pairs_shortest_paths_distribution_per_project_isolated_normalized_pivot.head(50),\n", " title='All pairs shortest path for Typescript module dependencies stacked per project (normalized in %)',\n", " xlabel='Project',\n", @@ -979,6 +981,140 @@ ")" ] }, + { + "cell_type": "markdown", + "id": "f76616b7", + "metadata": {}, + "source": [ + "### 1.1.5 All pairs shortest path for each scan\n", + "\n", + "In this section we'll focus only on pairs of nodes that both belong to the same scan, filtering out every line that has `isDifferentTargetScan==False`. The first ten rows are shown in a table followed by charts that show the distribution of shortest path distances across different scans in stacked bar charts (absolute and normalized).\n", + "\n", + "**Note:** It is possible that a (shortest) path could have nodes in between that belong to different scans. Therefore, the data of each scan isn't perfectly isolated. However, it shows how the dependencies interact across scans \"in real life\" while still providing a decent isolation of each scan." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e551ada6", + "metadata": {}, + "outputs": [], + "source": [ + "all_pairs_shortest_paths_distribution_per_scan_isolated=all_pairs_shortest_paths_distribution_per_project_and_scan.query('isDifferentTargetScan == False')\n", + "\n", + "all_pairs_shortest_paths_distribution_per_scan_isolated.\\\n", + " groupby([\"sourceScan\", \"distance\"], as_index=False)\\\n", + " [[\"pairCount\", \"sourceNodeCount\",\"targetNodeCount\"]].\\\n", + " apply(max).head(20)" + ] + }, + { + "cell_type": "markdown", + "id": "3f9defa3", + "metadata": {}, + "source": [ + "#### All pairs shortest path for each scan - Longest shortest path (Diameter) for each scan\n", + "\n", + "Shows the top 20 scans with the longest shortest path (=Graph Diameter)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e9c57989", + "metadata": {}, + "outputs": [], + "source": [ + "graph_diameter_per_scan = get_longest_path_for_column('sourceScan', all_pairs_shortest_paths_distribution_per_scan_isolated)\n", + "graph_diameter_per_scan.head(20)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd33364f", + "metadata": {}, + "outputs": [], + "source": [ + "plot_longest_distance_of_each_row(\n", + " data_frame=graph_diameter_per_scan,\n", + " title='Longest shortest path (\"diameter\") for Typescript module dependencies per scan',\n", + " xlabel='Scan',\n", + " ylabel='longest path length'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "0551c259", + "metadata": {}, + "source": [ + "#### All pairs shortest path for each scan - Bar chart (absolute)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "64f86f9d", + "metadata": {}, + "outputs": [], + "source": [ + "all_pairs_shortest_paths_distribution_per_scan_isolated_pivot = get_distance_distribution_for_each('sourceScan', all_pairs_shortest_paths_distribution_per_scan_isolated)" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "7d9e4e55", + "metadata": {}, + "outputs": [], + "source": [ + "plot_stacked_distances_for_each_row(\n", + " data_frame=all_pairs_shortest_paths_distribution_per_scan_isolated_pivot,\n", + " title='All pairs shortest path for Typescript module dependencies stacked per scan (absolute, logarithmic)',\n", + " xlabel='Scan',\n", + " ylabel='Typescript module paths',\n", + " logy=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "7bb61fa8", + "metadata": {}, + "source": [ + "#### All pairs shortest path for each scan - Bar chart (normalized)\n", + "\n", + "Shows the top 50 scans with the highest number of dependency paths stacked by their length." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d09379a3", + "metadata": {}, + "outputs": [], + "source": [ + "# Normalize data (percent of sum pairs)\n", + "all_pairs_shortest_paths_distribution_per_scan_isolated_normalized_pivot=normalize_distance_distribution_for_each_row(all_pairs_shortest_paths_distribution_per_project_isolated_pivot)\n", + "all_pairs_shortest_paths_distribution_per_scan_isolated_normalized_pivot.head(50)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a222ac47", + "metadata": {}, + "outputs": [], + "source": [ + "plot_stacked_distances_for_each_row(\n", + " data_frame=all_pairs_shortest_paths_distribution_per_scan_isolated_normalized_pivot.head(50),\n", + " title='All pairs shortest path for Typescript module dependencies stacked per scan (normalized in %)',\n", + " xlabel='Scan',\n", + " ylabel='Typescript module paths'\n", + ")" + ] + }, { "cell_type": "markdown", "id": "37da9d90", @@ -1003,7 +1139,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 45, "id": "8b43628b", "metadata": {}, "outputs": [], @@ -1102,7 +1238,7 @@ "source": [ "### 1.2.2 Longest path in detail\n", "\n", - "The following table shows the first 10 rows with all details of the query above. 
It contains the results of the \"longest path\" algorithm including the artifact the source node belongs to and if the target node is in the same artifact or not. The main intuition is to show how the data is structured. It provides the basis for tables and charts shown in following sections below, that filter and group the data accordingly." + "The following table shows the first 10 rows with all details of the query above. It contains the results of the \"longest path\" algorithm including the project the source node belongs to and if the target node is in the same project or not. The main intuition is to show how the data is structured. It provides the basis for tables and charts shown in following sections below, that filter and group the data accordingly." ] }, { @@ -1122,9 +1258,9 @@ "source": [ "### 1.2.3 Longest path for each project\n", "\n", - "In this section we'll focus only on pairs of nodes that both belong to the same artifact, filtering out every line that has `isDifferentTargetProject==False`. The first ten rows are shown in a table followed by charts that show the distribution of longest path distances across different artifacts in stacked bar charts (absolute and normalized).\n", + "In this section we'll focus only on pairs of nodes that both belong to the same project, filtering out every line that has `isDifferentTargetProject==False`. The first ten rows are shown in a table followed by charts that show the distribution of longest path distances across different projects in stacked bar charts (absolute and normalized).\n", "\n", - "**Note:** It is possible that a (longest) path could have nodes in between that belong to different artifacts. Therefore, the data of each artifact isn't perfectly isolated. However, it shows how the dependencies interact across artifacts \"in real life\" while still providing a decent isolation of each artifact." 
+ "**Note:** It is possible that a (longest) path could have nodes in between that belong to different projects. Therefore, the data of each project isn't perfectly isolated. However, it shows how the dependencies interact across projects \"in real life\" while still providing a decent isolation of each project." ] }, { @@ -1143,9 +1279,9 @@ "id": "34ec57d5", "metadata": {}, "source": [ - "#### Longest path for each artifact - Max. longest path for each artifact\n", + "#### Longest path for each project - Max. longest path for each project\n", "\n", - "Shows the top 20 artifacts with their max. longest path." + "Shows the top 20 projects with their max. longest path." ] }, { @@ -1155,7 +1291,7 @@ "metadata": {}, "outputs": [], "source": [ - "longest_path_per_project = get_longest_path_for_each_source_project(longest_paths_distribution_per_project_isolated)\n", + "longest_path_per_project = get_longest_path_for_column('sourceProject', longest_paths_distribution_per_project_isolated)\n", "longest_path_per_project.head(20)" ] }, @@ -1166,10 +1302,10 @@ "metadata": {}, "outputs": [], "source": [ - "plot_longest_distance_per_source_project(\n", + "plot_longest_distance_of_each_row(\n", " data_frame=longest_path_per_project,\n", - " title='Max. longest path for Java package dependencies per artifact',\n", - " xlabel='Artifact',\n", + " title='Max. longest path for Typescript module dependencies per project',\n", + " xlabel='Project',\n", " ylabel='max. 
longest path length'\n", ")" ] @@ -1179,17 +1315,17 @@ "id": "c0c2d8ba", "metadata": {}, "source": [ - "#### Longest path for each artifact - Bar chart (absolute)" + "#### Longest path for each project - Bar chart (absolute)" ] }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 54, "id": "47bd1a08", "metadata": {}, "outputs": [], "source": [ - "longest_paths_distribution_per_project_isolated_pivot = get_distance_distribution_for_each_source_project(longest_paths_distribution_per_project_isolated)" + "longest_paths_distribution_per_project_isolated_pivot = get_distance_distribution_for_each('sourceProject', longest_paths_distribution_per_project_isolated)" ] }, { @@ -1199,11 +1335,11 @@ "metadata": {}, "outputs": [], "source": [ - "plot_source_projects_distances_stacked(\n", + "plot_stacked_distances_for_each_row(\n", " data_frame=longest_paths_distribution_per_project_isolated_pivot,\n", - " title='Longest path for Java package dependencies stacked per artifact (absolute, logarithmic)',\n", - " xlabel='Artifact',\n", - " ylabel='Java package paths',\n", + " title='Longest path for Typescript module dependencies stacked per project (absolute, logarithmic)',\n", + " xlabel='Project',\n", + " ylabel='Typescript module paths',\n", " logy=True\n", ")" ] @@ -1213,9 +1349,9 @@ "id": "fc4c2036", "metadata": {}, "source": [ - "#### Longest path for each artifact - Bar chart (normalized)\n", + "#### Longest path for each project - Bar chart (normalized)\n", "\n", - "Shows the top 50 artifacts with the highest number of dependency paths stacked by their length." + "Shows the top 50 projects with the highest number of dependency paths stacked by their length." 
] }, { @@ -1226,7 +1362,7 @@ "outputs": [], "source": [ "# Normalize data (percent of sum pairs)\n", - "longest_paths_distribution_per_project_isolated_normalized_pivot=normalize_distance_distribution_for_each_source_project(longest_paths_distribution_per_project_isolated_pivot)\n", + "longest_paths_distribution_per_project_isolated_normalized_pivot=normalize_distance_distribution_for_each_row(longest_paths_distribution_per_project_isolated_pivot)\n", "longest_paths_distribution_per_project_isolated_normalized_pivot.head(50)" ] }, @@ -1237,11 +1373,141 @@ "metadata": {}, "outputs": [], "source": [ - "plot_source_projects_distances_stacked(\n", + "plot_stacked_distances_for_each_row(\n", " data_frame=longest_paths_distribution_per_project_isolated_normalized_pivot.head(50),\n", - " title='Longest path for Java package dependencies stacked per artifact (normalized in %)',\n", - " xlabel='Artifact',\n", - " ylabel='Java package paths'\n", + " title='Longest path for Typescript module dependencies stacked per project (normalized in %)',\n", + " xlabel='Project',\n", + " ylabel='Typescript module paths'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "b76e3a9c", + "metadata": {}, + "source": [ + "### 1.2.4 Longest path for each scan\n", + "\n", + "In this section we'll focus only on pairs of nodes that both belong to the same scan, keeping only the rows where `isDifferentTargetScan==False`. The first ten rows are shown in a table followed by charts that show the distribution of longest path distances across different scans in stacked bar charts (absolute and normalized).\n", + "\n", + "**Note:** It is possible that a (longest) path could have nodes in-between that belong to different scans. Therefore, the data of each scan isn't perfectly isolated. However, it shows how the dependencies interact across scans \"in real life\" while still providing a decent amount of isolation of each scan." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45439d79", + "metadata": {}, + "outputs": [], + "source": [ + "longest_paths_distribution_per_scan_isolated=longest_paths_distribution_per_project.query('isDifferentTargetScan == False')\n", + "longest_paths_distribution_per_scan_isolated.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "a6c0e349", + "metadata": {}, + "source": [ + "#### Longest path for each scan - Max. longest path for each scan\n", + "\n", + "Shows the top 20 scans with their max. longest path." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "236ec07d", + "metadata": {}, + "outputs": [], + "source": [ + "longest_path_per_scan = get_longest_path_for_column('sourceScan', longest_paths_distribution_per_scan_isolated)\n", + "longest_path_per_scan.head(20)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37215d21", + "metadata": {}, + "outputs": [], + "source": [ + "plot_longest_distance_of_each_row(\n", + " data_frame=longest_path_per_scan,\n", + " title='Max. longest path for Typescript module dependencies per scan',\n", + " xlabel='Module',\n", + " ylabel='max. 
longest path length'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "8622d0f2", + "metadata": {}, + "source": [ + "#### Longest path for each scan - Bar chart (absolute)" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "f70894c2", + "metadata": {}, + "outputs": [], + "source": [ + "longest_paths_distribution_per_scan_isolated_pivot = get_distance_distribution_for_each('sourceScan', longest_paths_distribution_per_scan_isolated)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "675b71b9", + "metadata": {}, + "outputs": [], + "source": [ + "plot_stacked_distances_for_each_row(\n", + " data_frame=longest_paths_distribution_per_scan_isolated_pivot,\n", + " title='Longest path for Typescript module dependencies stacked per scan (absolute, logarithmic)',\n", + " xlabel='Scan',\n", + " ylabel='Typescript module paths',\n", + " logy=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "47bb4916", + "metadata": {}, + "source": [ + "#### Longest path for each scan - Bar chart (normalized)\n", + "\n", + "Shows the top 50 scans with the highest number of dependency paths stacked by their length." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4426a2f7", + "metadata": {}, + "outputs": [], + "source": [ + "# Normalize data (percent of sum pairs)\n", + "longest_paths_distribution_per_scan_isolated_normalized_pivot=normalize_distance_distribution_for_each_row(longest_paths_distribution_per_scan_isolated_pivot)\n", + "longest_paths_distribution_per_scan_isolated_normalized_pivot.head(50)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4d85859", + "metadata": {}, + "outputs": [], + "source": [ + "plot_stacked_distances_for_each_row(\n", + " data_frame=longest_paths_distribution_per_scan_isolated_normalized_pivot.head(50),\n", + " title='Longest path for Typescript module dependencies stacked per scan (normalized in %)',\n", + " xlabel='Scan',\n", + " ylabel='Typescript module paths'\n", ")" ] }, diff --git a/jupyter/Wordcloud.ipynb b/jupyter/Wordcloud.ipynb index 9d1ca80ef..f84f03d5e 100644 --- a/jupyter/Wordcloud.ipynb +++ b/jupyter/Wordcloud.ipynb @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 247, "id": "4191f259", "metadata": {}, "outputs": [], @@ -24,13 +24,14 @@ "import os\n", "import pandas as pd\n", "import matplotlib.pyplot as plot\n", + "import typing as typ\n", "from neo4j import GraphDatabase\n", "from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 248, "id": "1c5dab37", "metadata": {}, "outputs": [], @@ -45,7 +46,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 249, "id": "c1db254b", "metadata": {}, "outputs": [], @@ -57,19 +58,19 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "59310f6f", + "execution_count": 250, + "id": "6e8772aa", "metadata": {}, "outputs": [], "source": [ - "def query_cypher_to_data_frame(filename):\n", - " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename))\n", + "def 
query_cypher_to_data_frame(filename: str, parameters_: typ.Optional[typ.Dict[str, typ.Any]] = None):\n", + " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename),parameters_=parameters_)\n", " return pd.DataFrame([r.values() for r in records], columns=keys)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 251, "id": "da9e8edb", "metadata": {}, "outputs": [], @@ -99,7 +100,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 253, "id": "c2496caf", "metadata": {}, "outputs": [], @@ -119,51 +120,147 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "0ed729e3", + "execution_count": 254, + "id": "82b46521", "metadata": {}, "outputs": [], "source": [ - "# Query data from graph database\n", - "words = query_cypher_to_data_frame(\"../cypher/Overview/Words_for_universal_Wordcloud.cypher\")\n", - "words.head(30)" + "customized_stop_words = STOPWORDS.union([\n", + " 'builder', 'exception', 'abstract', 'helper', 'util', 'callback', 'factory', 'result',\n", + " 'handler', 'type', 'module', 'name', 'parameter', 'lambda', 'access', 'create', 'message', \n", + " 'ts', 'js', 'tsx', 'jsx', 'css', 'htm', 'html', 'props', 'use', 'id', 'ref', 'hook', 'event', \n", + " 'span', 'data', 'context', 'form', 'get', 'set', 'object', 'null', 'new', 'plugin', 'package', \n", + " 'types', 'dom', 'static', 'view', 'link', 'build', 'element', 'impl', 'function', 'test', \n", + " 'dev', 'event', 'mock', 'error', 'input', 'sdk', 'api', 'item', 'end', 'value', 'param', 'start'\n", + "])" ] }, { "cell_type": "code", - "execution_count": null, - "id": "62328fe4", + "execution_count": 255, + "id": "d2016fd0", "metadata": {}, "outputs": [], "source": [ - "# Join all words into one text separated by spaces\n", - "text = \" \".join(i for i in words.word)\n", - "number_of_words=len(words.word)\n", - "print(\"There are {} words in the dataset.\".format(number_of_words))\n", + "def 
plot_word_cloud_from_text(words: pd.DataFrame, title: str):\n", + " \n", + " # Join all words into one text separated by spaces\n", + " text = \" \".join(word for word in words.word)\n", + " number_of_words=len(words.word)\n", + " print(\"There are {} words in the dataset for the plot titled '{}'.\".format(number_of_words, title))\n", "\n", - "# Define stop words\n", - "stopwords = set(STOPWORDS)\n", - "stopwords.update(['builder', 'exception', 'abstract', 'helper', 'util', 'callback', 'factory', 'result',\n", - " 'handler', 'type', 'module', 'name', 'parameter', 'lambda', 'access', 'create', 'message', \n", - " 'ts', 'js', 'tsx', 'jsx', 'css', 'htm', 'html', 'props', 'use', 'id', 'ref', 'hook', 'event', \n", - " 'span', 'data', 'context', 'form', 'get', 'set', 'object', 'null', 'new'])\n", + " if number_of_words <= 0:\n", + " return\n", "\n", - "if number_of_words > 0:\n", " wordcloud = WordCloud(\n", " width=800, \n", - " height=400,\n", - " max_words=400, \n", - " stopwords=stopwords,\n", + " height=800,\n", + " max_words=600, \n", + " stopwords=customized_stop_words,\n", + " collocations=False,\n", " background_color='white', \n", " colormap='viridis'\n", " ).generate(text)\n", "\n", " # Plot the word cloud\n", - " plot.figure(figsize=(15,10))\n", + " plot.figure(figsize=(15,15))\n", + " plot.imshow(wordcloud, interpolation='bilinear')\n", + " plot.axis(\"off\")\n", + " plot.title(title)\n", + " plot.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 256, + "id": "9fbf37ac", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_word_cloud_with_frequency(words_with_frequency: pd.DataFrame, title: str):\n", + " \n", + " if words_with_frequency.empty:\n", + " return\n", + " \n", + " # Expects the first column of the DataFrame to contain the words/text and the second column to contain the count/frequency.\n", + " 
words_with_frequency_dict=words_with_frequency.set_index(words_with_frequency.columns[0]).to_dict()[words_with_frequency.columns[1]]\n", + " wordcloud = WordCloud(\n", + " width=800, \n", + " height=800,\n", + " max_words=600, \n", + " stopwords=customized_stop_words,\n", + " collocations=False,\n", + " background_color='white', \n", + " colormap='viridis'\n", + " ).generate_from_frequencies(words_with_frequency_dict)\n", + "\n", + " # Plot the word cloud\n", + " plot.figure(figsize=(15,15))\n", " plot.imshow(wordcloud, interpolation='bilinear')\n", " plot.axis(\"off\")\n", + " plot.title(title)\n", " plot.show()" ] + }, + { + "cell_type": "markdown", + "id": "58dd502f", + "metadata": {}, + "source": [ + "## WordCloud of names in code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ed729e3", + "metadata": {}, + "outputs": [], + "source": [ + "# Query data from graph database\n", + "words = query_cypher_to_data_frame(\"../cypher/Overview/Words_for_universal_Wordcloud.cypher\")\n", + "words.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b26bfe71", + "metadata": {}, + "outputs": [], + "source": [ + "plot_word_cloud_from_text(words=words, title='Wordcloud of names in code')" + ] + }, + { + "cell_type": "markdown", + "id": "9624f5ea", + "metadata": {}, + "source": [ + "## WordCloud of git authors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8ae94a2", + "metadata": {}, + "outputs": [], + "source": [ + "# Query data from graph database\n", + "git_author_words_with_frequency = query_cypher_to_data_frame(\"../cypher/Overview/Words_for_git_author_Wordcloud_with_frequency.cypher\")\n", + "git_author_words_with_frequency.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1a81f62", + "metadata": {}, + "outputs": [], + "source": [ + "plot_word_cloud_with_frequency(git_author_words_with_frequency, 'Wordcloud of git authors (using frequency)')" + ] } ], 
"metadata": { @@ -187,7 +284,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.11.9" }, "title": "Object Oriented Design Quality Metrics for Java with Neo4j" }, diff --git a/scripts/detectChangedFiles.sh b/scripts/detectChangedFiles.sh index 4193c5252..f4ae282c5 100755 --- a/scripts/detectChangedFiles.sh +++ b/scripts/detectChangedFiles.sh @@ -84,7 +84,7 @@ file_names_and_sizes() { if [ -d "$1" ]; then # If it's a directory, list all files inside # except for "node_modules", "target", "temp" and the change detection file itself - find "$1" \ + find -L "$1" \ -type d -name "node_modules" -prune -o \ -type d -name "target" -prune -o \ -type d -name "temp" -prune -o \ diff --git a/scripts/reports/ExternalDependenciesCsv.sh b/scripts/reports/ExternalDependenciesCsv.sh index adc8369d0..5f61f42e6 100755 --- a/scripts/reports/ExternalDependenciesCsv.sh +++ b/scripts/reports/ExternalDependenciesCsv.sh @@ -74,5 +74,8 @@ execute_cypher "${EXTERNAL_DEPENDENCIES_CYPHER_DIR}/External_module_usage_per_in execute_cypher "${EXTERNAL_DEPENDENCIES_CYPHER_DIR}/List_external_modules_resolved_to_internal_ones_for_Typescript.cypher" > "${FULL_REPORT_DIRECTORY}/External_modules_resolved_to_internal_ones_for_Typescript.csv" +execute_cypher "${EXTERNAL_DEPENDENCIES_CYPHER_DIR}/Package_json_dependencies_occurrence.cypher" > "${FULL_REPORT_DIRECTORY}/Package_json_dependencies_occurrence.csv" +execute_cypher "${EXTERNAL_DEPENDENCIES_CYPHER_DIR}/Package_json_dependencies_combinations.cypher" > "${FULL_REPORT_DIRECTORY}/Package_json_dependencies_combinations.csv" + # Clean-up after report generation. Empty reports will be deleted. 
source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${FULL_REPORT_DIRECTORY}" \ No newline at end of file diff --git a/scripts/scanTypescript.sh b/scripts/scanTypescript.sh index a7080040e..beb75af38 100755 --- a/scripts/scanTypescript.sh +++ b/scripts/scanTypescript.sh @@ -9,6 +9,7 @@ set -o errexit -o pipefail ARTIFACTS_DIRECTORY=${ARTIFACTS_DIRECTORY:-"artifacts"} SOURCE_DIRECTORY=${SOURCE_DIRECTORY:-"source"} +TYPESCRIPT_SCAN_HEAP_MEMORY=${TYPESCRIPT_SCAN_HEAP_MEMORY:-"4096"} # Heap memory in megabytes for Typescript scanning with (Node.js process). Defaults to 4096 MB. ## Get this "scripts" directory if not already set # Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution. @@ -54,7 +55,7 @@ else # For later troubleshooting, the output is also copied to a dedicated log file using "tee". # Note: Don't worry about the hardcoded version number. It will be updated by Renovate using a custom Manager. # Note: NODE_OPTIONS --max-old-space-size=4096 increases the memory for larger projects to scan - NODE_OPTIONS="${NODE_OPTIONS} --max-old-space-size=4096" npx --yes @jqassistant/ts-lce@1.3.0 "${directory}" --extension React 2>&1 | tee "${LOG_DIRECTORY}/jqassistant-typescript-scan-${directory_name}.log" >&2 + NODE_OPTIONS="${NODE_OPTIONS} --max-old-space-size=${TYPESCRIPT_SCAN_HEAP_MEMORY}" npx --yes @jqassistant/ts-lce@1.3.0 "${directory}" --extension React 2>&1 | tee "${LOG_DIRECTORY}/jqassistant-typescript-scan-${directory_name}.log" >&2 done changeDetectionReturnCode=$( source "${SCRIPTS_DIR}/detectChangedFiles.sh" --hashfile "${changeDetectionHashFilePath}" --paths "./${SOURCE_DIRECTORY}")