|
15 | 15 | "- [jqassistant](https://jqassistant.org)\n", |
16 | 16 | "- [notebook walks through examples for integrating various packages with Neo4j](https://nicolewhite.github.io/neo4j-jupyter/hello-world.html)\n", |
17 | 17 | "- [OO Design Quality Metrics](https://api.semanticscholar.org/CorpusID:18246616)\n", |
18 | | - "- [py2neo](https://py2neo.org/2021.1/)" |
| 18 | + "- [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver/current)" |
19 | 19 | ] |
20 | 20 | }, |
21 | 21 | { |
|
100 | 100 | "</style>" |
101 | 101 | ] |
102 | 102 | }, |
103 | | - { |
104 | | - "attachments": {}, |
105 | | - "cell_type": "markdown", |
106 | | - "id": "91d80bf7", |
107 | | - "metadata": {}, |
108 | | - "source": [ |
109 | | - "## Artifacts\n", |
110 | | - "\n", |
111 | | - "#### Table 1\n", |
112 | | - "\n", |
113 | | - "- List all the artifacts this notebook is based on" |
114 | | - ] |
115 | | - }, |
116 | | - { |
117 | | - "cell_type": "code", |
118 | | - "execution_count": null, |
119 | | - "id": "dc682db6", |
120 | | - "metadata": {}, |
121 | | - "outputs": [], |
122 | | - "source": [ |
123 | | - "query_cypher_to_data_frame(\"../cypher/List_all_existing_artifacts.cypher\")" |
124 | | - ] |
125 | | - }, |
126 | 103 | { |
127 | 104 | "attachments": {}, |
128 | 105 | "cell_type": "markdown", |
|
256 | 233 | "\n", |
257 | 234 | "#### Table 6\n", |
258 | 235 | "\n", |
259 | | - "- Show the top 20 packages with the highest distance from the \"main sequence\"" |
| 236 | + "- Show the top 30 packages with the highest distance from the \"main sequence\"" |
260 | 237 | ] |
261 | 238 | }, |
262 | 239 | { |
|
267 | 244 | "outputs": [], |
268 | 245 | "source": [ |
269 | 246 | "instabilityPerAbstractness = query_cypher_to_data_frame(\"../cypher/Metrics/Calculate_distance_between_abstractness_and_instability.cypher\")\n", |
270 | | - "instabilityPerAbstractness.head(20)" |
| 247 | + "instabilityPerAbstractness.head(30)" |
271 | 248 | ] |
272 | 249 | }, |
273 | 250 | { |
|
295 | 272 | "# Function that returns the number of past (index smaller than given index) rows \n", |
296 | 273 | "# with the same value in columnName1 and columnName2\n", |
297 | 274 | "# (0 if no earlier row has the same columnName1 and columnName2 values)\n", |
298 | | - "def countPastEntriesWithSameValues(dataFrame, index, columnName1, columnName2):\n", |
299 | | - " columnValue1 = dataFrame[columnName1][index]\n", |
300 | | - " columnValue2 = dataFrame[columnName2][index]\n", |
301 | | - " return len(dataFrame[\n", |
302 | | - " (dataFrame.index.isin(range(0, index + 1))) & \n", |
303 | | - " (dataFrame[columnName1]==columnValue1) & \n", |
304 | | - " (dataFrame[columnName2]==columnValue2)\n", |
305 | | - " ]) - 1" |
| 275 | + "# def countPastEntriesWithSameValues(dataFrame, index, columnName1, columnName2):\n", |
| 276 | + "# columnValue1 = dataFrame[columnName1][index]\n", |
| 277 | + "# columnValue2 = dataFrame[columnName2][index]\n", |
| 278 | + "# return len(dataFrame[\n", |
| 279 | + "# (dataFrame.index.isin(range(0, index + 1))) & \n", |
| 280 | + "# (dataFrame[columnName1]==columnValue1) & \n", |
| 281 | + "# (dataFrame[columnName2]==columnValue2)\n", |
| 282 | + "# ]) - 1" |
| 283 | + ] |
| 284 | + }, |
| 285 | + { |
| 286 | + "cell_type": "code", |
| 287 | + "execution_count": null, |
| 288 | + "id": "36d8cf50", |
| 289 | + "metadata": {}, |
| 290 | + "outputs": [], |
| 291 | + "source": [ |
| 292 | + "instabilityPerAbstractness.packageName[0]" |
306 | 293 | ] |
307 | 294 | }, |
308 | 295 | { |
|
312 | 299 | "metadata": {}, |
313 | 300 | "outputs": [], |
314 | 301 | "source": [ |
| 302 | + "def annotate_plot(data_frame: pd.DataFrame, index: int):\n", |
| 303 | + " \"\"\"\n", |
| 304 | + " Annotates the data point identified by \"index\" in the plot of \"data_frame\" with its artifact and package name.\n", |
| 305 | + " \"\"\"\n", |
| 306 | + " x_position = data_frame.abstractness[index].item()\n", |
| 307 | + " y_position = data_frame.instability[index].item()\n", |
| 308 | + " artifact_name = data_frame.artifactName[index].item()\n", |
| 309 | + " package_name = data_frame.packageName[index].item()\n", |
| 310 | + "\n", |
| 311 | + " label_box=dict(boxstyle=\"round4,pad=0.5\", fc=\"w\", alpha=0.8)\n", |
| 312 | + " plot.annotate(artifact_name + '\\n' + package_name\n", |
| 313 | + " ,xy=(x_position, y_position)\n", |
| 314 | + " ,xycoords='data'\n", |
| 315 | + " ,xytext=(20, 0)\n", |
| 316 | + " ,textcoords='offset points'\n", |
| 317 | + " ,size=6\n", |
| 318 | + " ,bbox=label_box\n", |
| 319 | + " ,arrowprops=dict(arrowstyle=\"-|>\", mutation_scale=10, color=\"black\")\n", |
| 320 | + " )\n", |
| 321 | + "\n", |
| 322 | + "def index_of_sorted(data_frame: pd.DataFrame, highest: list[str] = []):\n", |
| 323 | + " \"\"\"\n", |
| 324 | + " Sorts the \"data_frame\" by columns 'abstractness','instability','typesInPackage', 'artifactName'\n", |
| 325 | + " and returns the index of the first row.\n", |
| 326 | + " 'abstractness' and 'instability' are sorted descending if listed in \"highest\" (ascending otherwise); 'typesInPackage' is always sorted descending and 'artifactName' ascending.\n", |
| 327 | + " \"\"\"\n", |
| 328 | + " by = ['abstractness','instability','typesInPackage', 'artifactName']\n", |
| 329 | + " ascending = [('abstractness' not in highest), ('instability' not in highest), False, True]\n", |
| 330 | + " return data_frame.sort_values(by=by, ascending=ascending).head(1).index\n", |
| 331 | + "\n", |
| 332 | + "\n", |
315 | 333 | "# data points scaled by the number of types and colored by the distance to the \"main sequence\"\n", |
316 | 334 | "plot.scatter(\n", |
317 | 335 | " instabilityPerAbstractness.abstractness, # x axis shows abstractness\n", |
|
323 | 341 | "# green \"main sequence\" line\n", |
324 | 342 | "plot.plot([0,1], [1,0], c='lightgreen', linestyle='dashed') \n", |
325 | 343 | "\n", |
326 | | - "# add the packagenames to the those with the 15 highest distance values\n", |
327 | | - "distanceAnnotationThreshold = instabilityPerAbstractness.distance.nlargest(15).iloc[-1]\n", |
328 | | - "# (variant) highest 15% (quantile) of all distance values\n", |
329 | | - "# distanceAnnotationThreshold = instabilityPerAbstractness.distance.quantile(0.85)\n", |
330 | | - "for i, name in enumerate(instabilityPerAbstractness.packageName):\n", |
331 | | - " if (instabilityPerAbstractness.distance[i] >= distanceAnnotationThreshold):\n", |
332 | | - " x_position = instabilityPerAbstractness.abstractness[i]\n", |
333 | | - " y_position = instabilityPerAbstractness.instability[i]\n", |
334 | | - " # To overcome overlapping text annotations for multiple data points on the same position, \n", |
335 | | - " # entries with same position values in the past indizes are count and used to offset the y-position\n", |
336 | | - " # so that multiple names are written underneath each other.\n", |
337 | | - " alreadyExistingPositions = countPastEntriesWithSameValues(instabilityPerAbstractness, i, 'abstractness', 'instability')\n", |
338 | | - " y_position = y_position - alreadyExistingPositions / len(instabilityPerAbstractness) * 2\n", |
339 | | - " \n", |
340 | | - " plot.annotate(name, (x_position, y_position), size=6)\n", |
341 | | - " \n", |
| 344 | + "# Annotate largest package with the highest abstractness and instability\n", |
| 345 | + "annotation_index = index_of_sorted(highest=['abstractness','instability'], data_frame=instabilityPerAbstractness)\n", |
| 346 | + "annotate_plot(instabilityPerAbstractness, annotation_index)\n", |
| 347 | + "\n", |
| 348 | + "# Annotate largest package with the lowest abstractness and highest instability\n", |
| 349 | + "annotation_index = index_of_sorted(highest=['instability'], data_frame=instabilityPerAbstractness)\n", |
| 350 | + "annotate_plot(instabilityPerAbstractness, annotation_index)\n", |
| 351 | + "\n", |
| 352 | + "# Annotate largest package with the lowest abstractness and lowest instability\n", |
| 353 | + "annotation_index = index_of_sorted(highest=[], data_frame=instabilityPerAbstractness)\n", |
| 354 | + "annotate_plot(instabilityPerAbstractness, annotation_index)\n", |
| 355 | + "\n", |
| 356 | + "# Annotate largest package with the highest abstractness and lowest instability\n", |
| 357 | + "annotation_index = index_of_sorted(highest=['abstractness'], data_frame=instabilityPerAbstractness)\n", |
| 358 | + "annotate_plot(instabilityPerAbstractness, annotation_index)\n", |
| 359 | + "\n", |
| 360 | + "# Annotate largest package with the highest abstractness and instability among those where both are at most 0.5\n", |
| 361 | + "annotation_index = index_of_sorted(highest=['abstractness', 'instability'], data_frame=instabilityPerAbstractness.query('abstractness <= 0.5 & instability <= 0.5'))\n", |
| 362 | + "annotate_plot(instabilityPerAbstractness, annotation_index)\n", |
| 363 | + "\n", |
342 | 364 | "plot.title('Abstractness vs. Instability (\"Main Sequence\")')\n", |
343 | 365 | "plot.xlabel('Abstractness')\n", |
344 | 366 | "plot.ylabel('Instability')\n", |
|
0 commit comments