diff --git a/scripts/process-benchmarks.ipynb b/scripts/process-benchmarks.ipynb
index 8419a5f8..f60fba9a 100644
--- a/scripts/process-benchmarks.ipynb
+++ b/scripts/process-benchmarks.ipynb
@@ -93,19 +93,33 @@
     "    match = PATH_REGEX.match(path.name)\n",
     "    if match is None:\n",
     "        continue\n",
-    "    \n",
+    "\n",
     "    experiment = match.groupdict()\n",
-    "    \n",
+    "\n",
     "    with open(path, \"r\") as fp:\n",
     "        stats = json.load(fp)\n",
     "\n",
-    "    entry = [match[\"engine\"], match[\"m\"], match[\"ef\"], \n",
-    "             match[\"dataset\"], match[\"search_index\"], match[\"date\"], \n",
-    "             stats[\"params\"], stats[\"results\"]]\n",
+    "    params = stats[\"params\"]\n",
+    "    dataset = params.pop(\"dataset\")\n",
+    "    engine = params.pop(\"engine\")\n",
+    "\n",
+    "    entry = {\n",
+    "        \"dataset\": dataset,\n",
+    "        \"engine\": engine,\n",
+    "        \"m\": match[\"m\"],\n",
+    "        \"ef\": match[\"ef\"],\n",
+    "        \"date\": match[\"date\"],\n",
+    "        \"params\": params,\n",
+    "        \"results\": stats[\"results\"],\n",
+    "    }\n",
+    "\n",
     "    if experiment[\"operation\"] == \"search\":\n",
+    "        entry[\"search_index\"] = match[\"search_index\"]\n",
     "        search_results.append(entry)\n",
     "    elif experiment[\"operation\"] == \"upload\":\n",
     "        upload_results.append(entry)\n",
+    "    else:\n",
+    "        raise ValueError(f\"Unknown operation: {experiment['operation']}\")\n",
     "\n",
     "len(upload_results), len(search_results)"
    ]
@@ -113,18 +127,10 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2022-08-05T10:03:54.157465Z",
-     "start_time": "2022-08-05T10:03:54.153118Z"
-    },
-    "pycharm": {
-     "name": "#%%\n"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
-    "column_names = [\"engine\", \"m\", \"ef\", \"dataset\", \"search_index\", \"date\", \"params\", \"results\"]"
+    "upload_results[0], search_results[0]"
    ]
   },
   {
@@ -141,18 +147,15 @@
    },
    "outputs": [],
    "source": [
-    "upload_df = pd.DataFrame(upload_results, columns=column_names) \\\n",
-    "    .drop(columns=\"search_index\")\n",
+    "upload_df = pd.DataFrame(upload_results)\n",
     "upload_df[\"date\"] = pd.to_datetime(upload_df[\"date\"], format=\"%Y-%m-%d-%H-%M-%S\")\n",
     "upload_df = upload_df.sort_values(\"date\", ascending=False) \\\n",
     "    .groupby([\"engine\", \"m\", \"ef\", \"dataset\"]) \\\n",
-    "    .last()\n",
-    "upload_df = pd.concat([upload_df, upload_df[\"results\"].apply(pd.Series)], axis=1)\n",
-    "upload_df = upload_df.drop(columns=\"results\")\n",
-    "\n",
-    "print(len(upload_df))\n",
+    "    .first()\n",
     "\n",
-    "upload_df.sort_values(\"total_time\", ascending=True).head(n=5)"
+    "temp_df = upload_df.copy()\n",
+    "temp_df[\"total_time\"] = temp_df[\"results\"].apply(lambda x: x[\"total_time\"])\n",
+    "temp_df.sort_values(\"total_time\", ascending=True).head(n=5)"
    ]
   },
   {
@@ -169,18 +172,16 @@
    },
    "outputs": [],
    "source": [
-    "search_df = pd.DataFrame(search_results, columns=column_names)\n",
+    "search_df = pd.DataFrame(search_results)\n",
     "search_df[\"date\"] = pd.to_datetime(search_df[\"date\"], format=\"%Y-%m-%d-%H-%M-%S\")\n",
     "search_df = search_df.sort_values(\"date\", ascending=False) \\\n",
     "    .groupby([\"engine\", \"m\", \"ef\", \"dataset\", \"search_index\"]) \\\n",
     "    .first()\n",
     "\n",
-    "print(len(search_df))\n",
-    "\n",
-    "for column_name in [\"params\", \"results\"]:\n",
-    "    search_df = pd.concat([search_df, search_df[column_name].apply(pd.Series)], axis=1)\n",
-    "    search_df = search_df.drop(columns=column_name)\n",
-    "search_df.sort_values(\"rps\", ascending=False).head(n=10)"
+    "temp_df = search_df.copy()\n",
+    "temp_df[\"rps\"] = temp_df[\"results\"].apply(lambda x: x[\"rps\"])\n",
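+    "# show the 10 highest-throughput (rps) configurations\n",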
+ "temp_df.sort_values(\"rps\", ascending=False).head(n=10)" ] }, { @@ -203,50 +203,56 @@ "metadata": {}, "outputs": [], "source": [ - "json_all = []\n", - "json_1_or_100_thread = []\n", + "json_results = []\n", "\n", "for index, row in joined_df.reset_index().iterrows():\n", " engine_params = {}\n", - " if isinstance(row['search_params'], dict):\n", - " engine_params.update(row['search_params'])\n", - " if isinstance(row['params'], dict):\n", - " engine_params.update(row['params'])\n", + " \n", + " if isinstance(row['params_upload'], dict):\n", + " engine_params.update(row['params_upload'])\n", + " if isinstance(row['params_search'], dict):\n", + " search_params = row['params_search']\n", + " engine_params.update(search_params.get('config', {}))\n", + " engine_params.update(search_params.get('params', {}))\n", + " engine_params.update(search_params.get('search_params', {}))\n", + " engine_params.update(search_params.get('vectorIndexConfig', {}))\n", + "\n", + " engine_params.pop('experiment')\n", + " engine_params.pop('parallel')\n", "\n", " engine_name = row['engine']\n", "\n", - " if engine_name == \"qdrant-rps\" or engine_name == \"qdrant-bq-rps\" or engine_name == \"qdrant-sq-rps\":\n", + " if engine_name.startswith(\"qdrant-\"):\n", " engine_name = \"qdrant\"\n", "\n", " json_object = {\n", " \"engine_name\": engine_name,\n", - " \"setup_name\": f\"{row['engine']}-m-{row['m']}-ef-{row['ef']}\",\n", + " \"setup_name\": f\"{row['params_search']['experiment']}\",\n", " \"dataset_name\": row['dataset'],\n", - " # \"search_idx\": row['search_index'],\n", - " \"upload_time\": row['upload_time'],\n", - " \"total_upload_time\": row['total_time_upload'],\n", - " \"p95_time\": row['p95_time'],\n", - " \"rps\": row['rps'],\n", - " \"parallel\": row['parallel'],\n", - " \"p99_time\": row['p99_time'],\n", - " \"mean_time\": row['mean_time'],\n", - " \"mean_precisions\": row['mean_precisions'],\n", + " \"search_idx\": row['search_index'],\n", + " \"upload_time\": row['results_upload']['upload_time'],\n", + " \"total_upload_time\": row['results_upload']['total_time'],\n", + " \"p95_time\": row['results_search']['p95_time'],\n", + " \"rps\": row['results_search']['rps'],\n", + " \"parallel\": row['params_search']['parallel'],\n", + " \"p99_time\": row['results_search']['p99_time'],\n", + " \"mean_time\": row['results_search']['mean_time'],\n", + " \"mean_precisions\": row['results_search']['mean_precisions'],\n", " \"engine_params\": engine_params,\n", " }\n", - " json_all.append(json_object)\n", - " \n", - " parallel = row['parallel']\n", + " json_results.append(json_object)\n", "\n", - " if parallel == 1 or parallel == 100:\n", - " json_1_or_100_thread.append(json_object)\n", - "\n", - "format = '%Y-%M-%d' # T%H:%M:%S\n", + "format = '%Y-%M-%dT%H:%M:%S'\n", "now = datetime.now().replace(tzinfo=timezone.utc).strftime(format)\n", "\n", - "Path(f\"results-{now}.json\").write_text(json.dumps(json_all, indent=2))\n", - "Path(f\"results-1-100-threads-{now}.json\").write_text(json.dumps(json_1_or_100_thread, indent=2))\n", + "Path(f\"results.json\").write_text(json.dumps(json_results, indent=2))\n", + "Path(f\"results-{now}.json\").write_text(json.dumps(json_results, indent=2))\n", + "\n", + "print(json_results[-1], len(json_results))\n", "\n", - "json_1_or_100_thread[-1], len(json_all), len(json_1_or_100_thread)" + "results_df = pd.DataFrame(json_results).sort_values(\"p99_time\", ascending=True)\n", + "# results_df.to_csv('results.csv')\n", + "results_df" ] } ],