From 9eaf17212f38c96d7b5b943ae8e10cf8340fb413 Mon Sep 17 00:00:00 2001 From: Sara Ghodsi <55705790+saraghds@users.noreply.github.com> Date: Thu, 24 Apr 2025 19:18:14 -0400 Subject: [PATCH 1/6] feat(benchmark): collect benchmarks for last 5 versions of datafusion in line protocol format --- benchmarks/collect_bench.sh | 60 +++++++++++++++++++++++++++++++++++++ benchmarks/lineprotocol.py | 40 +++++++++++-------------- 2 files changed, 77 insertions(+), 23 deletions(-) create mode 100755 benchmarks/collect_bench.sh diff --git a/benchmarks/collect_bench.sh b/benchmarks/collect_bench.sh new file mode 100755 index 000000000000..7bd661ab4a06 --- /dev/null +++ b/benchmarks/collect_bench.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# This script is meant for developers of DataFusion -- it is runnable +# from the standard DataFusion development environment and uses cargo, +# etc., and orchestrates gathering data and running the benchmark binary to +# collect benchmarks from the current main and the last 5 major releases. 
+ +trap 'git checkout main' EXIT #checkout to main on exit +ARG1=$1 + +main(){ +timestamp=$(date +%s) +lp_file="results/$ARG1-$timestamp.lp" +mkdir -p results +touch $lp_file + +cp lineprotocol.py results/ + +git fetch upstream main +git checkout main + +# get current major version +output=$(cargo metadata --format-version=1 --no-deps | jq '.packages[] | select(.name == "datafusion") | .version') +major_version=$(echo "$output" | grep -oE '[0-9]+' | head -n1) + + +# run for current main +echo "current major version: $major_version" +export RESULTS_DIR="results/$major_version.0.0" +./bench.sh run $ARG1 +python3 results/lineprotocol.py $RESULTS_DIR/$ARG1.json >> $lp_file + +# run for last 5 major releases +for i in {1..5}; do + echo "running benchmark on $((major_version-i)).0.0" + git fetch upstream $((major_version-i)).0.0 + git checkout $((major_version-i)).0.0 + export RESULTS_DIR="results/$((major_version-i)).0.0" + ./bench.sh run $ARG1 + python3 results/lineprotocol.py $RESULTS_DIR/$ARG1.json >> $lp_file +done +} + +main \ No newline at end of file diff --git a/benchmarks/lineprotocol.py b/benchmarks/lineprotocol.py index 75e09b662e3e..9f9e37ce6ef4 100644 --- a/benchmarks/lineprotocol.py +++ b/benchmarks/lineprotocol.py @@ -1,23 +1,20 @@ #!/usr/bin/env python -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - - -""" +""" +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the + Converts a given json to LineProtocol format that can be visualised by grafana/other systems that support LineProtocol. @@ -27,10 +24,7 @@ benchmark,name=sort,version=28.0.0,datafusion_version=28.0.0,num_cpus=8 query="sort utf8",iteration=1,row_count=10838832,elapsed_ms=68694468 1691105678000000000 benchmark,name=sort,version=28.0.0,datafusion_version=28.0.0,num_cpus=8 query="sort utf8",iteration=2,row_count=10838832,elapsed_ms=63392883 1691105678000000000 benchmark,name=sort,version=28.0.0,datafusion_version=28.0.0,num_cpus=8 query="sort utf8",iteration=3,row_count=10838832,elapsed_ms=66388367 1691105678000000000 -""" -# sort.json -""" { "queries": [ { @@ -180,7 +174,7 @@ def main() -> None: ) options = parser.parse_args() - lineformat(options.baseline_path) + lineformat(options.path) From c6fa6a63ea9c22d7b07c33fd14f5ac4bd7d9fbb3 Mon Sep 17 00:00:00 2001 From: Sara Ghodsi <55705790+saraghds@users.noreply.github.com> Date: Thu, 24 Apr 2025 19:19:25 -0400 Subject: [PATCH 2/6] fix: bring back sort.json comment --- benchmarks/lineprotocol.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/lineprotocol.py b/benchmarks/lineprotocol.py index 
9f9e37ce6ef4..58cbbaaffd70 100644 --- a/benchmarks/lineprotocol.py +++ b/benchmarks/lineprotocol.py @@ -25,6 +25,7 @@ benchmark,name=sort,version=28.0.0,datafusion_version=28.0.0,num_cpus=8 query="sort utf8",iteration=2,row_count=10838832,elapsed_ms=63392883 1691105678000000000 benchmark,name=sort,version=28.0.0,datafusion_version=28.0.0,num_cpus=8 query="sort utf8",iteration=3,row_count=10838832,elapsed_ms=66388367 1691105678000000000 +sort.json { "queries": [ { From 0790500fed38aa3c5b7743280a5e1e64a2d06e42 Mon Sep 17 00:00:00 2001 From: Sara Ghodsi <55705790+saraghds@users.noreply.github.com> Date: Thu, 24 Apr 2025 19:20:55 -0400 Subject: [PATCH 3/6] fix: complete the license --- benchmarks/lineprotocol.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/lineprotocol.py b/benchmarks/lineprotocol.py index 58cbbaaffd70..5d4667d2cb7e 100644 --- a/benchmarks/lineprotocol.py +++ b/benchmarks/lineprotocol.py @@ -14,6 +14,8 @@ software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. Converts a given json to LineProtocol format that can be visualised by grafana/other systems that support LineProtocol. 
From c0e037abfd1711593fe574e3052ec99f3b28fa96 Mon Sep 17 00:00:00 2001 From: Sara Ghodsi <55705790+saraghds@users.noreply.github.com> Date: Fri, 25 Apr 2025 13:32:38 -0400 Subject: [PATCH 4/6] fix: elapsed time in milliseconds --- benchmarks/lineprotocol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/lineprotocol.py b/benchmarks/lineprotocol.py index 5d4667d2cb7e..5300eafbafd1 100644 --- a/benchmarks/lineprotocol.py +++ b/benchmarks/lineprotocol.py @@ -166,7 +166,7 @@ def lineformat( query_str = f"query=\"{query.query}\"" timestamp = f"{query.start_time*10**9}" for iter_num, result in enumerate(query.iterations): - print(f"{benchamrk_str} {query_str},iteration={iter_num},row_count={result.row_count},elapsed_ms={result.elapsed*1000:.0f} {timestamp}\n") + print(f"{benchamrk_str} {query_str},iteration={iter_num},row_count={result.row_count},elapsed_ms={result.elapsed:.0f} {timestamp}\n") def main() -> None: parser = ArgumentParser() From 81fb13b5876dada48decd07fb9dab1cf985a4a7e Mon Sep 17 00:00:00 2001 From: Sara Ghodsi <55705790+saraghds@users.noreply.github.com> Date: Fri, 25 Apr 2025 13:45:04 -0400 Subject: [PATCH 5/6] fix: 3-digit float number as elapsed --- benchmarks/lineprotocol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/lineprotocol.py b/benchmarks/lineprotocol.py index 5300eafbafd1..2e282b056447 100644 --- a/benchmarks/lineprotocol.py +++ b/benchmarks/lineprotocol.py @@ -166,7 +166,7 @@ def lineformat( query_str = f"query=\"{query.query}\"" timestamp = f"{query.start_time*10**9}" for iter_num, result in enumerate(query.iterations): - print(f"{benchamrk_str} {query_str},iteration={iter_num},row_count={result.row_count},elapsed_ms={result.elapsed:.0f} {timestamp}\n") + print(f"{benchamrk_str} {query_str},iteration={iter_num},row_count={result.row_count},elapsed_ms={result.elapsed:.3f} {timestamp}\n") def main() -> None: parser = ArgumentParser() From 9a997d708016b4e52ab1476aba85217da7139519 
Mon Sep 17 00:00:00 2001 From: Sara Ghodsi <55705790+saraghds@users.noreply.github.com> Date: Tue, 13 May 2025 16:48:15 -0400 Subject: [PATCH 6/6] fix: json file names --- benchmarks/collect_bench.sh | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/benchmarks/collect_bench.sh b/benchmarks/collect_bench.sh index 7bd661ab4a06..3da5531c4a97 100755 --- a/benchmarks/collect_bench.sh +++ b/benchmarks/collect_bench.sh @@ -24,12 +24,22 @@ trap 'git checkout main' EXIT #checkout to main on exit ARG1=$1 +get_json_file_name() { + if [[ "$1" == "tpch" ]]; then + echo "tpch_sf1" + else + echo "$1" + fi +} + main(){ timestamp=$(date +%s) lp_file="results/$ARG1-$timestamp.lp" mkdir -p results touch $lp_file +json_file_name=$(get_json_file_name $ARG1) + cp lineprotocol.py results/ git fetch upstream main @@ -44,7 +54,7 @@ major_version=$(echo "$output" | grep -oE '[0-9]+' | head -n1) echo "current major version: $major_version" export RESULTS_DIR="results/$major_version.0.0" ./bench.sh run $ARG1 -python3 results/lineprotocol.py $RESULTS_DIR/$ARG1.json >> $lp_file +python3 results/lineprotocol.py $RESULTS_DIR/$json_file_name.json >> $lp_file # run for last 5 major releases for i in {1..5}; do @@ -53,7 +63,7 @@ for i in {1..5}; do git checkout $((major_version-i)).0.0 export RESULTS_DIR="results/$((major_version-i)).0.0" ./bench.sh run $ARG1 - python3 results/lineprotocol.py $RESULTS_DIR/$ARG1.json >> $lp_file + python3 results/lineprotocol.py $RESULTS_DIR/$json_file_name.json >> $lp_file done }