This repository was archived by the owner on May 17, 2024. It is now read-only.
Merged
13 changes: 8 additions & 5 deletions .github/workflows/ci.yml
@@ -32,15 +32,18 @@ jobs:
      uses: actions/setup-python@v3
      with:
        python-version: ${{ matrix.python-version }}

    - name: Build the stack
-     run: docker-compose up -d mysql
+     run: docker-compose up -d mysql postgres presto

    - name: Install Poetry
      run: pip install poetry

    - name: Install package
-     run: poetry install
+     run: "poetry install"

    - name: Run unit tests
-     run: poetry run python3 -m unittest
      env:
        DATADIFF_SNOWFLAKE_URI: '${{ secrets.DATADIFF_SNOWFLAKE_URI }}'
+       DATADIFF_PRESTO_URI: '${{ secrets.DATADIFF_PRESTO_URI }}'
+     run: poetry run unittest-parallel -j 16
1 change: 1 addition & 0 deletions .gitignore
@@ -134,6 +134,7 @@ ratings*.csv
drive
mysqltuner.pl
benchmark_*.jsonl
+benchmark_*.png

# Mac
.DS_Store
24 changes: 18 additions & 6 deletions README.md
@@ -171,9 +171,9 @@ Users can also install several drivers at once:
Usage: `data-diff DB1_URI TABLE1_NAME DB2_URI TABLE2_NAME [OPTIONS]`

See the [example command](#example-command-and-output) and the [sample
connection strings](#supported-databases).

Note that for some databases, the arguments that you enter in the command line
may be case-sensitive. This is the case for the Snowflake schema and table names.

Options:
@@ -423,11 +423,16 @@ $ docker-compose up -d mysql postgres # run mysql and postgres dbs in background

**3. Run Unit Tests**

+There are more than 1000 tests for all the different type and database
+combinations, so we recommend using `unittest-parallel`, which is installed as a
+development dependency.
+
```shell-session
-$ poetry run python3 -m unittest
+$ poetry run unittest-parallel -j 16 # run all tests
+$ poetry run python -m unittest -k <test> # run an individual test
```

-**4. Seed the Database(s)**
+**4. Seed the Database(s) (optional)**

First, download the CSVs of seeding data:

@@ -451,7 +456,7 @@ $ poetry run preql -f dev/prepare_db.pql mssql://<uri>
$ poetry run preql -f dev/prepare_db.pql bigquery:///<project>
```

-**5. Run **data-diff** against seeded database**
+**5. Run **data-diff** against seeded database (optional)**

```bash
poetry run python3 -m data_diff postgresql://postgres:Password1@localhost/postgres rating postgresql://postgres:Password1@localhost/postgres rating_del1 --verbose
@@ -460,7 +465,14 @@ poetry run python3 -m data_diff postgresql://postgres:Password1@localhost/postgr
**6. Run benchmarks (optional)**

```shell-session
-$ dev/benchmark.sh
+$ dev/benchmark.sh # runs benchmarks and puts results in benchmark_<sha>.jsonl
+$ poetry run python3 dev/graph.py # create graphs from benchmark_*.jsonl files
```

+You can adjust how many rows we benchmark with by passing `N_SAMPLES` to `dev/benchmark.sh`:
+
+```shell-session
+$ N_SAMPLES=100000000 dev/benchmark.sh # 100m rows, which is our canonical target
+```


Expand Down
13 changes: 13 additions & 0 deletions data_diff/utils.py
@@ -1,3 +1,5 @@
+import math
+
from typing import Sequence, Optional, Tuple, Union, Dict, Any
from uuid import UUID

@@ -38,3 +40,14 @@ def is_uuid(u):
    except ValueError:
        return False
    return True


def number_to_human(n):
    millnames = ["", "k", "m", "b"]
    n = float(n)
    millidx = max(
        0,
        min(len(millnames) - 1, int(math.floor(0 if n == 0 else math.log10(abs(n)) / 3))),
    )

    return "{:.0f}{}".format(n / 10 ** (3 * millidx), millnames[millidx])
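The new helper scales a row count down to a thousand-step suffix (`k`/`m`/`b`) for graph titles and file names. A quick standalone sketch of its behavior — the function body is copied from the hunk above, the example inputs are mine:

```python
import math


def number_to_human(n):
    # Pick a suffix by thousands-magnitude: 10^0 -> "", 10^3 -> "k", 10^6 -> "m", 10^9 -> "b".
    millnames = ["", "k", "m", "b"]
    n = float(n)
    millidx = max(
        0,
        min(len(millnames) - 1, int(math.floor(0 if n == 0 else math.log10(abs(n)) / 3))),
    )
    return "{:.0f}{}".format(n / 10 ** (3 * millidx), millnames[millidx])


print(number_to_human(0))            # "0"
print(number_to_human(999))          # "999"
print(number_to_human(1_000_000))    # "1m"
print(number_to_human(100_000_000))  # "100m" -- the benchmark's canonical size
```

Note the suffix list caps at `"b"`, so anything at 10^12 or beyond is still rendered in billions (e.g. `5000b`).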
19 changes: 19 additions & 0 deletions dev/benchmark.sh
@@ -0,0 +1,19 @@
#!/bin/bash

run_test() {
  N_SAMPLES=${N_SAMPLES:-1000000} N_THREADS=${N_THREADS:-16} LOG_LEVEL=${LOG_LEVEL:-info} BENCHMARK=1 \
    poetry run python3 -m unittest tests/test_database_types.py -v -k "$1"
}

run_test "postgresql_int_mysql_int"
run_test "mysql_int_mysql_int"
run_test "postgresql_int_postgresql_int"
run_test "postgresql_ts6_n_tz_mysql_ts0"
run_test "postgresql_ts6_n_tz_snowflake_ts9"
run_test "postgresql_int_presto_int"
run_test "postgresql_int_redshift_int"
run_test "postgresql_int_snowflake_int"
run_test "postgresql_int_bigquery_int"
run_test "snowflake_int_snowflake_int"

poetry run python dev/graph.py
56 changes: 56 additions & 0 deletions dev/graph.py
@@ -0,0 +1,56 @@
# Use this to graph the benchmarking results (see benchmark.sh)
#
# To run this:
# - pip install pandas
# - pip install plotly
# - pip install kaleido  # needed by fig.write_image for static PNG export
#

import glob

import pandas as pd
import plotly.graph_objects as go

from data_diff.utils import number_to_human

for benchmark_file in glob.glob("benchmark_*.jsonl"):
    rows = pd.read_json(benchmark_file, lines=True)
    rows["cloud"] = rows["test"].str.match(r".*(snowflake|redshift|presto|bigquery)")
    sha = benchmark_file.split("_")[1].split(".")[0]
    print(f"Generating graphs from {benchmark_file}..")

    for n_rows, group in rows.groupby("rows"):
        image_path = f"benchmark_{sha}_{number_to_human(n_rows)}.png"
        print(f"\t rows: {number_to_human(n_rows)}, image: {image_path}")

        r = group.drop_duplicates(subset=["name_human"])
        r = r.sort_values(by=["cloud", "source_type", "target_type", "name_human"])

        # One grouped bar per test: count(*), checksum diff, and naive download.
        fig = go.Figure(
            data=[
                go.Bar(
                    name="count(*)",
                    x=r["name_human"],
                    y=r["count_max_sec"],
                    text=r["count_max_sec"],
                    textfont=dict(color="blue"),
                ),
                go.Bar(
                    name="data-diff (checksum)",
                    x=r["name_human"],
                    y=r["checksum_sec"],
                    text=r["checksum_sec"],
                    textfont=dict(color="red"),
                ),
                go.Bar(
                    name="Download and compare †",
                    x=r["name_human"],
                    y=r["download_sec"],
                    text=r["download_sec"],
                    textfont=dict(color="green"),
                ),
            ]
        )
        fig.update_layout(title=f"data-diff {number_to_human(n_rows)} rows, sha: {sha}")
        fig.update_traces(texttemplate="%{text:.1f}", textposition="outside")
        fig.update_layout(uniformtext_minsize=2, uniformtext_mode="hide")
        fig.update_yaxes(title="Time")
        fig.write_image(image_path, scale=2)
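graph.py expects JSON Lines input, one benchmark result per line. For orientation, here is a hedged sketch of producing a file in the shape the script reads — the field names are taken from the columns the script accesses, but the values and the sha in the filename are invented for illustration:

```python
import json
import os
import tempfile

# A hypothetical benchmark row; field names mirror what graph.py reads
# (test, name_human, source_type, target_type, rows, *_sec timings).
rows = [
    {
        "test": "test_types_postgresql_int_mysql_int",
        "name_human": "postgresql -> mysql: int",
        "source_type": "int",
        "target_type": "int",
        "rows": 1_000_000,
        "count_max_sec": 1.2,
        "checksum_sec": 2.4,
        "download_sec": 9.8,
    },
]

# Write JSON Lines: one JSON object per line, git sha in the filename
# (benchmark_<sha>.jsonl), matching the glob in graph.py.
path = os.path.join(tempfile.mkdtemp(), "benchmark_abc123.jsonl")
with open(path, "w") as f:
    for row in rows:
        f.write(json.dumps(row) + "\n")
```

`pd.read_json(path, lines=True)` then yields one DataFrame row per line, which is what the grouping and plotting code above operates on.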