Merge pull request #299 from datafold/nov17_tests

erezsh · web-flow · commit 5449a047a071 · 2022-11-19T13:53:53.000-03:00
Refactor tests to use insert_rows_in_batches(), instead of internally…
diff --git a/data_diff/diff_tables.py b/data_diff/diff_tables.py
@@ -185,7 +185,8 @@ def _bisect_and_diff_tables(self, table1, table2, info_tree):
             raise NotImplementedError(f"Cannot use column of type {key_type} as a key")
         if not isinstance(key_type2, IKey):
             raise NotImplementedError(f"Cannot use column of type {key_type2} as a key")
-        assert key_type.python_type is key_type2.python_type
+        if key_type.python_type is not key_type2.python_type:
+            raise TypeError(f"Incompatible key types: {key_type} and {key_type2}")
 
         # Query min/max values
         key_ranges = self._threaded_call_as_completed("query_key_range", [table1, table2])
diff --git a/data_diff/sqeleton/databases/base.py b/data_diff/sqeleton/databases/base.py
@@ -8,9 +8,10 @@
 import threading
 from abc import abstractmethod
 from uuid import UUID
+import decimal
 
 from ..utils import is_uuid, safezip
-from ..queries import Expr, Compiler, table, Select, SKIP, Explain
+from ..queries import Expr, Compiler, table, Select, SKIP, Explain, Code
 from .database_types import (
     AbstractDatabase,
     AbstractDialect,
@@ -133,10 +134,15 @@ def _constant_value(self, v):
         elif isinstance(v, str):
             return f"'{v}'"
         elif isinstance(v, datetime):
-            # TODO use self.timestamp_value
-            return f"timestamp '{v}'"
+            return self.timestamp_value(v)
         elif isinstance(v, UUID):
             return f"'{v}'"
+        elif isinstance(v, decimal.Decimal):
+            return str(v)
+        elif isinstance(v, bytearray):
+            return f"'{v.decode()}'"
+        elif isinstance(v, Code):
+            return v.code
         return repr(v)
 
     def constant_values(self, rows) -> str:
@@ -334,7 +340,7 @@ def _process_table_schema(
         # Return a dict of form {name: type} after normalization
         return col_dict
 
-    def _refine_coltypes(self, table_path: DbPath, col_dict: Dict[str, ColType], where: str = None, sample_size=32):
+    def _refine_coltypes(self, table_path: DbPath, col_dict: Dict[str, ColType], where: str = None, sample_size=64):
         """Refine the types in the column dict, by querying the database for a sample of their values
 
         'where' restricts the rows to be sampled.
diff --git a/data_diff/sqeleton/databases/clickhouse.py b/data_diff/sqeleton/databases/clickhouse.py
@@ -8,6 +8,7 @@
     ThreadedDatabase,
     import_helper,
     ConnectError,
+    DbTime,
 )
 from .database_types import (
     ColType,
@@ -146,6 +147,10 @@ def _parse_type_repr(self, type_repr: str) -> Optional[Type[ColType]]:
 
         return self.TYPE_CLASSES.get(type_repr)
 
+    # def timestamp_value(self, t: DbTime) -> str:
+    #     # return f"'{t}'"
+    #     return f"'{str(t)[:19]}'"
+
 
 class Clickhouse(ThreadedDatabase):
     dialect = Dialect()
diff --git a/data_diff/sqeleton/queries/__init__.py b/data_diff/sqeleton/queries/__init__.py
@@ -1,4 +1,4 @@
 from .compiler import Compiler
 from .api import this, join, outerjoin, table, SKIP, sum_, avg, min_, max_, cte, commit, when, coalesce
-from .ast_classes import Expr, ExprNode, Select, Count, BinOp, Explain, In
+from .ast_classes import Expr, ExprNode, Select, Count, BinOp, Explain, In, Code
 from .extras import Checksum, NormalizeAsString, ApplyFuncAndNormalizeAsString
diff --git a/data_diff/sqeleton/queries/api.py b/data_diff/sqeleton/queries/api.py
@@ -86,4 +86,24 @@ def coalesce(*exprs):
     return Func("COALESCE", exprs)
 
 
+def insert_rows_in_batches(db, table: TablePath, rows, *, columns=None, batch_size=1024 * 8):
+    """Insert ``rows`` into ``table``, issuing one INSERT query per batch.
+
+    ``rows`` may be any iterable (it is materialized into a list first).
+    ``columns`` is passed through to ``table.insert_rows()``.
+    Each query carries at most ``batch_size`` rows.
+
+    Raises ValueError if ``batch_size`` is not positive.
+    """
+    # Validate explicitly: an ``assert`` is stripped under ``python -O``, and a
+    # non-positive batch size must never reach the slicing below.
+    if batch_size <= 0:
+        raise ValueError(f"batch_size must be positive, got {batch_size}")
+    rows = list(rows)
+
+    # Slice by offset instead of re-copying the remaining tail on every pass.
+    for start in range(0, len(rows), batch_size):
+        db.query(table.insert_rows(rows[start : start + batch_size], columns=columns))
+
+
 commit = Commit()
diff --git a/data_diff/sqeleton/queries/ast_classes.py b/data_diff/sqeleton/queries/ast_classes.py
@@ -43,6 +43,21 @@ def cast_to(self, to):
 
 Expr = Union[ExprNode, str, bool, int, datetime, ArithString, None]
 
+
+@dataclass
+class Code(ExprNode):
+    """Expression node wrapping a SQL snippet that is emitted exactly as written.
+
+    No quoting or escaping is applied to ``code``, so it must never be built
+    from untrusted input.
+    """
+
+    code: str
+
+    def compile(self, c: Compiler) -> str:
+        # The compiler is unused: the snippet is returned verbatim.
+        return self.code
+
 
 def _expr_type(e: Expr) -> type:
     if isinstance(e, ExprNode):
diff --git a/tests/common.py b/tests/common.py
@@ -13,6 +13,7 @@
 from data_diff import tracking
 from data_diff import connect
 from data_diff.sqeleton.queries.api import table
+from data_diff.sqeleton.databases import Database
 from data_diff.query_utils import drop_table
 
 tracking.disable_tracking()
@@ -85,7 +86,7 @@ def get_git_revision_short_hash() -> str:
 _database_instances = {}
 
 
-def get_conn(cls: type):
+def get_conn(cls: type) -> Database:
     if cls not in _database_instances:
         _database_instances[cls] = connect(CONN_STRINGS[cls], N_THREADS)
     return _database_instances[cls]
diff --git a/tests/test_database_types.py b/tests/test_database_types.py
@@ -18,7 +18,8 @@
 from data_diff.query_utils import drop_table
 from data_diff.utils import accumulate
 from data_diff.sqeleton.utils import number_to_human
-from data_diff.sqeleton.queries import table, commit
+from data_diff.sqeleton.queries import table, commit, this, Code
+from data_diff.sqeleton.queries.api import insert_rows_in_batches
 from data_diff.hashdiff_tables import HashDiffer, DEFAULT_BISECTION_THRESHOLD
 from data_diff.table_segment import TableSegment
 from .common import (
@@ -362,32 +363,25 @@ class PaginatedTable:
     # much memory.
     RECORDS_PER_BATCH = 1000000
 
-    def __init__(self, table, conn):
-        self.table = table
+    def __init__(self, table_path, conn):
+        self.table_path = table_path
         self.conn = conn
 
     def __iter__(self):
-        iter = PaginatedTable(self.table, self.conn)
-        iter.last_id = 0
-        iter.values = []
-        iter.value_index = 0
-        return iter
-
-    def __next__(self) -> str:
-        if self.value_index == len(self.values):  #  end of current batch
-            query = f"SELECT id, col FROM {self.table} WHERE id > {self.last_id} ORDER BY id ASC LIMIT {self.RECORDS_PER_BATCH}"
-            if isinstance(self.conn, db.Oracle):
-                query = f"SELECT id, col FROM {self.table} WHERE id > {self.last_id} ORDER BY id ASC OFFSET 0 ROWS FETCH NEXT {self.RECORDS_PER_BATCH} ROWS ONLY"
-
-            self.values = self.conn.query(query, list)
-            if len(self.values) == 0:  #  we must be done!
-                raise StopIteration
-            self.last_id = self.values[-1][0]
-            self.value_index = 0
-
-        this_value = self.values[self.value_index]
-        self.value_index += 1
-        return this_value
+        last_id = 0
+        while True:
+            query = (
+                table(self.table_path)
+                .select(this.id, this.col)
+                .where(this.id > last_id)
+                .order_by(this.id)
+                .limit(self.RECORDS_PER_BATCH)
+            )
+            rows = self.conn.query(query, list)
+            if not rows:
+                break
+            last_id = rows[-1][0]
+            yield from rows
 
 
 class DateTimeFaker:
@@ -560,90 +554,42 @@ def expand_params(testcase_func, param_num, param):
     return name
 
 
-def _insert_to_table(conn, table, values, type):
-    current_n_rows = conn.query(f"SELECT COUNT(*) FROM {table}", int)
+def _insert_to_table(conn, table_path, values, type):
+    tbl = table(table_path)
+
+    current_n_rows = conn.query(tbl.count(), int)
     if current_n_rows == N_SAMPLES:
         assert BENCHMARK, "Table should've been deleted, or we should be in BENCHMARK mode"
         return
     elif current_n_rows > 0:
-        conn.query(drop_table(table))
-        _create_table_with_indexes(conn, table, type)
-
-    if BENCHMARK and N_SAMPLES > 10_000:
-        description = f"{conn.name}: {table}"
-        values = rich.progress.track(values, total=N_SAMPLES, description=description)
-
-    default_insertion_query = f"INSERT INTO {table} (id, col) VALUES "
-    if isinstance(conn, db.Oracle):
-        default_insertion_query = f"INSERT INTO {table} (id, col)"
-
-    batch_size = 8000
-    if isinstance(conn, db.BigQuery):
-        batch_size = 1000
-
-    insertion_query = default_insertion_query
-    selects = []
-    for j, sample in values:
-        if re.search(r"(time zone|tz)", type):
-            sample = sample.replace(tzinfo=timezone.utc)
+        drop_table(conn, table_path)
+        _create_table_with_indexes(conn, table_path, type)
 
-        if isinstance(sample, bytearray):
-            value = f"'{sample.decode()}'"
+    # if BENCHMARK and N_SAMPLES > 10_000:
+    #     description = f"{conn.name}: {table}"
+    #     values = rich.progress.track(values, total=N_SAMPLES, description=description)
 
-        elif type == "boolean":
-            value = str(bool(sample))
+    if type == "boolean":
+        values = [(i, bool(sample)) for i, sample in values]
+    elif re.search(r"(time zone|tz)", type):
+        values = [(i, sample.replace(tzinfo=timezone.utc)) for i, sample in values]
 
-        elif isinstance(conn, db.Clickhouse):
-            if type.startswith("DateTime64"):
-                value = f"'{sample.replace(tzinfo=None)}'"
+    if isinstance(conn, db.Clickhouse):
+        if type.startswith("DateTime64"):
+            values = [(i, f"{sample.replace(tzinfo=None)}") for i, sample in values]
 
-            elif type == "DateTime":
-                sample = sample.replace(tzinfo=None)
-                # Clickhouse's DateTime does not allow to store micro/milli/nano seconds
-                value = f"'{str(sample)[:19]}'"
+        elif type == "DateTime":
+            # Clickhouse's DateTime does not allow to store micro/milli/nano seconds
+            values = [(i, str(sample)[:19]) for i, sample in values]
 
-            elif type.startswith("Decimal"):
-                precision = int(type[8:].rstrip(")").split(",")[1])
-                value = round(sample, precision)
+        elif type.startswith("Decimal("):
+            precision = int(type[8:].rstrip(")").split(",")[1])
+            values = [(i, round(sample, precision)) for i, sample in values]
+    elif isinstance(conn, db.BigQuery) and type == "datetime":
+        values = [(i, Code(f"cast(timestamp '{sample}' as datetime)")) for i, sample in values]
 
-            else:
-                value = f"'{sample}'"
-
-        elif isinstance(sample, (float, Decimal, int)):
-            value = str(sample)
-        elif isinstance(sample, datetime) and isinstance(conn, (db.Presto, db.Oracle, db.Trino)):
-            value = f"timestamp '{sample}'"
-        elif isinstance(sample, datetime) and isinstance(conn, db.BigQuery) and type == "datetime":
-            value = f"cast(timestamp '{sample}' as datetime)"
-
-        else:
-            value = f"'{sample}'"
-
-        if isinstance(conn, db.Oracle):
-            selects.append(f"SELECT {j}, {value} FROM dual")
-        else:
-            insertion_query += f"({j}, {value}),"
-
-        # Some databases want small batch sizes...
-        # Need to also insert on the last row, might not divide cleanly!
-        if j % batch_size == 0 or j == N_SAMPLES:
-            if isinstance(conn, db.Oracle):
-                insertion_query += " UNION ALL ".join(selects)
-                conn.query(insertion_query, None)
-                selects = []
-                insertion_query = default_insertion_query
-            else:
-                conn.query(insertion_query[0:-1], None)
-                insertion_query = default_insertion_query
-
-    if insertion_query != default_insertion_query:
-        # Very bad, but this whole function needs to go
-        if isinstance(conn, db.Oracle):
-            insertion_query += " UNION ALL ".join(selects)
-            conn.query(insertion_query, None)
-        else:
-            conn.query(insertion_query[0:-1], None)
 
+    insert_rows_in_batches(conn, tbl, values, columns=["id", "col"])
     conn.query(commit)
 
 
@@ -676,17 +622,27 @@ def _create_indexes(conn, table):
             raise (err)
 
 
-def _create_table_with_indexes(conn, table, type):
+def _create_table_with_indexes(conn, table_path, type_):
+    table_name = ".".join(map(conn.dialect.quote, table_path))
+
+    tbl = table(
+        table_path,
+        schema={
+            "id": int,
+            "col": type_,
+        },
+    )
+
     if isinstance(conn, db.Oracle):
-        already_exists = conn.query(f"SELECT COUNT(*) from tab where tname='{table.upper()}'", int) > 0
+        already_exists = conn.query(f"SELECT COUNT(*) from tab where tname='{table_name.upper()}'", int) > 0
         if not already_exists:
-            conn.query(f"CREATE TABLE {table}(id int, col {type})", None)
+            conn.query(tbl.create())
     elif isinstance(conn, db.Clickhouse):
-        conn.query(f"CREATE TABLE {table}(id int, col {type}) engine = Memory;", None)
+        conn.query(f"CREATE TABLE {table_name}(id int, col {type_}) engine = Memory;", None)
     else:
-        conn.query(f"CREATE TABLE IF NOT EXISTS {table}(id int, col {type})", None)
+        conn.query(tbl.create(if_not_exists=True))
 
-    _create_indexes(conn, table)
+    _create_indexes(conn, table_name)
     conn.query(commit)
 
 
@@ -725,17 +681,15 @@ def test_types(self, source_db, target_db, source_type, target_type, type_catego
 
         self.src_table_path = src_table_path = src_conn.parse_table_name(src_table_name)
         self.dst_table_path = dst_table_path = dst_conn.parse_table_name(dst_table_name)
-        self.src_table = src_table = ".".join(map(src_conn.dialect.quote, src_table_path))
-        self.dst_table = dst_table = ".".join(map(dst_conn.dialect.quote, dst_table_path))
 
         start = time.monotonic()
         if not BENCHMARK:
             drop_table(src_conn, src_table_path)
-        _create_table_with_indexes(src_conn, src_table, source_type)
-        _insert_to_table(src_conn, src_table, enumerate(sample_values, 1), source_type)
+        _create_table_with_indexes(src_conn, src_table_path, source_type)
+        _insert_to_table(src_conn, src_table_path, enumerate(sample_values, 1), source_type)
         insertion_source_duration = time.monotonic() - start
 
-        values_in_source = PaginatedTable(src_table, src_conn)
+        values_in_source = PaginatedTable(src_table_path, src_conn)
         if source_db is db.Presto or source_db is db.Trino:
             if source_type.startswith("decimal"):
                 values_in_source = ((a, Decimal(b)) for a, b in values_in_source)
@@ -745,8 +699,8 @@ def test_types(self, source_db, target_db, source_type, target_type, type_catego
         start = time.monotonic()
         if not BENCHMARK:
             drop_table(dst_conn, dst_table_path)
-        _create_table_with_indexes(dst_conn, dst_table, target_type)
-        _insert_to_table(dst_conn, dst_table, values_in_source, target_type)
+        _create_table_with_indexes(dst_conn, dst_table_path, target_type)
+        _insert_to_table(dst_conn, dst_table_path, values_in_source, target_type)
         insertion_target_duration = time.monotonic() - start
 
         if type_category == "uuid":
@@ -813,8 +767,8 @@ def test_types(self, source_db, target_db, source_type, target_type, type_catego
             "rows": N_SAMPLES,
             "rows_human": number_to_human(N_SAMPLES),
             "name_human": f"{source_db.__name__}/{sanitize(source_type)} <-> {target_db.__name__}/{sanitize(target_type)}",
-            "src_table": src_table[1:-1],  #  remove quotes
-            "target_table": dst_table[1:-1],
+            "src_table": src_table_path,
+            "target_table": dst_table_path,
             "source_type": source_type,
             "target_type": target_type,
             "insertion_source_sec": round(insertion_source_duration, 3),
diff --git a/tests/test_diff_tables.py b/tests/test_diff_tables.py
@@ -731,7 +731,10 @@ def test_left_table_empty(self):
 
 class TestInfoTree(unittest.TestCase):
     def test_info_tree_root(self):
-        self.ddb = get_conn(db.DuckDB)
+        try:
+            self.db = get_conn(db.DuckDB)
+        except KeyError:    # ddb not defined
+            self.db = get_conn(db.MySQL)
 
         table_suffix = random_table_suffix()
         self.table_src_name = f"src{table_suffix}"
@@ -750,10 +753,10 @@ def test_info_tree_root(self):
             self.table2.insert_rows([i] for i in range(2000)),
         ]
         for q in queries:
-            self.ddb.query(q)
+            self.db.query(q)
 
-        ts1 = TableSegment(self.ddb, self.table1.path, ("id",))
-        ts2 = TableSegment(self.ddb, self.table2.path, ("id",))
+        ts1 = TableSegment(self.db, self.table1.path, ("id",))
+        ts2 = TableSegment(self.db, self.table2.path, ("id",))
 
         for differ in (HashDiffer(bisection_threshold=64), JoinDiffer(True)):
             diff_res = differ.diff_tables(ts1, ts2)