Support for varying alphanums, with special characters

erezsh · erezsh · commit 5b4dc4dffe20 · 2022-09-20T11:29:38.000+03:00
Re-implementation of alphanums, segmented without the use of intermediary ints
diff --git a/data_diff/databases/base.py b/data_diff/databases/base.py
@@ -16,8 +16,10 @@
     Float,
     ColType_UUID,
     Native_UUID,
-    String_Alphanum,
     String_UUID,
+    String_Alphanum,
+    String_FixedAlphanum,
+    String_VaryingAlphanum,
     TemporalType,
     UnknownColType,
     Text,
@@ -79,6 +81,7 @@ class Database(AbstractDatabase):
 
     TYPE_CLASSES: Dict[str, type] = {}
     default_schema: str = None
+    SUPPORTS_ALPHANUMS = True
 
     @property
     def name(self):
@@ -229,23 +232,22 @@ def _refine_coltypes(self, table_path: DbPath, col_dict: Dict[str, ColType], whe
                     col_dict[col_name] = String_UUID()
                     continue
 
-            alphanum_samples = [s for s in samples if s and String_Alphanum.test_value(s)]
-            if alphanum_samples:
-                if len(alphanum_samples) != len(samples):
-                    logger.warning(
-                        f"Mixed Alphanum/Non-Alphanum values detected in column {'.'.join(table_path)}.{col_name}, disabling Alphanum support."
-                    )
-                else:
-                    assert col_name in col_dict
-                    lens = set(map(len, alphanum_samples))
-                    if len(lens) > 1:
+            if self.SUPPORTS_ALPHANUMS:  # Anything but MySQL (so far)
+                alphanum_samples = [s for s in samples if String_Alphanum.test_value(s)]
+                if alphanum_samples:
+                    if len(alphanum_samples) != len(samples):
                         logger.warning(
-                            f"Mixed Alphanum lengths detected in column {'.'.join(table_path)}.{col_name}, disabling Alphanum support."
+                            f"Mixed Alphanum/Non-Alphanum values detected in column {'.'.join(table_path)}.{col_name}. It cannot be used as a key."
                         )
                     else:
-                        (length,) = lens
-                        col_dict[col_name] = String_Alphanum(length=length)
-                        continue
+                        assert col_name in col_dict
+                        lens = set(map(len, alphanum_samples))
+                        if len(lens) > 1:
+                            col_dict[col_name] = String_VaryingAlphanum()
+                        else:
+                            (length,) = lens
+                            col_dict[col_name] = String_FixedAlphanum(length=length)
+                            continue
 
     # @lru_cache()
     # def get_table_schema(self, path: DbPath) -> Dict[str, ColType]:
diff --git a/data_diff/databases/database_types.py b/data_diff/databases/database_types.py
@@ -92,10 +92,7 @@ class String_UUID(StringType, ColType_UUID):
     pass
 
 
-@dataclass
 class String_Alphanum(StringType, ColType_Alphanum):
-    length: int
-
     @staticmethod
     def test_value(value: str) -> bool:
         try:
@@ -104,6 +101,18 @@ def test_value(value: str) -> bool:
         except ValueError:
             return False
 
+    def make_value(self, value):
+        return self.python_type(value)
+
+
+class String_VaryingAlphanum(String_Alphanum):
+    pass
+
+
+@dataclass
+class String_FixedAlphanum(String_Alphanum):
+    length: int
+
     def make_value(self, value):
         if len(value) != self.length:
             raise ValueError(f"Expected alphanumeric value of length {self.length}, but got '{value}'.")
diff --git a/data_diff/databases/mysql.py b/data_diff/databases/mysql.py
@@ -28,6 +28,7 @@ class MySQL(ThreadedDatabase):
         "binary": Text,
     }
     ROUNDS_ON_PREC_LOSS = True
+    SUPPORTS_ALPHANUMS = False
 
     def __init__(self, *, thread_count, **kw):
         self._args = kw
diff --git a/data_diff/diff_tables.py b/data_diff/diff_tables.py
@@ -121,7 +121,7 @@ def diff_tables(self, table1: TableSegment, table2: TableSegment) -> DiffResult:
             logger.info(
                 f"Diffing tables | segments: {self.bisection_factor}, bisection threshold: {self.bisection_threshold}. "
                 f"key-range: {table1.min_key}..{table2.max_key}, "
-                f"size: {table1.approximate_size()}"
+                f"size: table1 <= {table1.approximate_size()}, table2 <= {table2.approximate_size()}"
             )
 
             # Bisect (split) the table into segments, and diff them recursively.
@@ -218,12 +218,12 @@ def _validate_and_adjust_columns(self, table1, table2):
                         "If encoding/formatting differs between databases, it may result in false positives."
                     )
 
-    def _bisect_and_diff_tables(self, table1, table2, level=0, max_rows=None):
+    def _bisect_and_diff_tables(self, table1: TableSegment, table2: TableSegment, level=0, max_rows=None):
         assert table1.is_bounded and table2.is_bounded
 
         if max_rows is None:
             # We can be sure that row_count <= max_rows
-            max_rows = table1.max_key - table1.min_key
+            max_rows = max(table1.approximate_size(), table2.approximate_size())
 
         # If count is below the threshold, just download and compare the columns locally
         # This saves time, as bisection speed is limited by ping and query performance.
@@ -254,37 +254,38 @@ def _bisect_and_diff_tables(self, table1, table2, level=0, max_rows=None):
 
         # Recursively compare each pair of corresponding segments between table1 and table2
         diff_iters = [
-            self._diff_tables(t1, t2, level + 1, i + 1, len(segmented1))
+            self._diff_tables(t1, t2, max_rows, level + 1, i + 1, len(segmented1))
             for i, (t1, t2) in enumerate(safezip(segmented1, segmented2))
         ]
 
         for res in self._thread_map(list, diff_iters):
             yield from res
 
-    def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_count=None):
+    def _diff_tables(
+        self, table1: TableSegment, table2: TableSegment, max_rows: int, level=0, segment_index=None, segment_count=None
+    ):
         logger.info(
             ". " * level + f"Diffing segment {segment_index}/{segment_count}, "
             f"key-range: {table1.min_key}..{table2.max_key}, "
-            f"size: {table2.max_key-table1.min_key}"
+            f"size <= {max_rows}"
         )
 
         # When benchmarking, we want the ability to skip checksumming. This
         # allows us to download all rows for comparison in performance. By
         # default, data-diff will checksum the section first (when it's below
         # the threshold) and _then_ download it.
         if BENCHMARK:
-            max_rows_from_keys = max(table1.max_key - table1.min_key, table2.max_key - table2.min_key)
-            if max_rows_from_keys < self.bisection_threshold:
-                yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max_rows_from_keys)
+            if max_rows < self.bisection_threshold:
+                yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max_rows)
                 return
 
         (count1, checksum1), (count2, checksum2) = self._threaded_call("count_and_checksum", [table1, table2])
 
         if count1 == 0 and count2 == 0:
-            logger.warning(
-                "Uneven distribution of keys detected. (big gaps in the key column). "
-                "For better performance, we recommend to increase the bisection-threshold."
-            )
+            # logger.warning(
+            #     f"Uneven distribution of keys detected in segment {table1.min_key}..{table2.max_key}. (big gaps in the key column). "
+            #     "For better performance, we recommend to increase the bisection-threshold."
+            # )
             assert checksum1 is None and checksum2 is None
             return
 
diff --git a/data_diff/table_segment.py b/data_diff/table_segment.py
@@ -4,7 +4,7 @@
 
 from runtype import dataclass
 
-from .utils import ArithString, split_space
+from .utils import ArithString, split_space, ArithAlphanumeric
 
 from .databases.base import Database
 from .databases.database_types import DbPath, DbKey, DbTime, Native_UUID, Schema, create_schema
@@ -149,8 +149,9 @@ def choose_checkpoints(self, count: int) -> List[DbKey]:
         assert self.is_bounded
         if isinstance(self.min_key, ArithString):
             assert type(self.min_key) is type(self.max_key)
-            checkpoints = split_space(self.min_key.int, self.max_key.int, count)
-            return [self.min_key.new(int=i) for i in checkpoints]
+            checkpoints = self.min_key.range(self.max_key, count)
+            assert all(self.min_key <= x <= self.max_key for x in checkpoints)
+            return checkpoints
 
         return split_space(self.min_key, self.max_key, count)
 
diff --git a/data_diff/utils.py b/data_diff/utils.py
@@ -9,7 +9,7 @@
 import string
 import threading
 
-alphanums = string.digits + string.ascii_lowercase
+alphanums = " -" + string.digits + string.ascii_uppercase + "_" + string.ascii_lowercase
 
 
 def safezip(*args):
@@ -29,6 +29,11 @@ class ArithString:
     def new(cls, *args, **kw):
         return cls(*args, **kw)
 
+    def range(self, other: "ArithString", count: int):
+        assert isinstance(other, ArithString)
+        checkpoints = split_space(self.int, other.int, count)
+        return [self.new(int=i) for i in checkpoints]
+
 
 class ArithUUID(UUID, ArithString):
     "A UUID that supports basic arithmetic (add, sub)"
@@ -49,70 +54,96 @@ def __sub__(self, other: Union[UUID, int]):
         return NotImplemented
 
 
-def numberToBase(num, base):
+def numberToAlphanum(num: int, base: str = alphanums) -> str:
     digits = []
     while num > 0:
-        num, remainder = divmod(num, base)
+        num, remainder = divmod(num, len(base))
         digits.append(remainder)
-    return "".join(alphanums[i] for i in digits[::-1])
+    return "".join(base[i] for i in digits[::-1])
 
 
-class ArithAlphanumeric(ArithString):
-    def __init__(self, str: str = None, int: int = None, max_len=None):
-        if str is None:
-            str = numberToBase(int, len(alphanums))
-        else:
-            assert int is None
+def alphanumToNumber(alphanum: str, base: str) -> int:
+    num = 0
+    for c in alphanum:
+        num = num * len(base) + base.index(c)
+    return num
+
+
+def justify_alphanums(s1: str, s2: str):
+    max_len = max(len(s1), len(s2))
+    s1 = s1.ljust(max_len)
+    s2 = s2.ljust(max_len)
+    return s1, s2
 
-        if max_len and len(str) > max_len:
+
+def alphanums_to_numbers(s1: str, s2: str):
+    s1, s2 = justify_alphanums(s1, s2)
+    n1 = alphanumToNumber(s1, alphanums)
+    n2 = alphanumToNumber(s2, alphanums)
+    return n1, n2
+
+
+class ArithAlphanumeric(ArithString):
+    def __init__(self, s: str, max_len=None):
+        if s is None:
+            raise ValueError("Alphanum string cannot be None")
+        if max_len and len(s) > max_len:
             raise ValueError(f"Length of alphanum value '{str}' is longer than the expected {max_len}")
 
-        self._str = str
+        for ch in s:
+            if ch not in alphanums:
+                raise ValueError(f"Unexpected character {ch} in alphanum string")
+
+        self._str = s
         self._max_len = max_len
 
-    @property
-    def int(self):
-        return int(self._str, len(alphanums))
+    # @property
+    # def int(self):
+    #     return alphanumToNumber(self._str, alphanums)
 
     def __str__(self):
         s = self._str
         if self._max_len:
-            s = s.rjust(self._max_len, "0")
+            s = s.rjust(self._max_len, alphanums[0])
         return s
 
     def __len__(self):
         return len(self._str)
 
-    def __int__(self):
-        return self.int
-
     def __repr__(self):
         return f'alphanum"{self._str}"'
 
-    def __add__(self, other: "Union[ArithAlphanumeric, int]"):
+    def __add__(self, other: "Union[ArithAlphanumeric, int]") -> "ArithAlphanumeric":
         if isinstance(other, int):
-            res = self.new(int=self.int + other)
-            if len(str(res)) != len(self):
-                raise ValueError("Overflow error when adding to alphanumeric")
-            return res
+            if other != 1:
+                raise NotImplementedError("not implemented for arbitrary numbers")
+            lastchar = self._str[-1] if self._str else alphanums[0]
+            s = self._str[:-1] + alphanums[alphanums.index(lastchar) + other]
+            return self.new(s)
         return NotImplemented
 
-    def __sub__(self, other: "Union[ArithAlphanumeric, int]"):
-        if isinstance(other, int):
-            return type(self)(int=self.int - other)
-        elif isinstance(other, ArithAlphanumeric):
-            return self.int - other.int
+    def range(self, other: "ArithAlphanumeric", count: int):
+        assert isinstance(other, ArithAlphanumeric)
+        n1, n2 = alphanums_to_numbers(self._str, other._str)
+        split = split_space(n1, n2, count)
+        return [self.new(numberToAlphanum(s)) for s in split]
+
+    def __sub__(self, other: "Union[ArithAlphanumeric, int]") -> float:
+        if isinstance(other, ArithAlphanumeric):
+            n1, n2 = alphanums_to_numbers(self._str, other._str)
+            return n1 - n2
+
         return NotImplemented
 
     def __ge__(self, other):
         if not isinstance(other, type(self)):
             return NotImplemented
-        return self.int >= other.int
+        return self._str >= other._str
 
     def __lt__(self, other):
         if not isinstance(other, type(self)):
             return NotImplemented
-        return self.int < other.int
+        return self._str < other._str
 
     def new(self, *args, **kw):
         return type(self)(*args, **kw, max_len=self._max_len)
diff --git a/tests/test_diff_tables.py b/tests/test_diff_tables.py

Original file line number	Diff line number	Diff line change
`@@ -28,6 +28,7 @@ class MySQL(ThreadedDatabase):`
`28`	`28`	`"binary": Text,`
`29`	`29`	`}`
`30`	`30`	`ROUNDS_ON_PREC_LOSS = True`
	`31`	`+ SUPPORTS_ALPHANUMS = False`
`31`	`32`
`32`	`33`	`def __init__(self, *, thread_count, **kw):`
`33`	`34`	`self._args = kw`