Squash abstract MD5 & Normalizer mixins into the base dialect

Sergey Vasilyev · Sergey Vasilyev · commit a39cd0bf3875 · 2023-10-25T18:48:22.000+02:00
The MD5 & normalizing methods are implemented in every supported database with 100% coverage. We have no databases that do not implement these methods. As such, they can simply be moved to the base dialect class to ensure 100% coverage in the future. No changes are required from the specific dialect classes.
diff --git a/data_diff/abcs/mixins.py b/data_diff/abcs/mixins.py
@@ -19,104 +19,3 @@
 @attrs.define(frozen=False)
 class AbstractMixin(ABC):
     "A mixin for a database dialect"
-
-
-@attrs.define(frozen=False)
-class AbstractMixin_NormalizeValue(AbstractMixin):
-    @abstractmethod
-    def to_comparable(self, value: str, coltype: ColType) -> str:
-        """Ensure that the expression is comparable in ``IS DISTINCT FROM``."""
-
-    @abstractmethod
-    def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
-        """Creates an SQL expression, that converts 'value' to a normalized timestamp.
-
-        The returned expression must accept any SQL datetime/timestamp, and return a string.
-
-        Date format: ``YYYY-MM-DD HH:mm:SS.FFFFFF``
-
-        Precision of dates should be rounded up/down according to coltype.rounds
-        """
-
-    @abstractmethod
-    def normalize_number(self, value: str, coltype: FractionalType) -> str:
-        """Creates an SQL expression, that converts 'value' to a normalized number.
-
-        The returned expression must accept any SQL int/numeric/float, and return a string.
-
-        Floats/Decimals are expected in the format
-        "I.P"
-
-        Where I is the integer part of the number (as many digits as necessary),
-        and must be at least one digit (0).
-        P is the fractional digits, the amount of which is specified with
-        coltype.precision. Trailing zeroes may be necessary.
-        If P is 0, the dot is omitted.
-
-        Note: We use 'precision' differently than most databases. For decimals,
-        it's the same as ``numeric_scale``, and for floats, who use binary precision,
-        it can be calculated as ``log10(2**numeric_precision)``.
-        """
-
-    def normalize_boolean(self, value: str, _coltype: Boolean) -> str:
-        """Creates an SQL expression, that converts 'value' to either '0' or '1'."""
-        return self.to_string(value)
-
-    def normalize_uuid(self, value: str, coltype: ColType_UUID) -> str:
-        """Creates an SQL expression, that strips uuids of artifacts like whitespace."""
-        if isinstance(coltype, String_UUID):
-            return f"TRIM({value})"
-        return self.to_string(value)
-
-    def normalize_json(self, value: str, _coltype: JSON) -> str:
-        """Creates an SQL expression, that converts 'value' to its minified json string representation."""
-        return self.to_string(value)
-
-    def normalize_array(self, value: str, _coltype: Array) -> str:
-        """Creates an SQL expression, that serialized an array into a JSON string."""
-        return self.to_string(value)
-
-    def normalize_struct(self, value: str, _coltype: Struct) -> str:
-        """Creates an SQL expression, that serialized a typed struct into a JSON string."""
-        return self.to_string(value)
-
-    def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
-        """Creates an SQL expression, that converts 'value' to a normalized representation.
-
-        The returned expression must accept any SQL value, and return a string.
-
-        The default implementation dispatches to a method according to `coltype`:
-
-        ::
-
-            TemporalType    -> normalize_timestamp()
-            FractionalType  -> normalize_number()
-            *else*          -> to_string()
-
-            (`Integer` falls in the *else* category)
-
-        """
-        if isinstance(coltype, TemporalType):
-            return self.normalize_timestamp(value, coltype)
-        elif isinstance(coltype, FractionalType):
-            return self.normalize_number(value, coltype)
-        elif isinstance(coltype, ColType_UUID):
-            return self.normalize_uuid(value, coltype)
-        elif isinstance(coltype, Boolean):
-            return self.normalize_boolean(value, coltype)
-        elif isinstance(coltype, JSON):
-            return self.normalize_json(value, coltype)
-        elif isinstance(coltype, Array):
-            return self.normalize_array(value, coltype)
-        elif isinstance(coltype, Struct):
-            return self.normalize_struct(value, coltype)
-        return self.to_string(value)
-
-
-@attrs.define(frozen=False)
-class AbstractMixin_MD5(AbstractMixin):
-    """Methods for calculating an MD6 hash as an integer."""
-
-    @abstractmethod
-    def md5_as_int(self, s: str) -> str:
-        "Provide SQL for computing md5 and returning an int"
diff --git a/data_diff/databases/base.py b/data_diff/databases/base.py
@@ -55,6 +55,8 @@
 )
 from data_diff.abcs.database_types import (
     Array,
+    ColType_UUID,
+    FractionalType,
     Struct,
     ColType,
     Integer,
@@ -74,7 +76,6 @@
     JSON,
 )
 from data_diff.abcs.mixins import Compilable
-from data_diff.abcs.mixins import AbstractMixin_NormalizeValue
 
 logger = logging.getLogger("database")
 cv_params = contextvars.ContextVar("params")
@@ -762,6 +763,95 @@ def to_string(self, s: str) -> str:
     def set_timezone_to_utc(self) -> str:
         "Provide SQL for setting the session timezone to UTC"
 
+    @abstractmethod
+    def md5_as_int(self, s: str) -> str:
+        "Provide SQL for computing md5 and returning an int"
+
+    @abstractmethod
+    def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
+        """Creates an SQL expression, that converts 'value' to a normalized timestamp.
+
+        The returned expression must accept any SQL datetime/timestamp, and return a string.
+
+        Date format: ``YYYY-MM-DD HH:mm:SS.FFFFFF``
+
+        Precision of dates should be rounded up/down according to coltype.rounds
+        """
+
+    @abstractmethod
+    def normalize_number(self, value: str, coltype: FractionalType) -> str:
+        """Creates an SQL expression, that converts 'value' to a normalized number.
+
+        The returned expression must accept any SQL int/numeric/float, and return a string.
+
+        Floats/Decimals are expected in the format
+        "I.P"
+
+        Where I is the integer part of the number (as many digits as necessary),
+        and must be at least one digit (0).
+        P is the fractional digits, the amount of which is specified with
+        coltype.precision. Trailing zeroes may be necessary.
+        If P is 0, the dot is omitted.
+
+        Note: We use 'precision' differently than most databases. For decimals,
+        it's the same as ``numeric_scale``, and for floats, who use binary precision,
+        it can be calculated as ``log10(2**numeric_precision)``.
+        """
+
+    def normalize_boolean(self, value: str, _coltype: Boolean) -> str:
+        """Creates an SQL expression, that converts 'value' to either '0' or '1'."""
+        return self.to_string(value)
+
+    def normalize_uuid(self, value: str, coltype: ColType_UUID) -> str:
+        """Creates an SQL expression, that strips uuids of artifacts like whitespace."""
+        if isinstance(coltype, String_UUID):
+            return f"TRIM({value})"
+        return self.to_string(value)
+
+    def normalize_json(self, value: str, _coltype: JSON) -> str:
+        """Creates an SQL expression, that converts 'value' to its minified json string representation."""
+        return self.to_string(value)
+
+    def normalize_array(self, value: str, _coltype: Array) -> str:
+        """Creates an SQL expression, that serialized an array into a JSON string."""
+        return self.to_string(value)
+
+    def normalize_struct(self, value: str, _coltype: Struct) -> str:
+        """Creates an SQL expression, that serialized a typed struct into a JSON string."""
+        return self.to_string(value)
+
+    def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
+        """Creates an SQL expression, that converts 'value' to a normalized representation.
+
+        The returned expression must accept any SQL value, and return a string.
+
+        The default implementation dispatches to a method according to `coltype`:
+
+        ::
+
+            TemporalType    -> normalize_timestamp()
+            FractionalType  -> normalize_number()
+            *else*          -> to_string()
+
+            (`Integer` falls in the *else* category)
+
+        """
+        if isinstance(coltype, TemporalType):
+            return self.normalize_timestamp(value, coltype)
+        elif isinstance(coltype, FractionalType):
+            return self.normalize_number(value, coltype)
+        elif isinstance(coltype, ColType_UUID):
+            return self.normalize_uuid(value, coltype)
+        elif isinstance(coltype, Boolean):
+            return self.normalize_boolean(value, coltype)
+        elif isinstance(coltype, JSON):
+            return self.normalize_json(value, coltype)
+        elif isinstance(coltype, Array):
+            return self.normalize_array(value, coltype)
+        elif isinstance(coltype, Struct):
+            return self.normalize_struct(value, coltype)
+        return self.to_string(value)
+
     def optimizer_hints(self, hints: str) -> str:
         return f"/*+ {hints} */ "
 
@@ -960,10 +1050,7 @@ def _refine_coltypes(
         if not text_columns:
             return
 
-        if isinstance(self.dialect, AbstractMixin_NormalizeValue):
-            fields = [Code(self.dialect.normalize_uuid(self.dialect.quote(c), String_UUID())) for c in text_columns]
-        else:
-            fields = this[text_columns]
+        fields = [Code(self.dialect.normalize_uuid(self.dialect.quote(c), String_UUID())) for c in text_columns]
 
         samples_by_row = self.query(
             table(*table_path).select(*fields).where(Code(where) if where else SKIP).limit(sample_size), list
diff --git a/data_diff/databases/bigquery.py b/data_diff/databases/bigquery.py
@@ -20,12 +20,6 @@
     Boolean,
     UnknownColType,
 )
-from data_diff.abcs.mixins import (
-    AbstractMixin_MD5,
-    AbstractMixin_NormalizeValue,
-)
-from data_diff.abcs.compiler import Compilable
-from data_diff.queries.api import this, table, SKIP, code
 from data_diff.databases.base import (
     BaseDialect,
     Database,
@@ -61,7 +55,7 @@ def import_bigquery_service_account_impersonation():
 
 
 @attrs.define(frozen=False)
-class Dialect(BaseDialect, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
+class Dialect(BaseDialect):
     name = "BigQuery"
     ROUNDS_ON_PREC_LOSS = False  # Technically BigQuery doesn't allow implicit rounding or truncation
     TYPE_CLASSES = {
diff --git a/data_diff/databases/clickhouse.py b/data_diff/databases/clickhouse.py
@@ -24,7 +24,6 @@
     Timestamp,
     Boolean,
 )
-from data_diff.abcs.mixins import AbstractMixin_MD5, AbstractMixin_NormalizeValue
 
 # https://clickhouse.com/docs/en/operations/server-configuration-parameters/settings/#default-database
 DEFAULT_DATABASE = "default"
@@ -38,7 +37,7 @@ def import_clickhouse():
 
 
 @attrs.define(frozen=False)
-class Dialect(BaseDialect, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
+class Dialect(BaseDialect):
     name = "Clickhouse"
     ROUNDS_ON_PREC_LOSS = False
     TYPE_CLASSES = {
diff --git a/data_diff/databases/databricks.py b/data_diff/databases/databricks.py
@@ -17,7 +17,6 @@
     UnknownColType,
     Boolean,
 )
-from data_diff.abcs.mixins import AbstractMixin_MD5, AbstractMixin_NormalizeValue
 from data_diff.databases.base import (
     MD5_HEXDIGITS,
     CHECKSUM_HEXDIGITS,
@@ -37,7 +36,7 @@ def import_databricks():
 
 
 @attrs.define(frozen=False)
-class Dialect(BaseDialect, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
+class Dialect(BaseDialect):
     name = "Databricks"
     ROUNDS_ON_PREC_LOSS = True
     TYPE_CLASSES = {
diff --git a/data_diff/databases/duckdb.py b/data_diff/databases/duckdb.py
@@ -17,10 +17,6 @@
     FractionalType,
     Boolean,
 )
-from data_diff.abcs.mixins import (
-    AbstractMixin_MD5,
-    AbstractMixin_NormalizeValue,
-)
 from data_diff.databases.base import (
     Database,
     BaseDialect,
@@ -41,7 +37,7 @@ def import_duckdb():
 
 
 @attrs.define(frozen=False)
-class Dialect(BaseDialect, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
+class Dialect(BaseDialect):
     name = "DuckDB"
     ROUNDS_ON_PREC_LOSS = False
     SUPPORTS_PRIMARY_KEY = True
diff --git a/data_diff/databases/mssql.py b/data_diff/databases/mssql.py
@@ -2,7 +2,6 @@
 
 import attrs
 
-from data_diff.abcs.mixins import AbstractMixin_MD5, AbstractMixin_NormalizeValue
 from data_diff.databases.base import (
     CHECKSUM_HEXDIGITS,
     CHECKSUM_OFFSET,
@@ -36,11 +35,7 @@ def import_mssql():
 
 
 @attrs.define(frozen=False)
-class Dialect(
-    BaseDialect,
-    AbstractMixin_MD5,
-    AbstractMixin_NormalizeValue,
-):
+class Dialect(BaseDialect):
     name = "MsSQL"
     ROUNDS_ON_PREC_LOSS = True
     SUPPORTS_PRIMARY_KEY = True
diff --git a/data_diff/databases/mysql.py b/data_diff/databases/mysql.py
@@ -15,10 +15,6 @@
     Boolean,
     Date,
 )
-from data_diff.abcs.mixins import (
-    AbstractMixin_MD5,
-    AbstractMixin_NormalizeValue,
-)
 from data_diff.databases.base import (
     ThreadedDatabase,
     import_helper,
@@ -41,11 +37,7 @@ def import_mysql():
 
 
 @attrs.define(frozen=False)
-class Dialect(
-    BaseDialect,
-    AbstractMixin_MD5,
-    AbstractMixin_NormalizeValue,
-):
+class Dialect(BaseDialect):
     name = "MySQL"
     ROUNDS_ON_PREC_LOSS = True
     SUPPORTS_PRIMARY_KEY = True
diff --git a/data_diff/databases/oracle.py b/data_diff/databases/oracle.py
@@ -16,7 +16,6 @@
     TimestampTZ,
     FractionalType,
 )
-from data_diff.abcs.mixins import AbstractMixin_MD5, AbstractMixin_NormalizeValue
 from data_diff.databases.base import (
     BaseDialect,
     ThreadedDatabase,
@@ -42,8 +41,6 @@ def import_oracle():
 @attrs.define(frozen=False)
 class Dialect(
     BaseDialect,
-    AbstractMixin_MD5,
-    AbstractMixin_NormalizeValue,
 ):
     name = "Oracle"
     SUPPORTS_PRIMARY_KEY = True
diff --git a/data_diff/databases/postgresql.py b/data_diff/databases/postgresql.py
@@ -18,7 +18,6 @@
     Boolean,
     Date,
 )
-from data_diff.abcs.mixins import AbstractMixin_MD5, AbstractMixin_NormalizeValue
 from data_diff.databases.base import BaseDialect, ThreadedDatabase, import_helper, ConnectError
 from data_diff.databases.base import (
     MD5_HEXDIGITS,
@@ -40,7 +39,7 @@ def import_postgresql():
 
 
 @attrs.define(frozen=False)
-class PostgresqlDialect(BaseDialect, AbstractMixin_MD5, AbstractMixin_NormalizeValue):
+class PostgresqlDialect(BaseDialect):
     name = "PostgreSQL"
     ROUNDS_ON_PREC_LOSS = True
     SUPPORTS_PRIMARY_KEY = True
diff --git a/data_diff/databases/presto.py b/data_diff/databases/presto.py
diff --git a/data_diff/databases/snowflake.py b/data_diff/databases/snowflake.py
diff --git a/data_diff/databases/vertica.py b/data_diff/databases/vertica.py
diff --git a/tests/test_query.py b/tests/test_query.py