Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 8b0257e

Browse files
committed
add checksum offset to avoid bigint overflow
1 parent 57b889d commit 8b0257e

File tree

13 files changed

+37
-12
lines changed

13 files changed

+37
-12
lines changed

data_diff/databases/base.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1156,6 +1156,17 @@ def is_autocommit(self) -> bool:
11561156
_CHECKSUM_BITSIZE = CHECKSUM_HEXDIGITS << 2
11571157
CHECKSUM_MASK = (2**_CHECKSUM_BITSIZE) - 1
11581158

1159+
# bigint is typically 8 bytes
1160+
# if the checksum is shorter than that, most databases left-pad it with zeros:
1161+
# 0xFF → 0x00000000000000FF;
1162+
# because of that, the numeric representation is never negative,
1163+
# which limits the number of checksums that we can add together before overflowing.
1164+
# we can mitigate that by subtracting an offset of half the max value,
1165+
# so that the distribution is from -0.5*max to +0.5*max.
1166+
# negative values then cancel out positive ones, allowing more checksums to be added together
1167+
# without overflowing.
1168+
CHECKSUM_OFFSET = CHECKSUM_MASK // 2
1169+
11591170
DEFAULT_DATETIME_PRECISION = 6
11601171
DEFAULT_NUMERIC_PRECISION = 24
11611172

data_diff/databases/bigquery.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@
3636
ConnectError,
3737
apply_query,
3838
QueryResult,
39+
CHECKSUM_OFFSET,
40+
CHECKSUM_HEXDIGITS
3941
)
4042
from data_diff.databases.base import TIMESTAMP_PRECISION_POS, ThreadLocalInterpreter, Mixin_RandomSample
4143

@@ -62,7 +64,7 @@ def import_bigquery_service_account_impersonation():
6264
@attrs.define(frozen=False)
6365
class Mixin_MD5(AbstractMixin_MD5):
6466
def md5_as_int(self, s: str) -> str:
65-
return f"cast(cast( ('0x' || substr(TO_HEX(md5({s})), 18)) as int64) as numeric)"
67+
return f"cast(cast( ('0x' || substr(TO_HEX(md5({s})), {CHECKSUM_HEXDIGITS})) as int64) as numeric) - {CHECKSUM_OFFSET}"
6668

6769

6870
@attrs.define(frozen=False)

data_diff/databases/clickhouse.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
MD5_HEXDIGITS,
77
CHECKSUM_HEXDIGITS,
88
TIMESTAMP_PRECISION_POS,
9+
CHECKSUM_OFFSET,
910
BaseDialect,
1011
ThreadedDatabase,
1112
import_helper,
@@ -41,7 +42,7 @@ def import_clickhouse():
4142
class Mixin_MD5(AbstractMixin_MD5):
4243
def md5_as_int(self, s: str) -> str:
4344
substr_idx = 1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS
44-
return f"reinterpretAsUInt128(reverse(unhex(lowerUTF8(substr(hex(MD5({s})), {substr_idx})))))"
45+
return f"reinterpretAsUInt128(reverse(unhex(lowerUTF8(substr(hex(MD5({s})), {substr_idx}))))) - {CHECKSUM_OFFSET}"
4546

4647

4748
@attrs.define(frozen=False)

data_diff/databases/databricks.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from data_diff.databases.base import (
2222
MD5_HEXDIGITS,
2323
CHECKSUM_HEXDIGITS,
24+
CHECKSUM_OFFSET,
2425
BaseDialect,
2526
ThreadedDatabase,
2627
import_helper,
@@ -39,7 +40,7 @@ def import_databricks():
3940
@attrs.define(frozen=False)
4041
class Mixin_MD5(AbstractMixin_MD5):
4142
def md5_as_int(self, s: str) -> str:
42-
return f"cast(conv(substr(md5({s}), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}), 16, 10) as decimal(38, 0))"
43+
return f"cast(conv(substr(md5({s}), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}), 16, 10) as decimal(38, 0)) - {CHECKSUM_OFFSET}"
4344

4445

4546
@attrs.define(frozen=False)

data_diff/databases/duckdb.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
ConnectError,
3030
ThreadLocalInterpreter,
3131
TIMESTAMP_PRECISION_POS,
32+
CHECKSUM_OFFSET
3233
)
3334
from data_diff.databases.base import MD5_HEXDIGITS, CHECKSUM_HEXDIGITS, Mixin_Schema
3435
from data_diff.queries.ast_classes import Func, Compilable, ITable
@@ -45,7 +46,7 @@ def import_duckdb():
4546
@attrs.define(frozen=False)
4647
class Mixin_MD5(AbstractMixin_MD5):
4748
def md5_as_int(self, s: str) -> str:
48-
return f"('0x' || SUBSTRING(md5({s}), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS},{CHECKSUM_HEXDIGITS}))::BIGINT"
49+
return f"('0x' || SUBSTRING(md5({s}), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS},{CHECKSUM_HEXDIGITS}))::BIGINT - {CHECKSUM_OFFSET}"
4950

5051

5152
@attrs.define(frozen=False)

data_diff/databases/mssql.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,8 @@ def normalize_number(self, value: str, coltype: FractionalType) -> str:
6060
@attrs.define(frozen=False)
6161
class Mixin_MD5(AbstractMixin_MD5):
6262
def md5_as_int(self, s: str) -> str:
63-
return f"convert(bigint, convert(varbinary, '0x' + RIGHT(CONVERT(NVARCHAR(32), HashBytes('MD5', {s}), 2), {CHECKSUM_HEXDIGITS}), 1))"
63+
offset = 549755813887
64+
return f"convert(bigint, convert(varbinary, '0x' + RIGHT(CONVERT(NVARCHAR(32), HashBytes('MD5', {s}), 2), {CHECKSUM_HEXDIGITS}), 1)) - {offset}"
6465

6566

6667
@attrs.define(frozen=False)

data_diff/databases/mysql.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
MD5_HEXDIGITS,
3232
CHECKSUM_HEXDIGITS,
3333
TIMESTAMP_PRECISION_POS,
34+
CHECKSUM_OFFSET,
3435
Mixin_Schema,
3536
Mixin_RandomSample,
3637
)
@@ -47,7 +48,7 @@ def import_mysql():
4748
@attrs.define(frozen=False)
4849
class Mixin_MD5(AbstractMixin_MD5):
4950
def md5_as_int(self, s: str) -> str:
50-
return f"cast(conv(substring(md5({s}), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}), 16, 10) as unsigned)"
51+
return f"cast(conv(substring(md5({s}), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}), 16, 10) as unsigned) - {CHECKSUM_OFFSET}"
5152

5253

5354
@attrs.define(frozen=False)

data_diff/databases/oracle.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@
2727
ConnectError,
2828
QueryError,
2929
Mixin_RandomSample,
30+
CHECKSUM_OFFSET,
31+
CHECKSUM_HEXDIGITS
3032
)
3133
from data_diff.databases.base import TIMESTAMP_PRECISION_POS
3234

@@ -45,7 +47,7 @@ class Mixin_MD5(AbstractMixin_MD5):
4547
def md5_as_int(self, s: str) -> str:
4648
# standard_hash is faster than DBMS_CRYPTO.Hash
4749
# TODO: Find a way to use UTL_RAW.CAST_TO_BINARY_INTEGER ?
48-
return f"to_number(substr(standard_hash({s}, 'MD5'), 18), 'xxxxxxxxxxxxxxx')"
50+
return f"to_number(substr(standard_hash({s}, 'MD5'), {CHECKSUM_HEXDIGITS}), 'xxxxxxxxxxxxxxx') - {CHECKSUM_OFFSET}"
4951

5052

5153
@attrs.define(frozen=False)

data_diff/databases/postgresql.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
CHECKSUM_HEXDIGITS,
2626
_CHECKSUM_BITSIZE,
2727
TIMESTAMP_PRECISION_POS,
28+
CHECKSUM_OFFSET,
2829
Mixin_RandomSample,
2930
)
3031

@@ -42,7 +43,7 @@ def import_postgresql():
4243
@attrs.define(frozen=False)
4344
class Mixin_MD5(AbstractMixin_MD5):
4445
def md5_as_int(self, s: str) -> str:
45-
return f"('x' || substring(md5({s}), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}))::bit({_CHECKSUM_BITSIZE})::bigint"
46+
return f"('x' || substring(md5({s}), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}))::bit({_CHECKSUM_BITSIZE})::bigint - {CHECKSUM_OFFSET}"
4647

4748

4849
@attrs.define(frozen=False)

data_diff/databases/presto.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
from data_diff.databases.base import (
3434
MD5_HEXDIGITS,
3535
CHECKSUM_HEXDIGITS,
36+
CHECKSUM_OFFSET,
3637
TIMESTAMP_PRECISION_POS,
3738
)
3839

@@ -56,7 +57,7 @@ def import_presto():
5657
@attrs.define(frozen=False)
5758
class Mixin_MD5(AbstractMixin_MD5):
5859
def md5_as_int(self, s: str) -> str:
59-
return f"cast(from_base(substr(to_hex(md5(to_utf8({s}))), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}), 16) as decimal(38, 0))"
60+
return f"cast(from_base(substr(to_hex(md5(to_utf8({s}))), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}), 16) as decimal(38, 0)) - {CHECKSUM_OFFSET}"
6061

6162

6263
@attrs.define(frozen=False)

0 commit comments

Comments
 (0)