Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 70a7827

Browse files
authored
Merge pull request #303 from datafold/nov19_better_errors
CLI: Better errors + tiny bugfix
2 parents 5449a04 + 641eadf commit 70a7827

File tree

3 files changed

+29
-8
lines changed

3 files changed

+29
-8
lines changed

data_diff/__main__.py

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,14 +45,25 @@ def _get_schema(pair):
4545
return db.query_table_schema(table_path)
4646

4747

48-
def diff_schemas(schema1, schema2, columns):
48+
def diff_schemas(table1, table2, schema1, schema2, columns):
4949
logging.info("Diffing schemas...")
5050
attrs = "name", "type", "datetime_precision", "numeric_precision", "numeric_scale"
5151
for c in columns:
5252
if c is None: # Skip for convenience
5353
continue
5454
diffs = []
55-
for attr, v1, v2 in safezip(attrs, schema1[c], schema2[c]):
55+
56+
if c not in schema1:
57+
cols = ', '.join(schema1)
58+
raise ValueError(f"Column '{c}' not found in table 1, named '{table1}'. Columns: {cols}")
59+
if c not in schema2:
60+
cols = ', '.join(schema1)
61+
raise ValueError(f"Column '{c}' not found in table 2, named '{table2}'. Columns: {cols}")
62+
63+
col1 = schema1[c]
64+
col2 = schema2[c]
65+
66+
for attr, v1, v2 in safezip(attrs, col1, col2):
5667
if v1 != v2:
5768
diffs.append(f"{attr}:({v1} != {v2})")
5869
if diffs:
@@ -197,7 +208,13 @@ def main(conf, run, **kw):
197208
if kw["algorithm"] == Algorithm.AUTO:
198209
kw["algorithm"] = Algorithm.JOINDIFF if indb_syntax else Algorithm.HASHDIFF
199210

200-
return _main(**kw)
211+
try:
212+
return _main(**kw)
213+
except Exception as e:
214+
logging.error(e)
215+
if kw["debug"]:
216+
raise
217+
201218

202219

203220
def _main(
@@ -357,6 +374,8 @@ def _main(
357374

358375
if db1 is db2:
359376
diff_schemas(
377+
table_names[0],
378+
table_names[1],
360379
schema1,
361380
schema2,
362381
(

data_diff/diff_tables.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -132,8 +132,9 @@ def _diff_tables_wrapper(self, table1: TableSegment, table2: TableSegment, info_
132132

133133
if is_tracking_enabled():
134134
runtime = time.monotonic() - start
135-
table1_count = info_tree.info.rowcounts[1]
136-
table2_count = info_tree.info.rowcounts[2]
135+
rowcounts = info_tree.info.rowcounts
136+
table1_count = rowcounts[1] if rowcounts else None
137+
table2_count = rowcounts[2] if rowcounts else None
137138
diff_count = info_tree.info.diff_count
138139
err_message = truncate_error(repr(error))
139140
event_json = create_end_event_json(

data_diff/joindiff_tables.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,7 @@ def _test_duplicate_keys(self, table1: TableSegment, table2: TableSegment):
216216

217217
unvalidated = list(set(key_columns) - set(unique))
218218
if unvalidated:
219+
logger.info(f"Validating that the are no duplicate keys in columns: {unvalidated}")
219220
# Validate that there are no duplicate keys
220221
self.stats["validated_unique_keys"] = self.stats.get("validated_unique_keys", []) + [unvalidated]
221222
q = t.select(total=Count(), total_distinct=Count(Concat(this[unvalidated]), distinct=True))
@@ -237,7 +238,7 @@ def _test_null_keys(self, table1, table2):
237238
raise ValueError("NULL values in one or more primary keys")
238239

239240
def _collect_stats(self, i, table_seg: TableSegment, info_tree: InfoTree):
240-
logger.info(f"Collecting stats for table #{i}")
241+
logger.debug(f"Collecting stats for table #{i}")
241242
db = table_seg.database
242243

243244
# Metrics
@@ -305,7 +306,7 @@ def _create_outer_join(self, table1, table2):
305306
return diff_rows, a_cols, b_cols, is_diff_cols, all_rows
306307

307308
def _count_diff_per_column(self, db, diff_rows, cols, is_diff_cols):
308-
logger.info("Counting differences per column")
309+
logger.debug("Counting differences per column")
309310
is_diff_cols_counts = db.query(diff_rows.select(sum_(this[c]) for c in is_diff_cols), tuple)
310311
diff_counts = {}
311312
for name, count in safezip(cols, is_diff_cols_counts):
@@ -319,7 +320,7 @@ def _sample_and_count_exclusive(self, db, diff_rows, a_cols, b_cols):
319320
exclusive_rows_query = diff_rows.where(this.is_exclusive_a | this.is_exclusive_b)
320321

321322
if not self.sample_exclusive_rows:
322-
logger.info("Counting exclusive rows")
323+
logger.debug("Counting exclusive rows")
323324
self.stats["exclusive_count"] = db.query(exclusive_rows_query.count(), int)
324325
return
325326

0 commit comments

Comments
 (0)