Skip to content

Commit fec818b

Browse files
author
Damian Stewart
committed
tests and fixes
1 parent 0b229d9 commit fec818b

File tree

5 files changed

+48
-187
lines changed

5 files changed

+48
-187
lines changed

src/__init__.py

Whitespace-only changes.

src/script/__init__.py

Whitespace-only changes.

src/script/is_table_sorted.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010
from tqdm.auto import tqdm
1111

1212

13-
def are_parquet_file_row_groups_sorted(pf: pq.ParquetFile, column_name: str) -> bool:
13+
def are_parquet_file_row_groups_sorted(pf: pq.ParquetFile, column_name: str) -> tuple[bool, str, str]:
1414
sort_column_index = next(i for i, name in enumerate(pf.schema.names)
1515
if name == column_name)
1616

@@ -27,6 +27,7 @@ def are_parquet_file_row_groups_sorted(pf: pq.ParquetFile, column_name: str) ->
2727
return False, None, None
2828
whole_min = column.statistics.min if whole_min is None else min(column.statistics.min, whole_min)
2929
whole_max = column.statistics.max if whole_max is None else max(column.statistics.max, whole_max)
30+
prev_max = column.statistics.max
3031
return True, whole_min, whole_max
3132

3233

src/test/__init__.py

Whitespace-only changes.

src/test/test_is_table_sorted.py

Lines changed: 46 additions & 186 deletions
Original file line number | Diff line number | Diff line change
@@ -1,21 +1,15 @@
1+
import pytest
2+
import random
13
from unittest.mock import MagicMock
24
import os
35
import sys
46

5-
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'script'))
6-
from is_table_sorted import are_parquet_file_row_groups_sorted
7+
from src.script.is_table_sorted import are_parquet_file_row_groups_sorted
78

89

9-
def _create_mock_parquet_file(column_name: str, row_groups_stats: list[tuple]):
10-
"""
11-
Helper to create a mock ParquetFile with specified row group statistics.
12-
13-
Args:
14-
column_name: Name of the column to sort by
15-
row_groups_stats: List of (min, max) tuples for each row group
16-
"""
10+
def _create_mock_parquet_file(column_name: str, row_groups_stats: list[tuple[str, str]]):
1711
mock_pf = MagicMock()
18-
mock_pf.schema.names = [column_name, 'data']
12+
mock_pf.schema.names = [column_name]
1913
mock_pf.num_row_groups = len(row_groups_stats)
2014

2115
mock_row_groups = []
@@ -31,180 +25,46 @@ def _create_mock_parquet_file(column_name: str, row_groups_stats: list[tuple]):
3125
return mock_pf
3226

3327

34-
# Tests for sorted row groups
35-
3628
def test_single_row_group_sorted():
37-
"""Test with a single row group (trivially sorted)"""
38-
mock_pf = _create_mock_parquet_file(
39-
'url_surtkey',
40-
[('com,example)/page1', 'com,example)/page3')]
41-
)
42-
43-
is_sorted, min_val, max_val = are_parquet_file_row_groups_sorted(mock_pf, 'url_surtkey')
44-
45-
assert is_sorted is True
46-
assert min_val is not None
47-
assert max_val is not None
48-
49-
50-
def test_multiple_row_groups_strictly_increasing():
51-
"""Test with multiple row groups in strictly increasing order"""
52-
mock_pf = _create_mock_parquet_file(
53-
'url_surtkey',
54-
[
55-
('com,aaa)/', 'com,bbb)/'),
56-
('com,ccc)/', 'com,ddd)/'),
57-
('com,eee)/', 'com,fff)/')
58-
]
59-
)
60-
61-
is_sorted, min_val, max_val = are_parquet_file_row_groups_sorted(mock_pf, 'url_surtkey')
62-
63-
assert is_sorted is True
64-
assert min_val is not None
65-
assert max_val is not None
66-
67-
68-
def test_boundary_case_adjacent_values():
69-
"""Test with row groups that have adjacent but non-overlapping values"""
70-
mock_pf = _create_mock_parquet_file(
71-
'url',
72-
[
73-
('com,example)/a', 'com,example)/z'),
74-
('com,example,aaa)/', 'com,example,zzz)/')
75-
]
76-
)
77-
78-
is_sorted, min_val, max_val = are_parquet_file_row_groups_sorted(mock_pf, 'url')
79-
80-
assert is_sorted is True
81-
assert min_val is not None
82-
assert max_val is not None
83-
84-
85-
def test_two_row_groups_strictly_increasing_strings():
86-
"""Test with two row groups with string values in strictly increasing order"""
87-
mock_pf = _create_mock_parquet_file(
88-
'url_surtkey',
89-
[
90-
('com,apple)/', 'com,banana)/'),
91-
('com,cherry)/', 'com,date)/')
92-
]
93-
)
94-
95-
is_sorted, min_val, max_val = are_parquet_file_row_groups_sorted(mock_pf, 'url_surtkey')
96-
97-
assert is_sorted is True
98-
assert min_val is not None
99-
assert max_val is not None
100-
101-
102-
def test_many_row_groups_strictly_increasing():
103-
"""Test with many row groups, all strictly increasing"""
104-
row_groups = [
105-
('com,aaa)/', 'com,aaa,zzz)/'),
106-
('com,bbb)/', 'com,bbb,zzz)/'),
107-
('com,ccc)/', 'com,ccc,zzz)/'),
108-
('com,ddd)/', 'com,ddd,zzz)/'),
109-
('com,eee)/', 'com,eee,zzz)/'),
110-
]
29+
mock_pf = _create_mock_parquet_file('url_surtkey', [('a', 'b')])
30+
is_sorted, min_val, max_val = are_parquet_file_row_groups_sorted(mock_pf, column_name='url_surtkey')
31+
assert is_sorted
32+
assert min_val == 'a'
33+
assert max_val == 'b'
34+
35+
36+
def test_row_groups_sorted():
    """Check that every non-empty prefix of ascending row groups is reported as sorted.

    For each prefix the returned minimum must equal the first group's first
    value and the returned maximum must equal the last group's second value.
    NOTE(review): assumes each tuple is passed to the mock as that row group's
    (min, max) column statistics -- confirm against _create_mock_parquet_file.
    """
    all_row_groups = [('a', 'b'), ('c', 'd'), ('e', 'f'), ('g', 'h')]
    # The upper bound is len + 1 so the last iteration uses the complete list;
    # range(1, len) would stop one short and never exercise all four groups.
    for n in range(1, len(all_row_groups) + 1):
        row_groups = all_row_groups[:n]
        mock_pf = _create_mock_parquet_file('url_surtkey', row_groups)
        is_sorted, min_val, max_val = are_parquet_file_row_groups_sorted(mock_pf, column_name='url_surtkey')
        assert is_sorted
        assert min_val == row_groups[0][0]
        assert max_val == row_groups[-1][1]
46+
47+
48+
def test_row_groups_unsorted():
    """Check that every out-of-order arrangement of the row groups is reported as unsorted.

    All prefixes of length two or more (including the full list) are permuted
    exhaustively, so the test is deterministic and a failure is always
    reproducible, unlike a run driven by unseeded random shuffles.
    """
    # Imported locally so this test does not depend on the module-level imports.
    import itertools

    all_row_groups = [('a', 'b'), ('c', 'd'), ('e', 'f'), ('g', 'h')]
    # The upper bound is len + 1 so the complete list is permuted as well.
    for n in range(2, len(all_row_groups) + 1):
        sorted_prefix = all_row_groups[:n]
        for permutation in itertools.permutations(sorted_prefix):
            row_groups = list(permutation)
            if row_groups == sorted_prefix:
                # The identity permutation is the sorted order; it is not an unsorted case.
                continue

            mock_pf = _create_mock_parquet_file('url_surtkey', row_groups)
            is_sorted, min_val, max_val = are_parquet_file_row_groups_sorted(mock_pf, column_name='url_surtkey')
            assert not is_sorted
64+
65+
66+
def test_row_groups_overlapping():
67+
row_groups = [('a', 'c'), ('b', 'd')]
11168
mock_pf = _create_mock_parquet_file('url_surtkey', row_groups)
112-
113-
is_sorted, min_val, max_val = are_parquet_file_row_groups_sorted(mock_pf, 'url_surtkey')
114-
115-
assert is_sorted is True
116-
assert min_val is not None
117-
assert max_val is not None
118-
119-
120-
# Tests for non-sorted row groups
121-
122-
def test_two_row_groups_overlapping():
123-
"""Test with two row groups where second min is less than first max (overlapping)"""
124-
mock_pf = _create_mock_parquet_file(
125-
'url_surtkey',
126-
[
127-
('a', 'd'),
128-
('b', 'e')
129-
]
130-
)
131-
132-
is_sorted, min_val, max_val = are_parquet_file_row_groups_sorted(mock_pf, 'url_surtkey')
133-
134-
assert is_sorted is False
135-
assert min_val is None
136-
assert max_val is None
137-
138-
139-
def test_row_groups_completely_out_of_order():
140-
"""Test with row groups in descending order"""
141-
mock_pf = _create_mock_parquet_file(
142-
'url_surtkey',
143-
[
144-
('z', 'zz'),
145-
('a', 'b') # completely before the first group
146-
]
147-
)
148-
149-
is_sorted, min_val, max_val = are_parquet_file_row_groups_sorted(mock_pf, 'url_surtkey')
150-
151-
assert is_sorted is False
152-
assert min_val is None
153-
assert max_val is None
154-
155-
156-
def test_multiple_row_groups_with_middle_unsorted():
157-
"""Test with multiple row groups where the middle one breaks the sort order"""
158-
mock_pf = _create_mock_parquet_file(
159-
'url_surtkey',
160-
[
161-
('a', 'b'),
162-
('z', 'zz'), # correctly sorted so far
163-
('c', 'd') # breaks ordering (min 'c' < previous max 'zz')
164-
]
165-
)
166-
167-
is_sorted, min_val, max_val = are_parquet_file_row_groups_sorted(mock_pf, 'url_surtkey')
168-
169-
assert is_sorted is False
170-
assert min_val is None
171-
assert max_val is None
172-
173-
174-
def test_row_groups_equal_boundary_allowed():
175-
"""Test that row groups where second min equals first max are allowed (>= not >)"""
176-
mock_pf = _create_mock_parquet_file(
177-
'url_surtkey',
178-
[
179-
('a', 'b'),
180-
('b', 'c') # min equals prev_max - this is allowed
181-
]
182-
)
183-
184-
is_sorted, min_val, max_val = are_parquet_file_row_groups_sorted(mock_pf, 'url_surtkey')
185-
186-
assert is_sorted is True
187-
assert min_val is not None
188-
assert max_val is not None
189-
190-
191-
def test_slight_overlap_in_middle():
192-
"""Test detecting overlap in the middle of many row groups"""
193-
mock_pf = _create_mock_parquet_file(
194-
'url_surtkey',
195-
[
196-
('a', 'az'),
197-
('b', 'bz'),
198-
('c', 'cz'),
199-
('ba', 'baz'), # overlaps with previous ('ba' < 'c')
200-
('d', 'dz'),
201-
]
202-
)
203-
204-
is_sorted, min_val, max_val = are_parquet_file_row_groups_sorted(mock_pf, 'url_surtkey')
205-
206-
assert is_sorted is False
207-
assert min_val is None
208-
assert max_val is None
209-
210-
69+
is_sorted, min_val, max_val = are_parquet_file_row_groups_sorted(mock_pf, column_name='url_surtkey')
70+
assert not is_sorted

0 commit comments

Comments
 (0)