1+ import pytest
2+ import random
13from unittest .mock import MagicMock
24import os
35import sys
46
5- sys .path .insert (0 , os .path .join (os .path .dirname (__file__ ), '..' , 'script' ))
6- from is_table_sorted import are_parquet_file_row_groups_sorted
7+ from src .script .is_table_sorted import are_parquet_file_row_groups_sorted
78
89
9- def _create_mock_parquet_file (column_name : str , row_groups_stats : list [tuple ]):
10- """
11- Helper to create a mock ParquetFile with specified row group statistics.
12-
13- Args:
14- column_name: Name of the column to sort by
15- row_groups_stats: List of (min, max) tuples for each row group
16- """
10+ def _create_mock_parquet_file (column_name : str , row_groups_stats : list [tuple [str , str ]]):
1711 mock_pf = MagicMock ()
18- mock_pf .schema .names = [column_name , 'data' ]
12+ mock_pf .schema .names = [column_name ]
1913 mock_pf .num_row_groups = len (row_groups_stats )
2014
2115 mock_row_groups = []
@@ -31,180 +25,46 @@ def _create_mock_parquet_file(column_name: str, row_groups_stats: list[tuple]):
3125 return mock_pf
3226
3327
34- # Tests for sorted row groups
35-
def test_single_row_group_sorted():
    """A file with exactly one row group is reported as sorted.

    The reported minimum and maximum must be the statistics of that
    single row group.
    """
    column = 'url_surtkey'
    lowest, highest = 'a', 'b'

    parquet_file = _create_mock_parquet_file(column, [(lowest, highest)])
    result = are_parquet_file_row_groups_sorted(parquet_file, column_name=column)
    is_sorted, min_val, max_val = result

    assert is_sorted
    assert (min_val, max_val) == (lowest, highest)
34+
35+
def test_row_groups_sorted():
    """Every ascending prefix of disjoint row groups is reported as sorted.

    For each prefix length (one group up to the complete list) the check
    must succeed, and the returned bounds must be the minimum of the first
    group and the maximum of the last group.
    """
    all_row_groups = [('a', 'b'), ('c', 'd'), ('e', 'f'), ('g', 'h')]
    # `+ 1` so the complete list is exercised too; a plain
    # range(1, len(...)) would stop one group short.
    for n in range(1, len(all_row_groups) + 1):
        row_groups = all_row_groups[:n]
        mock_pf = _create_mock_parquet_file('url_surtkey', row_groups)
        is_sorted, min_val, max_val = are_parquet_file_row_groups_sorted(
            mock_pf, column_name='url_surtkey'
        )
        assert is_sorted
        assert min_val == row_groups[0][0]
        assert max_val == row_groups[-1][1]
46+
47+
def test_row_groups_unsorted():
    """Any reordering of disjoint ascending row groups is reported as unsorted.

    All non-identity permutations of the first 2, 3 and 4 row groups are
    checked exhaustively. This keeps the test deterministic (no random
    shuffling, so a failure is always reproducible) and guarantees that
    every possible misordering, including those of the full list, is
    covered.
    """
    # Local import: only this test needs it.
    from itertools import permutations

    all_row_groups = [('a', 'b'), ('c', 'd'), ('e', 'f'), ('g', 'h')]
    for n in range(2, len(all_row_groups) + 1):
        ascending = all_row_groups[:n]
        for candidate in permutations(ascending):
            row_groups = list(candidate)
            if row_groups == ascending:
                # The identity permutation is the sorted case; skip it.
                continue

            mock_pf = _create_mock_parquet_file('url_surtkey', row_groups)
            is_sorted, _min_val, _max_val = are_parquet_file_row_groups_sorted(
                mock_pf, column_name='url_surtkey'
            )
            assert not is_sorted, f'expected unsorted for order {row_groups}'
64+
65+
def test_row_groups_overlapping():
    """Row groups whose value ranges overlap are reported as unsorted."""
    # The second group's minimum ('b') lies below the first group's
    # maximum ('c'), so the two ranges overlap.
    overlapping = [('a', 'c'), ('b', 'd')]
    parquet_file = _create_mock_parquet_file('url_surtkey', overlapping)
    is_sorted, _, _ = are_parquet_file_row_groups_sorted(
        parquet_file, column_name='url_surtkey'
    )
    assert not is_sorted
0 commit comments