@@ -88,29 +88,6 @@ def catalog() -> Catalog:
     ],
 }

-TEST_DATA_WITHOUT_NULL = {
-    'bool': [False, True, True],
-    'string': ['a', 'z', 'z'],
-    # Go over the 16 bytes to kick in truncation
-    'string_long': ['a' * 22, 'a' * 22, 'z' * 22],
-    'int': [1, 1, 9],
-    'long': [1, 1, 9],
-    'float': [0.0, 0.0, 0.9],
-    'double': [0.0, 0.0, 0.9],
-    'timestamp': [datetime(2023, 1, 1, 19, 25, 00), datetime(2023, 1, 1, 19, 25, 00), datetime(2023, 3, 1, 19, 25, 00)],
-    'timestamptz': [datetime(2023, 1, 1, 19, 25, 00), datetime(2023, 1, 1, 19, 25, 00), datetime(2023, 3, 1, 19, 25, 00)],
-    'date': [date(2023, 1, 1), date(2023, 1, 1), date(2023, 3, 1)],
-    # Not supported by Spark
-    # 'time': [time(1, 22, 0), None, time(19, 25, 0)],
-    # Not natively supported by Arrow
-    # 'uuid': [uuid.UUID('00000000-0000-0000-0000-000000000000').bytes, None, uuid.UUID('11111111-1111-1111-1111-111111111111').bytes],
-    'binary': [b'\01', b'\01', b'\22'],
-    'fixed': [
-        uuid.UUID('00000000-0000-0000-0000-000000000000').bytes,
-        uuid.UUID('00000000-0000-0000-0000-000000000000').bytes,
-        uuid.UUID('11111111-1111-1111-1111-111111111111').bytes,
-    ],
-}

 TABLE_SCHEMA = Schema(
     NestedField(field_id=1, name="bool", field_type=BooleanType(), required=False),
@@ -144,30 +121,6 @@ def session_catalog() -> Catalog:
     )


-@pytest.fixture(scope="session")
-def arrow_table_without_null() -> pa.Table:
-    """PyArrow table with all kinds of columns"""
-    pa_schema = pa.schema([
-        ("bool", pa.bool_()),
-        ("string", pa.string()),
-        ("string_long", pa.string()),
-        ("int", pa.int32()),
-        ("long", pa.int64()),
-        ("float", pa.float32()),
-        ("double", pa.float64()),
-        ("timestamp", pa.timestamp(unit="us")),
-        ("timestamptz", pa.timestamp(unit="us", tz="UTC")),
-        ("date", pa.date32()),
-        # Not supported by Spark
-        # ("time", pa.time64("us")),
-        # Not natively supported by Arrow
-        # ("uuid", pa.fixed(16)),
-        ("binary", pa.binary()),
-        ("fixed", pa.binary(16)),
-    ])
-    return pa.Table.from_pydict(TEST_DATA_WITHOUT_NULL, schema=pa_schema)
-
-
 @pytest.fixture(scope="session")
 def arrow_table_with_null() -> pa.Table:
     """PyArrow table with all kinds of columns"""
@@ -191,10 +144,10 @@ def arrow_table_with_null() -> pa.Table:
     ])
     return pa.Table.from_pydict(TEST_DATA_WITH_NULL, schema=pa_schema)

-# stub
+
 @pytest.fixture(scope="session", autouse=True)
-def table_v1_without_null_partitioned(session_catalog: Catalog, arrow_table_without_null: pa.Table) -> None:
-    identifier = "default.arrow_table_v1_without_null_partitioned"
+def table_v1_with_null_partitioned(session_catalog: Catalog, arrow_table_with_null: pa.Table, request) -> None:
+    identifier = "default.arrow_table_v1_with_null_partitioned"

     try:
         session_catalog.drop_table(identifier=identifier)
@@ -207,33 +160,14 @@ def table_v1_without_null_partitioned(session_catalog: Catalog, arrow_table_with
         partition_spec=PartitionSpec(PartitionField(source_id=4, field_id=1001, transform=IdentityTransform(), name="int")),
         properties={'format-version': '1'},
     )
-    tbl.append(arrow_table_without_null)
+    tbl.append(arrow_table_with_null)

     assert tbl.format_version == 1, f"Expected v1, got: v{tbl.format_version}"

-# # for above
-# @pytest.fixture(scope="session", autouse=True)
-# def table_v1_with_null_partitioned(session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
-# identifier = "default.arrow_table_v1_without_null_partitioned"
-
-# try:
-# session_catalog.drop_table(identifier=identifier)
-# except NoSuchTableError:
-# pass
-
-# tbl = session_catalog.create_table(
-# identifier=identifier,
-# schema=TABLE_SCHEMA,
-# partition_spec=PartitionSpec(PartitionField(source_id=4, field_id=1001, transform=IdentityTransform(), name="int")),
-# properties={'format-version': '1'},
-# )
-# tbl.append(arrow_table_with_null)
-
-# assert tbl.format_version == 1, f"Expected v1, got: v{tbl.format_version}"

 @pytest.fixture(scope="session", autouse=True)
-def table_v1_appended_without_null_partitioned(session_catalog: Catalog, arrow_table_without_null: pa.Table) -> None:
-    identifier = "default.arrow_table_v1_appended_without_null_partitioned"
+def table_v1_appended_with_null_partitioned(session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
+    identifier = "default.arrow_table_v1_appended_with_null_partitioned"

     try:
         session_catalog.drop_table(identifier=identifier)
@@ -243,14 +177,14 @@ def table_v1_appended_without_null_partitioned(session_catalog: Catalog, arrow_t
     tbl = session_catalog.create_table(identifier=identifier, schema=TABLE_SCHEMA, properties={'format-version': '1'})

     for _ in range(2):
-        tbl.append(arrow_table_without_null)
+        tbl.append(arrow_table_with_null)

     assert tbl.format_version == 1, f"Expected v1, got: v{tbl.format_version}"


 @pytest.fixture(scope="session", autouse=True)
-def table_v2_without_null_partitioned(session_catalog: Catalog, arrow_table_without_null: pa.Table) -> None:
-    identifier = "default.arrow_table_v2_without_null_partitioned"
+def table_v2_with_null_partitioned(session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
+    identifier = "default.arrow_table_v2_with_null_partitioned"

     try:
         session_catalog.drop_table(identifier=identifier)
@@ -263,14 +197,14 @@ def table_v2_without_null_partitioned(session_catalog: Catalog, arrow_table_with
         partition_spec=PartitionSpec(PartitionField(source_id=4, field_id=1001, transform=IdentityTransform(), name="int")),
         properties={'format-version': '2'},
     )
-    tbl.append(arrow_table_without_null)
+    tbl.append(arrow_table_with_null)

     assert tbl.format_version == 2, f"Expected v2, got: v{tbl.format_version}"


 @pytest.fixture(scope="session", autouse=True)
-def table_v2_appended_without_null_partitioned(session_catalog: Catalog, arrow_table_without_null: pa.Table) -> None:
-    identifier = "default.arrow_table_v2_appended_without_null_partitioned"
+def table_v2_appended_with_null_partitioned(session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
+    identifier = "default.arrow_table_v2_appended_with_null_partitioned"

     try:
         session_catalog.drop_table(identifier=identifier)
@@ -280,32 +214,14 @@ def table_v2_appended_without_null_partitioned(session_catalog: Catalog, arrow_t
     tbl = session_catalog.create_table(identifier=identifier, schema=TABLE_SCHEMA, properties={'format-version': '2'})

     for _ in range(2):
-        tbl.append(arrow_table_without_null)
+        tbl.append(arrow_table_with_null)

     assert tbl.format_version == 2, f"Expected v2, got: v{tbl.format_version}"


-@pytest.mark.newyork
-@pytest.mark.parametrize("col", TEST_DATA_WITHOUT_NULL.keys())
-@pytest.mark.parametrize("format_version", [1, 2])
-def test_query_filter_null(spark: SparkSession, col: str, format_version: int) -> None:
-    identifier = f"default.arrow_table_v{format_version}_without_null_partitioned"
-    df = spark.table(identifier)
-    assert df.where(f"{col} is not null").count() == 3, f"Expected 3 rows for {col}"
-
-
-@pytest.mark.adrian
-@pytest.mark.parametrize("col", TEST_DATA_WITHOUT_NULL.keys())
-@pytest.mark.parametrize("format_version", [1, 2])
-def test_query_filter_appended_null_partitioned(spark: SparkSession, col: str, format_version: int) -> None:
-    identifier = f"default.arrow_table_v{format_version}_appended_without_null_partitioned"
-    df = spark.table(identifier)
-    assert df.where(f"{col} is not null").count() == 6, f"Expected 6 rows for {col}"
-
-
 @pytest.fixture(scope="session", autouse=True)
-def table_v1_v2_appended_without_null(session_catalog: Catalog, arrow_table_without_null: pa.Table) -> None:
-    identifier = "default.arrow_table_v1_v2_appended_without_null"
+def table_v1_v2_appended_with_null(session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
+    identifier = "default.arrow_table_v1_v2_appended_with_null"

     try:
         session_catalog.drop_table(identifier=identifier)
@@ -318,17 +234,38 @@ def table_v1_v2_appended_without_null(session_catalog: Catalog, arrow_table_with
         partition_spec=PartitionSpec(PartitionField(source_id=4, field_id=1001, transform=IdentityTransform(), name="int")),
         properties={'format-version': '1'},
     )
-    tbl.append(arrow_table_without_null)
+    tbl.append(arrow_table_with_null)

     assert tbl.format_version == 1, f"Expected v1, got: v{tbl.format_version}"

     with tbl.transaction() as tx:
         tx.upgrade_table_version(format_version=2)

-    tbl.append(arrow_table_without_null)
+    tbl.append(arrow_table_with_null)

     assert tbl.format_version == 2, f"Expected v2, got: v{tbl.format_version}"

+# todo parametrize partition for each of the columns
+@pytest.mark.integration
+@pytest.mark.parametrize("col", TEST_DATA_WITH_NULL.keys())
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_query_filter_null_partitioned(spark: SparkSession, col: str, format_version: int) -> None:
+    identifier = f"default.arrow_table_v{format_version}_with_null_partitioned"
+    df = spark.table(identifier)
+    assert df.where(f"{col} is not null").count() == 2, f"Expected 2 rows for {col}"
+
+
+# todo parametrize partition for each of the columns
+@pytest.mark.integration
+@pytest.mark.parametrize("col", TEST_DATA_WITH_NULL.keys())
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_query_filter_appended_null_partitioned(spark: SparkSession, col: str, format_version: int) -> None:
+    identifier = f"default.arrow_table_v{format_version}_appended_with_null_partitioned"
+    df = spark.table(identifier)
+    assert df.where(f"{col} is not null").count() == 4, f"Expected 4 rows for {col}"
+
+
+

 @pytest.fixture(scope="session")
 def spark() -> SparkSession:
@@ -363,17 +300,18 @@ def spark() -> SparkSession:

     return spark

-
-@pytest.mark.adrian
-@pytest.mark.parametrize("col", TEST_DATA_WITHOUT_NULL.keys())
+# todo parametrize partition for each of the columns
+@pytest.mark.integration
+@pytest.mark.parametrize("col", TEST_DATA_WITH_NULL.keys())
 def test_query_filter_v1_v2_append_null(spark: SparkSession, col: str) -> None:
-    identifier = "default.arrow_table_v1_v2_appended_without_null"
+    identifier = "default.arrow_table_v1_v2_appended_with_null"
     df = spark.table(identifier)
-    assert df.where(f"{col} is not null").count() == 6, f"Expected 3 row for {col}"
+    assert df.where(f"{col} is not null").count() == 4, f"Expected 4 rows for {col}"
+


-@pytest.mark.adrian
-def test_summaries(spark: SparkSession, session_catalog: Catalog, arrow_table_without_null: pa.Table) -> None:
+@pytest.mark.integration
+def test_summaries_with_null(spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
     identifier = "default.arrow_table_summaries"

     try:
@@ -387,8 +325,8 @@ def test_summaries(spark: SparkSession, session_catalog: Catalog, arrow_table_wi
         properties={'format-version': '2'},
     )

-    tbl.append(arrow_table_without_null)
-    tbl.append(arrow_table_without_null)
+    tbl.append(arrow_table_with_null)
+    tbl.append(arrow_table_with_null)

     rows = spark.sql(
         f"""
@@ -404,32 +342,33 @@ def test_summaries(spark: SparkSession, session_catalog: Catalog, arrow_table_wi
     summaries = [row.summary for row in rows]

     assert summaries[0] == {
-        'added-data-files': '2',
-        'added-files-size': '10433',
+        'added-data-files': '3',
+        'added-files-size': '14471',
         'added-records': '3',
-        'total-data-files': '2',
+        'total-data-files': '3',
         'total-delete-files': '0',
         'total-equality-deletes': '0',
-        'total-files-size': '10433',
+        'total-files-size': '14471',
         'total-position-deletes': '0',
         'total-records': '3',
     }

     assert summaries[1] == {
-        'added-data-files': '2',
-        'added-files-size': '10433',
+        'added-data-files': '3',
+        'added-files-size': '14471',
         'added-records': '3',
-        'total-data-files': '4',
+        'total-data-files': '6',
         'total-delete-files': '0',
         'total-equality-deletes': '0',
-        'total-files-size': '20866',
+        'total-files-size': '28942',
         'total-position-deletes': '0',
         'total-records': '6',
     }


-@pytest.mark.adrian
-def test_data_files(spark: SparkSession, session_catalog: Catalog, arrow_table_without_null: pa.Table) -> None:
+# todo parametrize partition for each of the columns
+@pytest.mark.integration
+def test_data_files_with_null(spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
     identifier = "default.arrow_data_files"

     try:
@@ -443,8 +382,8 @@ def test_data_files(spark: SparkSession, session_catalog: Catalog, arrow_table_w
         properties={'format-version': '1'},
     )

-    tbl.append(arrow_table_without_null)
-    tbl.append(arrow_table_without_null)
+    tbl.append(arrow_table_with_null)
+    tbl.append(arrow_table_with_null)

     # added_data_files_count, existing_data_files_count, deleted_data_files_count
     rows = spark.sql(
@@ -454,7 +393,7 @@ def test_data_files(spark: SparkSession, session_catalog: Catalog, arrow_table_w
         """
     ).collect()

-    assert [row.added_data_files_count for row in rows] == [2, 2, 2]
+    assert [row.added_data_files_count for row in rows] == [3, 3, 3]
     assert [row.existing_data_files_count for row in rows] == [
         0,
         0,
@@ -463,8 +402,9 @@ def test_data_files(spark: SparkSession, session_catalog: Catalog, arrow_table_w
     assert [row.deleted_data_files_count for row in rows] == [0, 0, 0]


-@pytest.mark.adrian
-def test_invalid_arguments(spark: SparkSession, session_catalog: Catalog, arrow_table_without_null: pa.Table) -> None:
+# This test does not need to be duplicated for the with-null data.
+@pytest.mark.integration
+def test_invalid_arguments(spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
     identifier = "default.arrow_data_files"

     try: