Commit b74521e

fix avro writer writing null; add nullable int column partition test
1 parent fcd94fd commit b74521e

3 files changed: +67 -126 lines changed


Makefile

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ test-integration:
 	docker-compose -f dev/docker-compose-integration.yml up -d
 	sleep 5
 	docker-compose -f dev/docker-compose-integration.yml exec -T spark-iceberg ipython ./provision.py
-	poetry run pytest tests/ -v -m newyork ${PYTEST_ARGS} -s
+	poetry run pytest tests/ -v -m integration ${PYTEST_ARGS} -s

 test-integration-rebuild:
 	docker-compose -f dev/docker-compose-integration.yml kill
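
The renamed marker in the Makefile lines up with the @pytest.mark.integration decorators added in the test module below; tests still carrying the old newyork marker are no longer selected by `-m integration`. A minimal, self-contained sketch of how marker-based selection behaves (test names are illustrative, not from this repository):

import pytest


@pytest.mark.integration  # selected by `pytest -m integration`
def test_selected_by_marker() -> None:
    assert True


@pytest.mark.newyork  # old marker name; no longer selected
def test_not_selected_anymore() -> None:
    assert True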

pyiceberg/manifest.py

Lines changed: 1 addition & 0 deletions
@@ -308,6 +308,7 @@ def data_file_with_partition(partition_type: StructType, format_version: Literal
             field_id=field.field_id,
             name=field.name,
             field_type=partition_field_to_data_file_partition_field(field.field_type),
+            required=False,
         )
         for field in partition_type.fields
     ])
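
Context for the one-line change above: with an identity partition on a nullable column, a data file's partition value can itself be null, so the generated partition field has to be optional or the Avro writer fails when it hits the null (the "writing null" fix in the commit message). A minimal sketch of the resulting field shape using pyiceberg's public types (the field id and name here are illustrative):

from pyiceberg.types import IntegerType, NestedField, StructType

# Partition struct for an identity partition on a nullable "int" column;
# required=False mirrors the change above so a null partition value is legal.
partition_type = StructType(
    NestedField(field_id=1000, name="int", field_type=IntegerType(), required=False),
)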

tests/integration/test_partitioned_writes.py

Lines changed: 65 additions & 125 deletions
@@ -88,29 +88,6 @@ def catalog() -> Catalog:
     ],
 }

-TEST_DATA_WITHOUT_NULL = {
-    'bool': [False, True, True],
-    'string': ['a', 'z', 'z'],
-    # Go over the 16 bytes to kick in truncation
-    'string_long': ['a' * 22, 'a' * 22, 'z' * 22],
-    'int': [1, 1, 9],
-    'long': [1, 1, 9],
-    'float': [0.0, 0.0, 0.9],
-    'double': [0.0, 0.0, 0.9],
-    'timestamp': [datetime(2023, 1, 1, 19, 25, 00), datetime(2023, 1, 1, 19, 25, 00), datetime(2023, 3, 1, 19, 25, 00)],
-    'timestamptz': [datetime(2023, 1, 1, 19, 25, 00), datetime(2023, 1, 1, 19, 25, 00), datetime(2023, 3, 1, 19, 25, 00)],
-    'date': [date(2023, 1, 1), date(2023, 1, 1), date(2023, 3, 1)],
-    # Not supported by Spark
-    # 'time': [time(1, 22, 0), None, time(19, 25, 0)],
-    # Not natively supported by Arrow
-    # 'uuid': [uuid.UUID('00000000-0000-0000-0000-000000000000').bytes, None, uuid.UUID('11111111-1111-1111-1111-111111111111').bytes],
-    'binary': [b'\01', b'\01', b'\22'],
-    'fixed': [
-        uuid.UUID('00000000-0000-0000-0000-000000000000').bytes,
-        uuid.UUID('00000000-0000-0000-0000-000000000000').bytes,
-        uuid.UUID('11111111-1111-1111-1111-111111111111').bytes,
-    ],
-}

 TABLE_SCHEMA = Schema(
     NestedField(field_id=1, name="bool", field_type=BooleanType(), required=False),
@@ -144,30 +121,6 @@ def session_catalog() -> Catalog:
 )


-@pytest.fixture(scope="session")
-def arrow_table_without_null() -> pa.Table:
-    """PyArrow table with all kinds of columns"""
-    pa_schema = pa.schema([
-        ("bool", pa.bool_()),
-        ("string", pa.string()),
-        ("string_long", pa.string()),
-        ("int", pa.int32()),
-        ("long", pa.int64()),
-        ("float", pa.float32()),
-        ("double", pa.float64()),
-        ("timestamp", pa.timestamp(unit="us")),
-        ("timestamptz", pa.timestamp(unit="us", tz="UTC")),
-        ("date", pa.date32()),
-        # Not supported by Spark
-        # ("time", pa.time64("us")),
-        # Not natively supported by Arrow
-        # ("uuid", pa.fixed(16)),
-        ("binary", pa.binary()),
-        ("fixed", pa.binary(16)),
-    ])
-    return pa.Table.from_pydict(TEST_DATA_WITHOUT_NULL, schema=pa_schema)
-
-
 @pytest.fixture(scope="session")
 def arrow_table_with_null() -> pa.Table:
     """PyArrow table with all kinds of columns"""
@@ -191,10 +144,10 @@ def arrow_table_with_null() -> pa.Table:
     ])
     return pa.Table.from_pydict(TEST_DATA_WITH_NULL, schema=pa_schema)

-# stub
+
 @pytest.fixture(scope="session", autouse=True)
-def table_v1_without_null_partitioned(session_catalog: Catalog, arrow_table_without_null: pa.Table) -> None:
-    identifier = "default.arrow_table_v1_without_null_partitioned"
+def table_v1_with_null_partitioned(session_catalog: Catalog, arrow_table_with_null: pa.Table, request) -> None:
+    identifier = "default.arrow_table_v1_with_null_partitioned"

     try:
         session_catalog.drop_table(identifier=identifier)
@@ -207,33 +160,14 @@ def table_v1_without_null_partitioned(session_catalog: Catalog, arrow_table_with
         partition_spec=PartitionSpec(PartitionField(source_id=4, field_id=1001, transform=IdentityTransform(), name="int")),
         properties={'format-version': '1'},
     )
-    tbl.append(arrow_table_without_null)
+    tbl.append(arrow_table_with_null)

     assert tbl.format_version == 1, f"Expected v1, got: v{tbl.format_version}"

-# # for above
-# @pytest.fixture(scope="session", autouse=True)
-# def table_v1_with_null_partitioned(session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
-#     identifier = "default.arrow_table_v1_without_null_partitioned"
-
-#     try:
-#         session_catalog.drop_table(identifier=identifier)
-#     except NoSuchTableError:
-#         pass
-
-#     tbl = session_catalog.create_table(
-#         identifier=identifier,
-#         schema=TABLE_SCHEMA,
-#         partition_spec=PartitionSpec(PartitionField(source_id=4, field_id=1001, transform=IdentityTransform(), name="int")),
-#         properties={'format-version': '1'},
-#     )
-#     tbl.append(arrow_table_with_null)
-
-#     assert tbl.format_version == 1, f"Expected v1, got: v{tbl.format_version}"

 @pytest.fixture(scope="session", autouse=True)
-def table_v1_appended_without_null_partitioned(session_catalog: Catalog, arrow_table_without_null: pa.Table) -> None:
-    identifier = "default.arrow_table_v1_appended_without_null_partitioned"
+def table_v1_appended_with_null_partitioned(session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
+    identifier = "default.arrow_table_v1_appended_with_null_partitioned"

     try:
         session_catalog.drop_table(identifier=identifier)
@@ -243,14 +177,14 @@ def table_v1_appended_without_null_partitioned(session_catalog: Catalog, arrow_t
     tbl = session_catalog.create_table(identifier=identifier, schema=TABLE_SCHEMA, properties={'format-version': '1'})

     for _ in range(2):
-        tbl.append(arrow_table_without_null)
+        tbl.append(arrow_table_with_null)

     assert tbl.format_version == 1, f"Expected v1, got: v{tbl.format_version}"


 @pytest.fixture(scope="session", autouse=True)
-def table_v2_without_null_partitioned(session_catalog: Catalog, arrow_table_without_null: pa.Table) -> None:
-    identifier = "default.arrow_table_v2_without_null_partitioned"
+def table_v2_with_null_partitioned(session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
+    identifier = "default.arrow_table_v2_with_null_partitioned"

     try:
         session_catalog.drop_table(identifier=identifier)
@@ -263,14 +197,14 @@ def table_v2_without_null_partitioned(session_catalog: Catalog, arrow_table_with
         partition_spec=PartitionSpec(PartitionField(source_id=4, field_id=1001, transform=IdentityTransform(), name="int")),
         properties={'format-version': '2'},
     )
-    tbl.append(arrow_table_without_null)
+    tbl.append(arrow_table_with_null)

     assert tbl.format_version == 2, f"Expected v2, got: v{tbl.format_version}"


 @pytest.fixture(scope="session", autouse=True)
-def table_v2_appended_without_null_partitioned(session_catalog: Catalog, arrow_table_without_null: pa.Table) -> None:
-    identifier = "default.arrow_table_v2_appended_without_null_partitioned"
+def table_v2_appended_with_null_partitioned(session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
+    identifier = "default.arrow_table_v2_appended_with_null_partitioned"

     try:
         session_catalog.drop_table(identifier=identifier)
@@ -280,32 +214,14 @@ def table_v2_appended_without_null_partitioned(session_catalog: Catalog, arrow_t
     tbl = session_catalog.create_table(identifier=identifier, schema=TABLE_SCHEMA, properties={'format-version': '2'})

     for _ in range(2):
-        tbl.append(arrow_table_without_null)
+        tbl.append(arrow_table_with_null)

     assert tbl.format_version == 2, f"Expected v1, got: v{tbl.format_version}"


-@pytest.mark.newyork
-@pytest.mark.parametrize("col", TEST_DATA_WITHOUT_NULL.keys())
-@pytest.mark.parametrize("format_version", [1, 2])
-def test_query_filter_null(spark: SparkSession, col: str, format_version: int) -> None:
-    identifier = f"default.arrow_table_v{format_version}_without_null_partitioned"
-    df = spark.table(identifier)
-    assert df.where(f"{col} is not null").count() == 3, f"Expected 3 rows for {col}"
-
-
-@pytest.mark.adrian
-@pytest.mark.parametrize("col", TEST_DATA_WITHOUT_NULL.keys())
-@pytest.mark.parametrize("format_version", [1, 2])
-def test_query_filter_appended_null_partitioned(spark: SparkSession, col: str, format_version: int) -> None:
-    identifier = f"default.arrow_table_v{format_version}_appended_without_null_partitioned"
-    df = spark.table(identifier)
-    assert df.where(f"{col} is not null").count() == 6, f"Expected 6 rows for {col}"
-
-
 @pytest.fixture(scope="session", autouse=True)
-def table_v1_v2_appended_without_null(session_catalog: Catalog, arrow_table_without_null: pa.Table) -> None:
-    identifier = "default.arrow_table_v1_v2_appended_without_null"
+def table_v1_v2_appended_with_null(session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
+    identifier = "default.arrow_table_v1_v2_appended_with_null"

     try:
         session_catalog.drop_table(identifier=identifier)
@@ -318,17 +234,38 @@ def table_v1_v2_appended_without_null(session_catalog: Catalog, arrow_table_with
         partition_spec=PartitionSpec(PartitionField(source_id=4, field_id=1001, transform=IdentityTransform(), name="int")),
         properties={'format-version': '1'},
     )
-    tbl.append(arrow_table_without_null)
+    tbl.append(arrow_table_with_null)

     assert tbl.format_version == 1, f"Expected v1, got: v{tbl.format_version}"

     with tbl.transaction() as tx:
         tx.upgrade_table_version(format_version=2)

-    tbl.append(arrow_table_without_null)
+    tbl.append(arrow_table_with_null)

     assert tbl.format_version == 2, f"Expected v2, got: v{tbl.format_version}"

+# todo parametrize partition for each of the columns
+@pytest.mark.integration
+@pytest.mark.parametrize("col", TEST_DATA_WITH_NULL.keys())
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_query_filter_null_partitioned(spark: SparkSession, col: str, format_version: int) -> None:
+    identifier = f"default.arrow_table_v{format_version}_with_null_partitioned"
+    df = spark.table(identifier)
+    assert df.where(f"{col} is not null").count() == 2, f"Expected 2 rows for {col}"
+
+
+# todo parametrize partition for each of the columns
+@pytest.mark.integration
+@pytest.mark.parametrize("col", TEST_DATA_WITH_NULL.keys())
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_query_filter_appended_null_partitioned(spark: SparkSession, col: str, format_version: int) -> None:
+    identifier = f"default.arrow_table_v{format_version}_appended_with_null_partitioned"
+    df = spark.table(identifier)
+    assert df.where(f"{col} is not null").count() == 4, f"Expected 4 rows for {col}"
+
+
+

 @pytest.fixture(scope="session")
 def spark() -> SparkSession:
@@ -363,17 +300,18 @@ def spark() -> SparkSession:

     return spark

-
-@pytest.mark.adrian
-@pytest.mark.parametrize("col", TEST_DATA_WITHOUT_NULL.keys())
+# todo parametrize partition for each of the columns
+@pytest.mark.integration
+@pytest.mark.parametrize("col", TEST_DATA_WITH_NULL.keys())
 def test_query_filter_v1_v2_append_null(spark: SparkSession, col: str) -> None:
-    identifier = "default.arrow_table_v1_v2_appended_without_null"
+    identifier = "default.arrow_table_v1_v2_appended_with_null"
     df = spark.table(identifier)
-    assert df.where(f"{col} is not null").count() == 6, f"Expected 3 row for {col}"
+    assert df.where(f"{col} is not null").count() == 4, f"Expected 4 rows for {col}"
+


-@pytest.mark.adrian
-def test_summaries(spark: SparkSession, session_catalog: Catalog, arrow_table_without_null: pa.Table) -> None:
+@pytest.mark.integration
+def test_summaries_with_null(spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
     identifier = "default.arrow_table_summaries"

     try:
@@ -387,8 +325,8 @@ def test_summaries(spark: SparkSession, session_catalog: Catalog, arrow_table_wi
         properties={'format-version': '2'},
     )

-    tbl.append(arrow_table_without_null)
-    tbl.append(arrow_table_without_null)
+    tbl.append(arrow_table_with_null)
+    tbl.append(arrow_table_with_null)

     rows = spark.sql(
         f"""
@@ -404,32 +342,33 @@ def test_summaries(spark: SparkSession, session_catalog: Catalog, arrow_table_wi
     summaries = [row.summary for row in rows]

     assert summaries[0] == {
-        'added-data-files': '2',
-        'added-files-size': '10433',
+        'added-data-files': '3',
+        'added-files-size': '14471',
         'added-records': '3',
-        'total-data-files': '2',
+        'total-data-files': '3',
         'total-delete-files': '0',
         'total-equality-deletes': '0',
-        'total-files-size': '10433',
+        'total-files-size': '14471',
         'total-position-deletes': '0',
         'total-records': '3',
     }

     assert summaries[1] == {
-        'added-data-files': '2',
-        'added-files-size': '10433',
+        'added-data-files': '3',
+        'added-files-size': '14471',
         'added-records': '3',
-        'total-data-files': '4',
+        'total-data-files': '6',
         'total-delete-files': '0',
         'total-equality-deletes': '0',
-        'total-files-size': '20866',
+        'total-files-size': '28942',
         'total-position-deletes': '0',
         'total-records': '6',
     }


-@pytest.mark.adrian
-def test_data_files(spark: SparkSession, session_catalog: Catalog, arrow_table_without_null: pa.Table) -> None:
+# todo parametrize partition for each of the columns
+@pytest.mark.integration
+def test_data_files_with_null(spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
     identifier = "default.arrow_data_files"

     try:
@@ -443,8 +382,8 @@ def test_data_files(spark: SparkSession, session_catalog: Catalog, arrow_table_w
         properties={'format-version': '1'},
     )

-    tbl.append(arrow_table_without_null)
-    tbl.append(arrow_table_without_null)
+    tbl.append(arrow_table_with_null)
+    tbl.append(arrow_table_with_null)

     # added_data_files_count, existing_data_files_count, deleted_data_files_count
     rows = spark.sql(
@@ -454,7 +393,7 @@ def test_data_files(spark: SparkSession, session_catalog: Catalog, arrow_table_w
         """
     ).collect()

-    assert [row.added_data_files_count for row in rows] == [2, 2, 2]
+    assert [row.added_data_files_count for row in rows] == [3, 3, 3]
     assert [row.existing_data_files_count for row in rows] == [
         0,
         0,
@@ -463,8 +402,9 @@ def test_data_files(spark: SparkSession, session_catalog: Catalog, arrow_table_w
     assert [row.deleted_data_files_count for row in rows] == [0, 0, 0]


-@pytest.mark.adrian
-def test_invalid_arguments(spark: SparkSession, session_catalog: Catalog, arrow_table_without_null: pa.Table) -> None:
+# note: this test probably does not need to be duplicated for the null case
+@pytest.mark.integration
+def test_invalid_arguments(spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
     identifier = "default.arrow_data_files"

     try:
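
A quick sanity check on the numbers asserted in the rewritten tests, assuming the TEST_DATA_WITH_NULL fixture (defined earlier in this module, outside this diff) holds three rows per column with exactly one null, and that the nullable "int" partition column therefore spans three identity partitions including the null one (the values below are illustrative, not copied from the fixture):

# Hypothetical partition-column values standing in for TEST_DATA_WITH_NULL["int"].
int_column = [1, None, 9]

rows_per_append = len(int_column)                             # 3 rows written per append
non_null_per_append = sum(v is not None for v in int_column)
assert non_null_per_append == 2                               # "Expected 2 rows" after one append
assert 2 * non_null_per_append == 4                           # "Expected 4 rows" after two appends

# Identity partitioning writes one data file per distinct partition value, and null
# forms its own partition, which is consistent with the 3 added data files per append
# (6 total after two appends) asserted in the summaries and data-files tests.
distinct_partition_values = {v for v in int_column}
assert len(distinct_partition_values) == 3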
