diff --git a/.gitignore b/.gitignore
index 563dcc3..2742801 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,8 @@ scratchpad.ipynb
 .pytest_cache/
 .coverage
 poetry.lock
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
diff --git a/tests/dataset_validation/README.md b/tests/dataset_validation/README.md
new file mode 100644
index 0000000..8625011
--- /dev/null
+++ b/tests/dataset_validation/README.md
@@ -0,0 +1,6 @@
+## Dataset Validation Tests
+
+This directory contains tests which perform validation on
+datasets. These can be costly as they require downloading and checking
+data in each dataset, so they are located in a separate directory
+from the main system tests.
diff --git a/tests/dataset_validation/__init__.py b/tests/dataset_validation/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/dataset_validation/test_validate_public_datasets.py b/tests/dataset_validation/test_validate_public_datasets.py
new file mode 100644
index 0000000..ae78176
--- /dev/null
+++ b/tests/dataset_validation/test_validate_public_datasets.py
@@ -0,0 +1,33 @@
+import pytest
+import pinecone_datasets
+
+
+def pytest_generate_tests(metafunc):
+    # Discover the set of datasets in the public repo, populating the
+    # 'dataset' parameter with them all.
+    metafunc.parametrize("dataset", pinecone_datasets.list_datasets())
+
+
+def test_all_datasets_valid(dataset):
+    """For the given dataset, check we can successfully load it from cloud
+    storage (i.e. metadata checks pass and necessary files are present)."""
+    ds = pinecone_datasets.load_dataset(dataset)
+    # Ideally should check all sets for this, but some are _very_ big and OOM kill
+    # a typical VM.
+    if ds.metadata.documents > 2_000_000:
+        pytest.skip(
+            f"Skipping dataset '{dataset}' which is larger than 2,000,000 vectors (has {ds.metadata.documents:,})"
+        )
+    df = ds.documents
+    duplicates = df[df["id"].duplicated()]
+    num_duplicates = len(duplicates)
+    if num_duplicates:
+        print("Summary of duplicate IDs in vectors:")
+        print(duplicates)
+    assert (
+        num_duplicates == 0
+    ), f"Not all vector ids are unique - found {num_duplicates} duplicates out of {len(df)} total vectors"
+
+    assert ds.metadata.documents == len(
+        df
+    ), f"Count of vectors found in Dataset file ({len(df)}) does not match count in metadata ({ds.metadata.documents})"
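
For reference, a minimal sketch of the check flow the new test applies to each dataset, written outside pytest. It assumes the same pinecone_datasets API the test itself uses (list_datasets, load_dataset, Dataset.metadata.documents and Dataset.documents); the choice of dataset is purely illustrative.

import pinecone_datasets

# Pick one dataset from the public repository and apply the same checks
# the parametrized test runs against every dataset.
name = pinecone_datasets.list_datasets()[0]
ds = pinecone_datasets.load_dataset(name)

# Mirror the 2,000,000-document guard above to avoid loading very large datasets.
if ds.metadata.documents <= 2_000_000:
    df = ds.documents
    # Vector ids must be unique, and the metadata count must match the data.
    assert not df["id"].duplicated().any(), "duplicate vector ids found"
    assert ds.metadata.documents == len(df), "metadata/document count mismatch"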