From 1b750f3a71e052cfb69ef61840bd28ff4af9d68c Mon Sep 17 00:00:00 2001
From: Dave Rigby
Date: Fri, 9 Feb 2024 12:26:36 +0000
Subject: [PATCH] Add dataset_validation tests

Add tests which validate all public datasets. These are added to
their own directory as they can be slow to run and need a large
amount of RAM to hold each dataset.

The first test added (test_all_datasets_valid) performs some basic
validation of each dataset:

- Does the number of vectors in the data files match what the
  metadata says?
- Are there any duplicate ids?

This only checks datasets with 2M or fewer vectors, as larger ones
require more than 32GB of RAM to load and validate. This currently
means 2 datasets are skipped:

* Skipping dataset 'ANN_DEEP1B_d96_angular' which is larger than
  2,000,000 vectors (has 9,990,000)
* Skipping dataset 'msmarco-v1-bm25-allMiniLML6V2' which is larger
  than 2,000,000 vectors (has 8,841,823)
---
 .gitignore                                    |  5 +++
 tests/dataset_validation/README.md            |  6 ++++
 tests/dataset_validation/__init__.py          |  0
 .../test_validate_public_datasets.py          | 33 +++++++++++++++++++
 4 files changed, 44 insertions(+)
 create mode 100644 tests/dataset_validation/README.md
 create mode 100644 tests/dataset_validation/__init__.py
 create mode 100644 tests/dataset_validation/test_validate_public_datasets.py

diff --git a/.gitignore b/.gitignore
index 563dcc3..2742801 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,8 @@ scratchpad.ipynb
 .pytest_cache/
 .coverage
 poetry.lock
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
diff --git a/tests/dataset_validation/README.md b/tests/dataset_validation/README.md
new file mode 100644
index 0000000..8625011
--- /dev/null
+++ b/tests/dataset_validation/README.md
@@ -0,0 +1,6 @@
+## Dataset Validation Tests
+
+This directory contains tests which perform validation on
+datasets. These can be costly as they require downloading and checking
+data in each dataset, so they are located in a separate directory
+from the main system tests.
diff --git a/tests/dataset_validation/__init__.py b/tests/dataset_validation/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/dataset_validation/test_validate_public_datasets.py b/tests/dataset_validation/test_validate_public_datasets.py
new file mode 100644
index 0000000..ae78176
--- /dev/null
+++ b/tests/dataset_validation/test_validate_public_datasets.py
@@ -0,0 +1,33 @@
+import pytest
+import pinecone_datasets
+
+
+def pytest_generate_tests(metafunc):
+    # Discover the set of datasets in the public repo, populating the
+    # 'dataset' parameter with them all.
+    metafunc.parametrize("dataset", pinecone_datasets.list_datasets())
+
+
+def test_all_datasets_valid(dataset):
+    """For the given dataset, check we can successfully load it from cloud
+    storage (i.e. metadata checks pass and the necessary files are present)."""
+    ds = pinecone_datasets.load_dataset(dataset)
+    # Ideally we would check every dataset, but some are _very_ big and would
+    # OOM-kill a typical VM.
+    if ds.metadata.documents > 2_000_000:
+        pytest.skip(
+            f"Skipping dataset '{dataset}' which is larger than 2,000,000 vectors (has {ds.metadata.documents:,})"
+        )
+    df = ds.documents
+    duplicates = df[df["id"].duplicated()]
+    num_duplicates = len(duplicates)
+    if num_duplicates:
+        print("Summary of duplicate IDs in vectors:")
+        print(duplicates)
+    assert (
+        num_duplicates == 0
+    ), f"Not all vector ids are unique - found {num_duplicates} duplicates out of {len(df)} total vectors"
+
+    assert ds.metadata.documents == len(
+        df
+    ), f"Count of vectors found in Dataset file ({len(df)}) does not match count in metadata ({ds.metadata.documents})"
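
For reference, the same checks can also be exercised outside pytest as a
standalone script. The following is a minimal sketch, not part of the patch,
assuming the pinecone_datasets package is installed; it reuses only the calls
already used by the test above (list_datasets, load_dataset, metadata.documents,
documents) and the same 2,000,000-vector cutoff motivated by RAM usage.

import pinecone_datasets

# Datasets above this size need more than 32GB of RAM to load and validate.
MAX_DOCUMENTS = 2_000_000


def validate(name: str) -> None:
    ds = pinecone_datasets.load_dataset(name)
    if ds.metadata.documents > MAX_DOCUMENTS:
        print(f"Skipping dataset '{name}' (has {ds.metadata.documents:,} vectors)")
        return
    df = ds.documents
    # Check for duplicate vector ids.
    num_duplicates = int(df["id"].duplicated().sum())
    assert num_duplicates == 0, (
        f"{name}: found {num_duplicates} duplicate ids out of {len(df)} vectors"
    )
    # Check the vector count in the data files matches the metadata.
    assert ds.metadata.documents == len(df), (
        f"{name}: metadata says {ds.metadata.documents:,} vectors, found {len(df):,}"
    )
    print(f"Dataset '{name}' OK ({len(df):,} vectors)")


if __name__ == "__main__":
    for name in pinecone_datasets.list_datasets():
        validate(name)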