From 1b750f3a71e052cfb69ef61840bd28ff4af9d68c Mon Sep 17 00:00:00 2001
From: Dave Rigby
Date: Fri, 9 Feb 2024 12:26:36 +0000
Subject: [PATCH] Add dataset_validation tests

Add tests which validate all public datasets. These are added to
their own directory as they can be slow to run and need a large
amount of RAM to hold each dataset.

The first test added (test_all_datasets_valid) performs some basic
validation of each dataset:

- Does the number of vectors in the data files match what the
  metadata says?
- Are there any duplicate ids?

This only checks datasets with 2M or fewer vectors, as larger ones
require more than 32GB of RAM to load and validate. This currently
means 2 datasets are skipped:

* Skipping dataset 'ANN_DEEP1B_d96_angular' which is larger than
  2,000,000 vectors (has 9,990,000)
* Skipping dataset 'msmarco-v1-bm25-allMiniLML6V2' which is larger
  than 2,000,000 vectors (has 8,841,823)
---
 .gitignore                                    |  5 +++
 tests/dataset_validation/README.md            |  6 ++++
 tests/dataset_validation/__init__.py          |  0
 .../test_validate_public_datasets.py          | 33 +++++++++++++++++++
 4 files changed, 44 insertions(+)
 create mode 100644 tests/dataset_validation/README.md
 create mode 100644 tests/dataset_validation/__init__.py
 create mode 100644 tests/dataset_validation/test_validate_public_datasets.py

diff --git a/.gitignore b/.gitignore
index 563dcc3..2742801 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,8 @@ scratchpad.ipynb
 .pytest_cache/
 .coverage
 poetry.lock
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
diff --git a/tests/dataset_validation/README.md b/tests/dataset_validation/README.md
new file mode 100644
index 0000000..8625011
--- /dev/null
+++ b/tests/dataset_validation/README.md
@@ -0,0 +1,6 @@
+## Dataset Validation Tests
+
+This directory contains tests which perform validation on
+datasets. These can be costly as they require downloading and checking
+data in each dataset, so they are located in a separate directory
+from the main system tests.
diff --git a/tests/dataset_validation/__init__.py b/tests/dataset_validation/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/dataset_validation/test_validate_public_datasets.py b/tests/dataset_validation/test_validate_public_datasets.py
new file mode 100644
index 0000000..ae78176
--- /dev/null
+++ b/tests/dataset_validation/test_validate_public_datasets.py
@@ -0,0 +1,33 @@
+import pytest
+import pinecone_datasets
+
+
+def pytest_generate_tests(metafunc):
+    # Discover the set of datasets in the public repo, populating the
+    # 'dataset' parameter with them all.
+    metafunc.parametrize("dataset", pinecone_datasets.list_datasets())
+
+
+def test_all_datasets_valid(dataset):
+    """For the given dataset, check we can successfully load it from cloud
+    storage (i.e. metadata checks pass and the necessary files are present)."""
+    ds = pinecone_datasets.load_dataset(dataset)
+    # Ideally we would check every dataset, but some are _very_ big and would
+    # OOM-kill a typical VM.
+    if ds.metadata.documents > 2_000_000:
+        pytest.skip(
+            f"Skipping dataset '{dataset}' which is larger than 2,000,000 vectors (has {ds.metadata.documents:,})"
+        )
+    df = ds.documents
+    duplicates = df[df["id"].duplicated()]
+    num_duplicates = len(duplicates)
+    if num_duplicates:
+        print("Summary of duplicate IDs in vectors:")
+        print(duplicates)
+    assert (
+        num_duplicates == 0
+    ), f"Not all vector ids are unique - found {num_duplicates} duplicates out of {len(df)} total vectors"
+
+    assert ds.metadata.documents == len(
+        df
+    ), f"Count of vectors found in Dataset file ({len(df)}) does not match count in metadata ({ds.metadata.documents})"
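
For reference, the same checks can also be exercised outside pytest as a
standalone script. The following is a minimal sketch, not part of the patch,
assuming the pinecone_datasets package is installed; it reuses only the calls
already used by the test above (list_datasets, load_dataset, metadata.documents,
documents) and the same 2,000,000-vector cutoff motivated by RAM usage.

import pinecone_datasets

# Datasets above this size need more than 32GB of RAM to load and validate.
MAX_DOCUMENTS = 2_000_000


def validate(name: str) -> None:
    ds = pinecone_datasets.load_dataset(name)
    if ds.metadata.documents > MAX_DOCUMENTS:
        print(f"Skipping dataset '{name}' (has {ds.metadata.documents:,} vectors)")
        return
    df = ds.documents
    # Check for duplicate vector ids.
    num_duplicates = int(df["id"].duplicated().sum())
    assert num_duplicates == 0, (
        f"{name}: found {num_duplicates} duplicate ids out of {len(df)} vectors"
    )
    # Check the vector count in the data files matches the metadata.
    assert ds.metadata.documents == len(df), (
        f"{name}: metadata says {ds.metadata.documents:,} vectors, found {len(df):,}"
    )
    print(f"Dataset '{name}' OK ({len(df):,} vectors)")


if __name__ == "__main__":
    for name in pinecone_datasets.list_datasets():
        validate(name)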