5 changes: 5 additions & 0 deletions .gitignore
@@ -9,3 +9,8 @@ scratchpad.ipynb
.pytest_cache/
.coverage
poetry.lock

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
6 changes: 6 additions & 0 deletions tests/dataset_validation/README.md
@@ -0,0 +1,6 @@
## Dataset Validation Tests

This directory contains tests that validate the datasets themselves. These
can be costly, as they require downloading and checking the data in each
dataset, so they are kept in a separate directory from the main system tests.
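Note: how this suite is wired into CI is not shown in this diff. As a minimal sketch (an assumption, not the repo's actual configuration), the validation tests could be run on their own, separately from the main system tests, by pointing pytest at this directory:

```python
import sys

import pytest

# Run only the dataset validation suite; the main system tests live in a
# different directory and are not collected when this path is given explicitly.
# Illustrative invocation only - the repo's actual CI wiring is not shown here.
sys.exit(pytest.main(["tests/dataset_validation", "-v"]))
```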
33 changes: 33 additions & 0 deletions tests/dataset_validation/test_validate_public_datasets.py
@@ -0,0 +1,33 @@
import pytest
import pinecone_datasets


def pytest_generate_tests(metafunc):
    # Discover the set of datasets in the public repo, populating the
    # 'dataset' parameter with them all.
    metafunc.parametrize("dataset", pinecone_datasets.list_datasets())


def test_all_datasets_valid(dataset):
    """For the given dataset, check we can successfully load it from cloud
    storage (i.e. metadata checks pass and necessary files are present)."""
    ds = pinecone_datasets.load_dataset(dataset)
    # Ideally we should check all datasets for this, but some are _very_ big
    # and OOM-kill a typical VM.
    if ds.metadata.documents > 2_000_000:
        pytest.skip(
            f"Skipping dataset '{dataset}' which is larger than 2,000,000 vectors (has {ds.metadata.documents:,})"
        )
    df = ds.documents
    duplicates = df[df["id"].duplicated()]
    num_duplicates = len(duplicates)
    if num_duplicates:
        print("Summary of duplicate IDs in vectors:")
        print(duplicates)
    assert (
        num_duplicates == 0
    ), f"Not all vector ids are unique - found {num_duplicates} duplicates out of {len(df)} total vectors"

    assert ds.metadata.documents == len(
        df
    ), f"Count of vectors found in Dataset file ({len(df)}) does not match count in metadata ({ds.metadata.documents})"