diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
index 37095a3..b26725d 100644
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -13,7 +13,7 @@ source .venv/bin/activate
 Install package in editable mode.
 
 ```shell
-poetry install --with dev,test,lint
+uv sync --group test
 ```
 
 Start PostgreSQL/PGVector.
@@ -22,7 +22,7 @@ docker run --rm -it --name pgvector-container \
   -e POSTGRES_USER=langchain \
   -e POSTGRES_PASSWORD=langchain \
   -e POSTGRES_DB=langchain_test \
-  -p 6024:5432 pgvector/pgvector:pg16 \
+  -p 5432:5432 pgvector/pgvector:pg16 \
   postgres -c log_statement=all
 ```
diff --git a/examples/pg_vectorstore.ipynb b/examples/pg_vectorstore.ipynb
index 2c20e90..a6e3837 100644
--- a/examples/pg_vectorstore.ipynb
+++ b/examples/pg_vectorstore.ipynb
@@ -359,7 +359,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "To enable search with filters, it is necessary to declare the columns that you want to filter on when creating the table. The vectorstore supports a set of filters that can be applied against the metadata fields of the documents.\n",
+    "To achieve performant search with filters, declare the columns you want to filter on in `metadata_columns` when creating the table; filtering directly on these columns is far more efficient than filtering on fields inside a metadata JSON column. The vectorstore supports a set of filters that can be applied against the metadata fields of the documents.\n",
     "\n",
     "`PGVectorStore` currently supports the following operators.\n",
     "\n",
diff --git a/examples/pg_vectorstore_how_to.ipynb b/examples/pg_vectorstore_how_to.ipynb
index 2c5e75a..fb38bfa 100644
--- a/examples/pg_vectorstore_how_to.ipynb
+++ b/examples/pg_vectorstore_how_to.ipynb
@@ -530,7 +530,7 @@
    "source": [
     "### Search for documents with metadata filter\n",
     "\n",
-    "A Vector Store can take advantage of relational data to filter similarity searches. The vectorstore supports a set of filters that can be applied against the metadata fields of the documents. See the [migration guide](https://github.com/langchain-ai/langchain-postgres/blob/main/examples/migrate_pgvector_to_pgvectorstore.ipynb) for details on how to migrate to use metadata columns.\n",
+    "A Vector Store can take advantage of relational data to filter similarity searches. The vectorstore supports a set of filters that can be applied against the metadata fields of the documents. See the [migration guide](https://github.com/langchain-ai/langchain-postgres/blob/main/examples/migrate_pgvector_to_pgvectorstore.ipynb) for details on how to migrate to use metadata columns for efficient filtering.\n",
     "\n",
     "`PGVectorStore` currently supports the following operators and all Postgres data types.\n",
     "\n",
@@ -645,7 +645,7 @@
     "\n",
     "- **`metadata_columns=[\"name\", \"category\", \"price_usd\", \"quantity\", \"sku\", \"image_url\"]`**: These columns are treated as metadata for each product. Metadata provides additional information about a product, such as its name, category, price, quantity available, SKU (Stock Keeping Unit), and an image URL. This information is useful for displaying product details in search results or for filtering and categorization.\n",
     "\n",
-    "- **`metadata_json_column=\"metadata\"`**: The `metadata` column can store any additional information about the products in a flexible JSON format. This allows for storing varied and complex data that doesn't fit into the standard columns.\n"
+    "- **`metadata_json_column=\"metadata\"`**: The `metadata` column can store any additional information about the products in a flexible JSON format. This allows for storing varied and complex data that doesn't fit into the standard columns. Note that filtering on fields within the JSON but not in `metadata_columns` will be less efficient.\n"
    ]
   },
   {
diff --git a/langchain_postgres/v2/async_vectorstore.py b/langchain_postgres/v2/async_vectorstore.py
index 8382b3e..302dcbd 100644
--- a/langchain_postgres/v2/async_vectorstore.py
+++ b/langchain_postgres/v2/async_vectorstore.py
@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import copy
+import datetime
 import json
 import uuid
 from typing import Any, Callable, Iterable, Optional, Sequence
@@ -54,6 +55,16 @@
     .union(SPECIAL_CASED_OPERATORS)
 )
 
+PYTHON_TO_POSTGRES_TYPE_MAP = {
+    int: "INTEGER",
+    float: "FLOAT",
+    str: "TEXT",
+    bool: "BOOLEAN",
+    datetime.date: "DATE",
+    datetime.datetime: "TIMESTAMP",
+    datetime.time: "TIME",
+}
+
 
 class AsyncPGVectorStore(VectorStore):
     """Postgres Vector Store class"""
@@ -1096,19 +1107,33 @@ def _handle_field_filter(
             operator = "$eq"
             filter_value = value
 
+        field_selector = field
+        if self.metadata_json_column is not None and field not in self.metadata_columns and field not in (
+            self.id_column,
+            self.content_column,
+            self.embedding_column
+        ):
+            filter_value_type = type(filter_value[0]) if (isinstance(filter_value, list) or isinstance(filter_value, tuple)) else type(filter_value)
+            postgres_type = PYTHON_TO_POSTGRES_TYPE_MAP.get(filter_value_type)
+            if postgres_type is None:
+                raise ValueError(f"Unsupported type: {filter_value_type}")
+            field_selector = f"{self.metadata_json_column}->>'{field}'"
+            if postgres_type != "TEXT" and operator != "$exists":
+                field_selector = f"({field_selector})::{postgres_type}"
+
         suffix_id = str(uuid.uuid4()).split("-")[0]
         if operator in COMPARISONS_TO_NATIVE:
             # Then we implement an equality filter
             # native is trusted input
             native = COMPARISONS_TO_NATIVE[operator]
             param_name = f"{field}_{suffix_id}"
-            return f"{field} {native} :{param_name}", {f"{param_name}": filter_value}
+            return f"{field_selector} {native} :{param_name}", {f"{param_name}": filter_value}
         elif operator == "$between":
             # Use AND with two comparisons
             low, high = filter_value
             low_param_name = f"{field}_low_{suffix_id}"
             high_param_name = f"{field}_high_{suffix_id}"
-            return f"({field} BETWEEN :{low_param_name} AND :{high_param_name})", {
+            return f"({field_selector} BETWEEN :{low_param_name} AND :{high_param_name})", {
                 f"{low_param_name}": low,
                 f"{high_param_name}": high,
             }
@@ -1126,18 +1151,18 @@ def _handle_field_filter(
                         )
             param_name = f"{field}_{operator.replace('$', '')}_{suffix_id}"
             if operator == "$in":
-                return f"{field} = ANY(:{param_name})", {f"{param_name}": filter_value}
+                return f"{field_selector} = ANY(:{param_name})", {f"{param_name}": filter_value}
             else:  # i.e. $nin
-                return f"{field} <> ALL (:{param_name})", {
+                return f"{field_selector} <> ALL (:{param_name})", {
                     f"{param_name}": filter_value
                 }
         elif operator in {"$like", "$ilike"}:
             param_name = f"{field}_{operator.replace('$', '')}_{suffix_id}"
             if operator == "$like":
-                return f"({field} LIKE :{param_name})", {f"{param_name}": filter_value}
+                return f"({field_selector} LIKE :{param_name})", {f"{param_name}": filter_value}
             else:  # i.e. $ilike
-                return f"({field} ILIKE :{param_name})", {f"{param_name}": filter_value}
+                return f"({field_selector} ILIKE :{param_name})", {f"{param_name}": filter_value}
         elif operator == "$exists":
             if not isinstance(filter_value, bool):
                 raise ValueError(
@@ -1146,9 +1171,9 @@ def _handle_field_filter(
                 )
             else:
                 if filter_value:
-                    return f"({field} IS NOT NULL)", {}
+                    return f"({field_selector} IS NOT NULL)", {}
                 else:
-                    return f"({field} IS NULL)", {}
+                    return f"({field_selector} IS NULL)", {}
         else:
             raise NotImplementedError()
diff --git a/tests/unit_tests/fixtures/metadata_filtering_data.py b/tests/unit_tests/fixtures/metadata_filtering_data.py
index 8df8c01..684eef4 100644
--- a/tests/unit_tests/fixtures/metadata_filtering_data.py
+++ b/tests/unit_tests/fixtures/metadata_filtering_data.py
@@ -239,6 +239,179 @@
         {"inventory_location": {"$exists": False}},
         ["WB003"],
     ),
+    # JSON metadata filter
+    (
+        {"code_json": "FT004"},
+        ["FT004"],
+    ),
+    (
+        {"name_json": "Smart Fitness Tracker"},
+        ["FT004"],
+    ),
+    (
+        {"is_available_json": True},
+        ["WH001", "FT004", "EC002"],
+    ),
+    (
+        {"code_json": "WH001", "is_available_json": True},
+        ["WH001"],
+    ),
+    (
+        {"available_quantity_json": {"$eq": 10}},
+        ["EC002"],
+    ),
+    (
+        {"available_quantity_json": {"$ne": 0}},
+        ["WH001", "FT004", "EC002"],
+    ),
+    (
+        {"available_quantity_json": {"$gt": 60}},
+        ["FT004"],
+    ),
+    (
+        {"available_quantity_json": {"$gte": 50}},
+        ["WH001", "FT004"],
+    ),
+    (
+        {"available_quantity_json": {"$lt": 5}},
+        ["WB003"],
+    ),
+    (
+        {"available_quantity_json": {"$lte": 10}},
+        ["WB003", "EC002"],
+    ),
+    (
+        {"code_json": {"$eq": "WH001"}},
+        ["WH001"],
+    ),
+    (
+        {"code_json": {"$ne": "WB003"}},
+        ["WH001", "FT004", "EC002"],
+    ),
+    (
+        {"name_json": {"$gt": "Wireless Headphones"}},
+        [],
+    ),
+    (
+        {"name_json": {"$gte": "Wireless Headphones"}},
+        ["WH001"],
+    ),
+    (
+        {"name_json": {"$lt": "Smart Fitness Tracker"}},
+        ["EC002"],
+    ),
+    (
+        {"name_json": {"$lte": "Smart Fitness Tracker"}},
+        ["FT004", "EC002"],
+    ),
+    (
+        {"is_available_json": {"$eq": True}},
+        ["WH001", "FT004", "EC002"],
+    ),
+    (
+        {"is_available_json": {"$ne": True}},
+        ["WB003"],
+    ),
+    (
+        {"price_json": {"$gt": 200.0}},
+        ["EC002"],
+    ),
+    (
+        {"price_json": {"$gte": 149.99}},
+        ["WH001", "EC002"],
+    ),
+    (
+        {"price_json": {"$lt": 50.0}},
+        ["WB003"],
+    ),
+    (
+        {"price_json": {"$lte": 79.95}},
+        ["FT004", "WB003"],
+    ),
+    (
+        {"$or": [{"code_json": "WH001"}, {"code_json": "EC002"}]},
+        ["WH001", "EC002"],
+    ),
+    (
+        {"$or": [{"code_json": "WH001"}, {"available_quantity_json": 10}]},
+        ["WH001", "EC002"],
+    ),
+    (
+        {"$and": [{"code_json": "WH001"}, {"code_json": "EC002"}]},
+        [],
+    ),
+    (
+        {"$not": {"code_json": "WB003"}},
+        ["WH001", "FT004", "EC002"],
+    ),
+    (
+        {"$not": [{"code_json": "WB003"}]},
+        ["WH001", "FT004", "EC002"],
+    ),
+    (
+        {"$not": {"available_quantity_json": 0}},
+        ["WH001", "FT004", "EC002"],
+    ),
+    (
+        {"$not": [{"available_quantity_json": 0}]},
+        ["WH001", "FT004", "EC002"],
+    ),
+    (
+        {"$not": {"is_available_json": True}},
+        ["WB003"],
+    ),
+    (
+        {"$not": [{"is_available_json": True}]},
+        ["WB003"],
+    ),
+    (
+        {"$not": {"price_json": {"$gt": 150.0}}},
+        ["WH001", "FT004", "WB003"],
+    ),
+    (
+        {"$not": [{"price_json": {"$gt": 150.0}}]},
+        ["WH001", "FT004", "WB003"],
+    ),
+    (
+        {"available_quantity_json": {"$between": (40, 60)}},
+        ["WH001"],
+    ),
+    (
+        {"name_json": {"$in": ["Smart Fitness Tracker", "Stainless Steel Water Bottle"]}},
+        ["FT004", "WB003"],
+    ),
+    (
+        {"available_quantity_json": {"$in": [0, 10]}},
+        ["WB003", "EC002"],
+    ),
+    (
+        {"name_json": {"$nin": ["Smart Fitness Tracker", "Stainless Steel Water Bottle"]}},
+        ["WH001", "EC002"],
+    ),
+    (
+        {"available_quantity_json": {"$nin": [50, 0, 10]}},
+        ["FT004"],
+    ),
+    (
+        {"name_json": {"$like": "Wireless%"}},
+        ["WH001"],
+    ),
+    (
+        {"name_json": {"$like": "%less%"}},
+        ["WH001", "WB003"],
+    ),
+    (
+        {"$or": [{"code_json": {"$like": "WH00%"}}, {"code_json": {"$like": "EC00%"}}]},
+        ["WH001", "EC002"],
+    ),
+    (
+        {"tags_json": {"$exists": False}},
+        [],
+    ),
+    (
+        {"inventory_location_json": {"$exists": False}},
+        ["WB003"],
+    )
 ]
 
 NEGATIVE_TEST_CASES = [
diff --git a/tests/unit_tests/v2/test_async_pg_vectorstore_search.py b/tests/unit_tests/v2/test_async_pg_vectorstore_search.py
index 16c70fd..7211659 100644
--- a/tests/unit_tests/v2/test_async_pg_vectorstore_search.py
+++ b/tests/unit_tests/v2/test_async_pg_vectorstore_search.py
@@ -46,7 +46,12 @@
 embeddings = [embeddings_service.embed_query("foo") for i in range(len(texts))]
 filter_docs = [
-    Document(page_content=texts[i], metadata=METADATAS[i]) for i in range(len(texts))
+    Document(
+        page_content=texts[i],
+        metadata=(
+            METADATAS[i] | {f"{key}_json": value for key, value in METADATAS[i].items()}
+        )
+    ) for i in range(len(texts))
 ]
 
 # Documents designed for hybrid search testing
 hybrid_docs_content = {
@@ -194,7 +199,7 @@ async def vs_custom_filter(
             Column("available_quantity", "INTEGER", nullable=True),
         ],
         id_column="langchain_id",
-        store_metadata=False,
+        store_metadata=True,
     )
 
     vs_custom_filter = await AsyncPGVectorStore.create(
diff --git a/tests/unit_tests/v2/test_pg_vectorstore_search.py b/tests/unit_tests/v2/test_pg_vectorstore_search.py
index 7815a25..0ca690d 100644
--- a/tests/unit_tests/v2/test_pg_vectorstore_search.py
+++ b/tests/unit_tests/v2/test_pg_vectorstore_search.py
@@ -42,7 +42,12 @@
     Document(page_content=texts[i], metadata=metadatas[i]) for i in range(len(texts))
 ]
 filter_docs = [
-    Document(page_content=texts[i], metadata=METADATAS[i]) for i in range(len(texts))
+    Document(
+        page_content=texts[i],
+        metadata=(
+            METADATAS[i] | {f"{key}_json": value for key, value in METADATAS[i].items()}
+        )
+    ) for i in range(len(texts))
 ]
 embeddings = [embeddings_service.embed_query("foo") for i in range(len(texts))]
@@ -141,7 +146,7 @@ async def vs_custom_filter(self, engine: PGEngine) -> AsyncIterator[PGVectorStor
                 Column("available_quantity", "INTEGER", nullable=True),
             ],
             id_column="langchain_id",
-            store_metadata=False,
+            store_metadata=True,
             overwrite_existing=True,
         )
@@ -352,7 +357,7 @@ async def vs_custom_filter_sync(
                 Column("available_quantity", "INTEGER", nullable=True),
             ],
            id_column="langchain_id",
-            store_metadata=False,
+            store_metadata=True,
             overwrite_existing=True,
         )