From 36e4f053709812be648999ac42b38fe96a0ac854 Mon Sep 17 00:00:00 2001 From: Justin Cechmanek Date: Wed, 27 Aug 2025 16:07:19 -0700 Subject: [PATCH 01/12] wip: adding multi vector query class and tests --- redisvl/query/aggregate.py | 160 +++++++++++++++++++++++++++ tests/unit/test_aggregation_types.py | 149 ++++++++++++++++++++++++- 2 files changed, 308 insertions(+), 1 deletion(-) diff --git a/redisvl/query/aggregate.py b/redisvl/query/aggregate.py index fd066bce..0c970402 100644 --- a/redisvl/query/aggregate.py +++ b/redisvl/query/aggregate.py @@ -227,3 +227,163 @@ def _build_query_string(self) -> str: def __str__(self) -> str: """Return the string representation of the query.""" return " ".join([str(x) for x in self.build_args()]) + + +class MultiVectorQuery(AggregationQuery): + """ + MultiVectorQuery allows for search over multiple vector fields in a document simulateously. + The final score will be a weighted combination of the individual vector similarity scores + following the formula: + + score = (w_1 * score_1 + w_2 * score_2 + w_3 * score_3 + ... ) / (w_1 + w_2 + w_3 + ...) + + Vectors may be of different size and datatype. + + .. 
code-block:: python + + from redisvl.query import MultiVectorQuery + from redisvl.index import SearchIndex + + index = SearchIndex.from_yaml("path/to/index.yaml") + + query = MultiVectorQuery( + vectors=[[0.1, 0.2, 0.3], [0.5, 0.5], [0.1, 0.1, 0.1, 0.1]], + vector_field_names=["text_vector", "image_vector", "feature_vector"] + filter_expression=None, + weights=[0.7], + dtypes=["float32", "float32", "float32"], + num_results=10, + return_fields=["field1", "field2"], + dialect=2, + ) + + results = index.query(query) + + + FT.AGGREGATE 'idx:characters' + "@embedding1:[VECTOR_RANGE .7 $vector1]=>{$YIELD_DISTANCE_AS: vector_distance1} | @embedding2:[VECTOR_RANGE 1.0 $vector2]=>{$YIELD_DISTANCE_AS: vector_distance2} | @embedding3:[VECTOR_RANGE 1.7 $vector3]=>{$YIELD_DISTANCE_AS: vector_distance3} | @name:(James)" + ADDSCORES + SCORER BM25STD.NORM + LOAD 2 created_at @embedding + APPLY 'case(exists(@vector_distance1), @vector_distance1, 0.0)' as v1 + APPLY 'case(exists(@vector_distance2), @vector_distance2, 0.0)' as v2 + APPLY 'case(exists(@vector_distance3), @vector_distance3, 0.0)' as v3 + APPLY '(@__score * 0.3 + (@v1 * 0.3) + (@v2 * 1.2) + (@v3 * 0.1))' AS final_score + PARAMS 6 vector1 "\xe4\xd6..." vector2 "\x89\xa0..." vector3 "\x3c\x19..." + SORTBY 2 @final_score DESC + DIALECT 2 + LIMIT 0 100 + + + """ + + DISTANCE_ID: str = "vector_distance" + VECTOR_PARAM: str = "vector" + + def __init__( + self, + vectors: Union[bytes, List[bytes], List[float], List[List[float]]], + vector_field_names: Union[str, List[str]], + filter_expression: Optional[Union[str, FilterExpression]] = None, + weights: Union[float, List[float]] = 1.0, + dtypes: Union[str, List[str]] = "float32", + num_results: int = 10, + return_fields: Optional[List[str]] = None, + dialect: int = 2, + ): + """ + Instantiates a MultiVectorQuery object. + + Args: + vectors (Union[bytes, List[bytes], List[float], List[List[float]]): The vectors to perform vector similarity search. 
+ vector_field_names (str): The vector field names to search in. + filter_expression (Optional[FilterExpression], optional): The filter expression to use. + Defaults to None. + weights (Union[float, List[float]], optional): The weights of the vector similarity. + Documents will be scored as: + score = (w1) * score1 + (w2) * score2 + (w3) * score3 + ... + Defaults to 1.0, which corresponds to equal weighting + dtype (Union[str, List[str]] optional): The data types of the vectors. Defaults to "float32" for all vectors. + num_results (int, optional): The number of results to return. Defaults to 10. + return_fields (Optional[List[str]], optional): The fields to return. Defaults to None. + dialect (int, optional): The Redis dialect version. Defaults to 2. + + Raises: + ValueError: The number of vectors, vector field names, and weights do not agree. + TypeError: If the stopwords are not a set, list, or tuple of strings. + """ + + self._vectors = vectors + self._vector_fields = vector_field_names + self._filter_expression = filter_expression + self._weights = weights + self._dtypes = dtypes + self._num_results = num_results + + query_string = self._build_query_string() + super().__init__(query_string) + + self.scorer(text_scorer) + self.add_scores() + self.apply( + vector_similarity=f"(2 - @{self.DISTANCE_ID})/2", text_score="@__score" + ) + self.apply(hybrid_score=f"{1-alpha}*@text_score + {alpha}*@vector_similarity") + self.sort_by(Desc("@hybrid_score"), max=num_results) # type: ignore + self.dialect(dialect) + if return_fields: + self.load(*return_fields) # type: ignore[arg-type] + + @property + def params(self) -> Dict[str, Any]: + """Return the parameters for the aggregation. + + Returns: + Dict[str, Any]: The parameters for the aggregation. 
+ """ + if isinstance(self._vector, list): + vector = array_to_buffer(self._vector, dtype=self._dtype) + else: + vector = self._vector + + params = {self.VECTOR_PARAM: vector} + + return params + + def _tokenize_and_escape_query(self, user_query: str) -> str: + """Convert a raw user query to a redis full text query joined by ORs + Args: + user_query (str): The user query to tokenize and escape. + + Returns: + str: The tokenized and escaped query string. + Raises: + ValueError: If the text string becomes empty after stopwords are removed. + """ + escaper = TokenEscaper() + + tokens = [ + escaper.escape( + token.strip().strip(",").replace("“", "").replace("”", "").lower() + ) + for token in user_query.split() + ] + tokenized = " | ".join( + [token for token in tokens if token and token not in self._stopwords] + ) + + if not tokenized: + raise ValueError("text string cannot be empty after removing stopwords") + return tokenized + + def _build_query_string(self) -> str: + """Build the full query string for text search with optional filtering.""" + if isinstance(self._filter_expression, FilterExpression): + filter_expression = str(self._filter_expression) + else: + filter_expression = "" + + # base KNN query + knn_query = f"KNN {self._num_results} @{self._vector_field} ${self.VECTOR_PARAM} AS {self.DISTANCE_ID}" + + return f"{filter_expression})=>[{knn_query}]" diff --git a/tests/unit/test_aggregation_types.py b/tests/unit/test_aggregation_types.py index a13e87f5..5e2f2493 100644 --- a/tests/unit/test_aggregation_types.py +++ b/tests/unit/test_aggregation_types.py @@ -4,13 +4,17 @@ from redis.commands.search.result import Result from redisvl.index.index import process_results -from redisvl.query.aggregate import HybridQuery +from redisvl.query.aggregate import HybridQuery, MultiVectorQuery from redisvl.query.filter import Tag # Sample data for testing sample_vector = [0.1, 0.2, 0.3, 0.4] sample_text = "the toon squad play basketball against a gang of aliens" 
+sample_vector_2 = [0.1, 0.2, 0.3, 0.4] +sample_vector_3 = [0.5, 0.5] +sample_vector_4 = [0.1, 0.1, 0.1] + # Test Cases def test_aggregate_hybrid_query(): @@ -190,3 +194,146 @@ def test_hybrid_query_with_string_filter(): query_string_wildcard = str(hybrid_query_wildcard) assert f"@{text_field_name}:(search | document | 12345)" in query_string_wildcard assert "AND" not in query_string_wildcard + + +def test_aggregate_multi_vector_query(): + # test we require vectors and field names + with pytest.raises(ValueError): + _ = MultiVectorQuery() + + with pytest.raises(ValueError): + _ = MultiVectorQuery(vectors=[sample_vector], vector_field_names=[]) + + with pytest.raises(ValueError): + _ = MultiVectorQuery(vectors=[], vector_field_names=["field 1"]) + + # test we can initialize with a single vector and single field name + multivector_query = MultiVectorQuery( + vectors=[sample_vector], vector_field_names=["field 1"] + ) + assert query.query is not None + + # check default properties + assert multivector_query._vectors == [sample_vector] + assert multivector_query._vector_field_names == ["field 1"] + assert multivector_query._filter_expression == None + assert multivector_query._weights == 1.0 + assert multivector_query._num_results == 10 + assert multivector_query._loadfields == [] + assert multivector_query._dialect == 2 + + # test we can initialize with mutliple vectors and field names + multivector_query = MultiVectorQuery( + vectors=[sample_vector, sample_vector_2, sample_vector_3, sample_vector_4], + vector_field_names=["field 1", "field 2", "field 3", "field 4"], + weights=[0.2, 0.5, 0.6, 0, 1], + dtypes=[], + ) + + assert len(multivector_query._vectors) == 4 + assert len(multivector_query._vector_field_names) == 4 + assert len(multivector_query._weights) == 4 + + # test defaults can be overwritten + multivector_query = MultiVectorQuery( + vectors=[sample_vector, sample_vector_2, sample_vector_3, sample_vector_4], + vector_field_names=["field 1", "field 2", "field 
3", "field 4"], + filter_expression=(Tag("user group") == ["group A", "group C"]), + weights=[0.2, 0.5, 0.6, 0, 1], + dtypes=["float32", "float32", "float64", "bfloat16"], + num_results=5, + return_fields=["field 1", "user name", "address"], + dialect=4, + ) + + assert multivector_query._vectors == [ + sample_vector, + sample_vector_2, + sample_vector_3, + sample_vector_4, + ] + assert multivector_query._vector_field_names == [ + "field 1", + "field 2", + "field 3", + "field 4", + ] + assert multivector_query._weights == [0.2, 0.5, 0.6, 0, 1] + assert multivector_query._filter_expression == Tag("user group") + assert multivector_query._num_results == 5 + assert multivector_query._loadfields == ["field 1", "user name", "address"] + assert multivector_query._dialect == 4 + + +def test_aggregate_multi_vector_query_broadcasting(): + # if a single vector and multiple fields is passed we search with the same vector over all fields + multivector_query = MultiVectorQuery( + vectors=[sample_vector], + vector_field_names=["text embedding", "image embedding"], + ) + assert multi_vector_query.query == "" + + # vector being broadcast doesn't need to be in a list + multivector_query = MultiVectorQuery( + vectors=sample_vector, vector_field_names=["text embedding", "image embedding"] + ) + assert multi_vector_query.query == "" + + # if multiple vectors are passed and a single field name we search with all vectors on that field + multivector_query = MultiVectorQuery( + vectors=[sample_vector_2, sample_vector_3], + vector_field_names=["text embedding"], + ) + assert multi_vector_query.query == "" + + # vector field name does not need to be in a list if only one is provided + multivector_query = MultiVectorQuery( + vectors=[sample_vector_2, sample_vector_3], vector_field_names="text embedding" + ) + assert multi_vector_query.query == "" + + # if a single weight is passed it is applied to all similarity scores + multivector_query = MultiVectorQuery( + vectors=[sample_vector_2, 
sample_vector_3], + vector_field_names=["text embedding", "image embedding"], + weights=[0.2], + ) + assert multi_vector_query.query == "" + + # weight does not need to be in a list if only one is provided + multivector_query = MultiVectorQuery( + vectors=[sample_vector_2, sample_vector_3], + vector_field_names=["text embedding", "image embedding"], + weights=0.2, + ) + assert multi_vector_query.query == "" + + +def test_aggregate_multi_vector_query_errors(): + # test an error is raised if the number of vectors and number of fields don't match + with pytest.raises(ValueError): + _ = MultiVectorQuery( + vectors=[sample_vector, sample_vector_2, sample_vector_3], + vector_field_names=["text embedding", "image embedding"], + ) + + with pytest.raises(ValueError): + _ = MultiVectorQuery( + vectors=[sample_vector, sample_vector_2], + vector_field_names=["text embedding", "image embedding", "features"], + ) + + # test an error is raised if the number of weights is incorrect + with pytest.raises(ValueError): + _ = MultiVectorQuery( + vectors=[sample_vector, sample_vector_2], + vector_field_names=["text embedding", "image embedding"], + weights=[0.1, 0.2, 0.3], + ) + + # test an error is raised if none of the field names are present + with pytest.raises(ValueError): + _ = MultiVectorQuery( + vectors=[], + vector_field_names=[], + ) From d0dff0bb0f6472d647e3f066e7bead036f965a7c Mon Sep 17 00:00:00 2001 From: Justin Cechmanek Date: Wed, 24 Sep 2025 15:09:04 -0700 Subject: [PATCH 02/12] wip: working multivector query --- redisvl/query/aggregate.py | 193 ++++++++++++++++++++++++------------- 1 file changed, 126 insertions(+), 67 deletions(-) diff --git a/redisvl/query/aggregate.py b/redisvl/query/aggregate.py index 0c970402..3131b28d 100644 --- a/redisvl/query/aggregate.py +++ b/redisvl/query/aggregate.py @@ -260,35 +260,57 @@ class MultiVectorQuery(AggregationQuery): results = index.query(query) + + FT.AGGREGATE multi_vector_test + "@user_embedding:[VECTOR_RANGE 2.0 
$vector_0]=>{$YIELD_DISTANCE_AS: distance_0} + | @image_embedding:[VECTOR_RANGE 2.0 $vector_1]=>{$YIELD_DISTANCE_AS: distance_1}" + PARAMS 4 + vector_0 "\xcd\xcc\xcc=\xcd\xcc\xcc=\x00\x00\x00?" + vector_1 "\x9a\x99\x99\x99\x99\x99\xb9?\x9a\x99\x99\x99\x99\x99\xb9?\x9a\x99\x99\x99\x99\x99\xb9?\x9a\x99\x99\x99\x99\x99\xb9?\x9a\x99\x99\x99\x99\x99\xb9?" + APPLY "(2 - @distance_0)/2" AS score_0 + APPLY "(2 - @distance_1)/2" AS score_1 + DIALECT 2 + APPLY "(@score_0 + @score_1)" AS combined_score + SORTBY 2 @combined_score + ASC + MAX 10 + LOAD 2 score_0 score_1 + + + + + FT.AGGREGATE 'idx:characters' - "@embedding1:[VECTOR_RANGE .7 $vector1]=>{$YIELD_DISTANCE_AS: vector_distance1} | @embedding2:[VECTOR_RANGE 1.0 $vector2]=>{$YIELD_DISTANCE_AS: vector_distance2} | @embedding3:[VECTOR_RANGE 1.7 $vector3]=>{$YIELD_DISTANCE_AS: vector_distance3} | @name:(James)" - ADDSCORES - SCORER BM25STD.NORM - LOAD 2 created_at @embedding - APPLY 'case(exists(@vector_distance1), @vector_distance1, 0.0)' as v1 - APPLY 'case(exists(@vector_distance2), @vector_distance2, 0.0)' as v2 - APPLY 'case(exists(@vector_distance3), @vector_distance3, 0.0)' as v3 + "@embedding1:[VECTOR_RANGE .7 $vector1]=>{$YIELD_DISTANCE_AS: vector_distance1} + | @embedding2:[VECTOR_RANGE 1.0 $vector2]=>{$YIELD_DISTANCE_AS: vector_distance2} + | @embedding3:[VECTOR_RANGE 1.7 $vector3]=>{$YIELD_DISTANCE_AS: vector_distance3} + | @name:(James)" + ### ADDSCORES + ### SCORER BM25STD.NORM + ### LOAD 2 created_at @embedding + APPLY '(2 - @vector_distance1)/2' as v1 + APPLY '(2 - @vector_distance2)/2' as v2 + APPLY '(2 - @vector_distance3)/2' as v3 APPLY '(@__score * 0.3 + (@v1 * 0.3) + (@v2 * 1.2) + (@v3 * 0.1))' AS final_score PARAMS 6 vector1 "\xe4\xd6..." vector2 "\x89\xa0..." vector3 "\x3c\x19..." 
SORTBY 2 @final_score DESC DIALECT 2 LIMIT 0 100 - """ DISTANCE_ID: str = "vector_distance" - VECTOR_PARAM: str = "vector" def __init__( self, vectors: Union[bytes, List[bytes], List[float], List[List[float]]], vector_field_names: Union[str, List[str]], + weights: List[float] = [1.0], + return_fields: Optional[List[str]] = None, filter_expression: Optional[Union[str, FilterExpression]] = None, - weights: Union[float, List[float]] = 1.0, - dtypes: Union[str, List[str]] = "float32", + dtypes: List[str] = ["float32"], num_results: int = 10, - return_fields: Optional[List[str]] = None, + return_score: bool = False, dialect: int = 2, ): """ @@ -296,40 +318,81 @@ def __init__( Args: vectors (Union[bytes, List[bytes], List[float], List[List[float]]): The vectors to perform vector similarity search. - vector_field_names (str): The vector field names to search in. - filter_expression (Optional[FilterExpression], optional): The filter expression to use. - Defaults to None. - weights (Union[float, List[float]], optional): The weights of the vector similarity. + vector_field_names (Union[str, List[str]]): The vector field names to search in. + weights (List[float]): The weights of the vector similarity. Documents will be scored as: score = (w1) * score1 + (w2) * score2 + (w3) * score3 + ... - Defaults to 1.0, which corresponds to equal weighting - dtype (Union[str, List[str]] optional): The data types of the vectors. Defaults to "float32" for all vectors. - num_results (int, optional): The number of results to return. Defaults to 10. + Defaults to [1.0], which corresponds to equal weighting return_fields (Optional[List[str]], optional): The fields to return. Defaults to None. + filter_expression (Optional[Union[str, FilterExpression]]): The filter expression to use. + Defaults to None. + dtypes (List[str]): The data types of the vectors. Defaults to ["float32"] for all vectors. + num_results (int, optional): The number of results to return. Defaults to 10. 
+ return_score (bool): Whether to return the combined vector similarity score. + Defaults to False. dialect (int, optional): The Redis dialect version. Defaults to 2. Raises: ValueError: The number of vectors, vector field names, and weights do not agree. - TypeError: If the stopwords are not a set, list, or tuple of strings. """ - self._vectors = vectors - self._vector_fields = vector_field_names self._filter_expression = filter_expression - self._weights = weights self._dtypes = dtypes self._num_results = num_results + if len(vectors) == 0 or len(vector_field_names) == 0 or len(weights) == 0: + raise ValueError( + f"""The number of vectors and vector field names must be equal. + If weights are specified their number must match the number of vectors and vector field names also. + Length of vectors list: {len(vectors) = } + Length of vector_field_names list: {len(vector_field_names) = } + Length of weights list: {len(weights) = } + """ + ) + + if isinstance(vectors, bytes) or isinstance(vectors[0], float): + self._vectors = [vectors] + else: + self._vectors = vectors + if isinstance(vector_field_names, str): + self._vector_field_names = [vector_field_names] + else: + self._vector_field_names = vector_field_names + if len(weights) == 1: + self._weights = weights * len(vectors) + else: + self._weights = weights + if len(dtypes) == 1: + self._dtypes = dtypes * len(vectors) + else: + self._dtypes = dtypes + + if (len(self._vectors) != len(self._vector_field_names)) or ( + len(self._vectors) != len(self._weights) + ): + raise ValueError( + f"""The number of vectors and vector field names must be equal. + If weights are specified their number must match the number of vectors and vector field names also. 
+ Length of vectors list: {len(self._vectors) = } + Length of vector_field_names list: {len(self._vector_field_names) = } + Length of weights list: {len(self._weights) = } + """ + ) + query_string = self._build_query_string() super().__init__(query_string) - self.scorer(text_scorer) - self.add_scores() - self.apply( - vector_similarity=f"(2 - @{self.DISTANCE_ID})/2", text_score="@__score" - ) - self.apply(hybrid_score=f"{1-alpha}*@text_score + {alpha}*@vector_similarity") - self.sort_by(Desc("@hybrid_score"), max=num_results) # type: ignore + # construct the scoring string based on the vector similarity scores and weights + combined_scores = [] + for i, w in enumerate(self._weights): + combined_scores.append(f"@score_{i} * {w}") + combined_score_string = " + ".join(combined_scores) + combined_score_string = f"'({combined_score_string})'" + + self.apply(combined_score=combined_score_string) + + # self.add_scores() + self.sort_by(Desc("@combined_score"), max=num_results) # type: ignore self.dialect(dialect) if return_fields: self.load(*return_fields) # type: ignore[arg-type] @@ -341,49 +404,45 @@ def params(self) -> Dict[str, Any]: Returns: Dict[str, Any]: The parameters for the aggregation. """ - if isinstance(self._vector, list): - vector = array_to_buffer(self._vector, dtype=self._dtype) - else: - vector = self._vector - - params = {self.VECTOR_PARAM: vector} - + params = {} + for i, (vector, vector_field, dtype) in enumerate(zip( + self._vectors, self._vector_field_names, self._dtypes + )): + if isinstance(vector, list): + vector = array_to_buffer(vector, dtype=dtype) + params[f"vector_{i}"] = vector return params - def _tokenize_and_escape_query(self, user_query: str) -> str: - """Convert a raw user query to a redis full text query joined by ORs - Args: - user_query (str): The user query to tokenize and escape. - - Returns: - str: The tokenized and escaped query string. - Raises: - ValueError: If the text string becomes empty after stopwords are removed. 
- """ - escaper = TokenEscaper() - - tokens = [ - escaper.escape( - token.strip().strip(",").replace("“", "").replace("”", "").lower() - ) - for token in user_query.split() - ] - tokenized = " | ".join( - [token for token in tokens if token and token not in self._stopwords] - ) - - if not tokenized: - raise ValueError("text string cannot be empty after removing stopwords") - return tokenized - def _build_query_string(self) -> str: """Build the full query string for text search with optional filtering.""" + + filter_expression = self._filter_expression if isinstance(self._filter_expression, FilterExpression): filter_expression = str(self._filter_expression) - else: - filter_expression = "" # base KNN query - knn_query = f"KNN {self._num_results} @{self._vector_field} ${self.VECTOR_PARAM} AS {self.DISTANCE_ID}" + knn_queries = [] + range_queries = [] + for i, (vector, field) in enumerate(zip(self._vectors, self._vector_field_names)): + knn_queries.append(f"[KNN {self._num_results} @{field} $vector_{i} AS distance_{i}]") + range_queries.append(f"@{field}:[VECTOR_RANGE 2.0 $vector_{i}]=>{{$YIELD_DISTANCE_AS: distance_{i}}}") + + knn_query = " | ".join(knn_queries) ## knn_queries format doesn't work + knn_query = " | ".join(range_queries) + + # calculate the respective vector similarities + apply_string = "" + for i, (vector, field_name, weight) in enumerate( + zip(self._vectors, self._vector_field_names, self._weights) + ): + apply_string += f'APPLY "(2 - @distance_{i})/2" AS score_{i} ' - return f"{filter_expression})=>[{knn_query}]" + return ( + f"{knn_query} {filter_expression} {apply_string}" + if filter_expression + else f"{knn_query} {apply_string}" + ) + + def __str__(self) -> str: + """Return the string representation of the query.""" + return " ".join([str(x) for x in self.build_args()]) From 7275f708cbdbcab98b04f60de566339b7aafc4a7 Mon Sep 17 00:00:00 2001 From: Justin Cechmanek Date: Fri, 26 Sep 2025 16:58:55 -0700 Subject: [PATCH 03/12] working multivector 
query class and tests --- redisvl/query/__init__.py | 3 +- redisvl/query/aggregate.py | 165 ++++++-------- tests/conftest.py | 173 ++++++++++++++ tests/integration/test_aggregation.py | 309 +++++++++++++++++++++++++- tests/unit/test_aggregation_types.py | 42 ++-- 5 files changed, 559 insertions(+), 133 deletions(-) diff --git a/redisvl/query/__init__.py b/redisvl/query/__init__.py index 30d35562..67c29d2b 100644 --- a/redisvl/query/__init__.py +++ b/redisvl/query/__init__.py @@ -1,4 +1,4 @@ -from redisvl.query.aggregate import AggregationQuery, HybridQuery +from redisvl.query.aggregate import AggregationQuery, HybridQuery, MultiVectorQuery from redisvl.query.query import ( BaseQuery, BaseVectorQuery, @@ -21,4 +21,5 @@ "TextQuery", "AggregationQuery", "HybridQuery", + "MultiVectorQuery", ] diff --git a/redisvl/query/aggregate.py b/redisvl/query/aggregate.py index 3131b28d..d0a4273b 100644 --- a/redisvl/query/aggregate.py +++ b/redisvl/query/aggregate.py @@ -231,76 +231,35 @@ def __str__(self) -> str: class MultiVectorQuery(AggregationQuery): """ - MultiVectorQuery allows for search over multiple vector fields in a document simulateously. - The final score will be a weighted combination of the individual vector similarity scores - following the formula: + MultiVectorQuery allows for search over multiple vector fields in a document simulateously. + The final score will be a weighted combination of the individual vector similarity scores + following the formula: - score = (w_1 * score_1 + w_2 * score_2 + w_3 * score_3 + ... ) / (w_1 + w_2 + w_3 + ...) - - Vectors may be of different size and datatype. - - .. 
code-block:: python - - from redisvl.query import MultiVectorQuery - from redisvl.index import SearchIndex - - index = SearchIndex.from_yaml("path/to/index.yaml") - - query = MultiVectorQuery( - vectors=[[0.1, 0.2, 0.3], [0.5, 0.5], [0.1, 0.1, 0.1, 0.1]], - vector_field_names=["text_vector", "image_vector", "feature_vector"] - filter_expression=None, - weights=[0.7], - dtypes=["float32", "float32", "float32"], - num_results=10, - return_fields=["field1", "field2"], - dialect=2, - ) - - results = index.query(query) - - - - FT.AGGREGATE multi_vector_test - "@user_embedding:[VECTOR_RANGE 2.0 $vector_0]=>{$YIELD_DISTANCE_AS: distance_0} - | @image_embedding:[VECTOR_RANGE 2.0 $vector_1]=>{$YIELD_DISTANCE_AS: distance_1}" - PARAMS 4 - vector_0 "\xcd\xcc\xcc=\xcd\xcc\xcc=\x00\x00\x00?" - vector_1 "\x9a\x99\x99\x99\x99\x99\xb9?\x9a\x99\x99\x99\x99\x99\xb9?\x9a\x99\x99\x99\x99\x99\xb9?\x9a\x99\x99\x99\x99\x99\xb9?\x9a\x99\x99\x99\x99\x99\xb9?" - APPLY "(2 - @distance_0)/2" AS score_0 - APPLY "(2 - @distance_1)/2" AS score_1 - DIALECT 2 - APPLY "(@score_0 + @score_1)" AS combined_score - SORTBY 2 @combined_score - ASC - MAX 10 - LOAD 2 score_0 score_1 + score = (w_1 * score_1 + w_2 * score_2 + w_3 * score_3 + ... ) + Vectors may be of different size and datatype, but must be indexed using the 'cosine' distance_metric. + .. 
code-block:: python + from redisvl.query import MultiVectorQuery + from redisvl.index import SearchIndex + index = SearchIndex.from_yaml("path/to/index.yaml") - FT.AGGREGATE 'idx:characters' - "@embedding1:[VECTOR_RANGE .7 $vector1]=>{$YIELD_DISTANCE_AS: vector_distance1} - | @embedding2:[VECTOR_RANGE 1.0 $vector2]=>{$YIELD_DISTANCE_AS: vector_distance2} - | @embedding3:[VECTOR_RANGE 1.7 $vector3]=>{$YIELD_DISTANCE_AS: vector_distance3} - | @name:(James)" - ### ADDSCORES - ### SCORER BM25STD.NORM - ### LOAD 2 created_at @embedding - APPLY '(2 - @vector_distance1)/2' as v1 - APPLY '(2 - @vector_distance2)/2' as v2 - APPLY '(2 - @vector_distance3)/2' as v3 - APPLY '(@__score * 0.3 + (@v1 * 0.3) + (@v2 * 1.2) + (@v3 * 0.1))' AS final_score - PARAMS 6 vector1 "\xe4\xd6..." vector2 "\x89\xa0..." vector3 "\x3c\x19..." - SORTBY 2 @final_score DESC - DIALECT 2 - LIMIT 0 100 + query = MultiVectorQuery( + vectors=[[0.1, 0.2, 0.3], [0.5, 0.5], [0.1, 0.1, 0.1, 0.1]], + vector_field_names=["text_vector", "image_vector", "feature_vector"] + filter_expression=None, + weights=[0.7, 0.2, 0.5], + dtypes=["float32", "bfloat16", "float64"], + num_results=10, + return_fields=["field1", "field2"], + dialect=2, + ) + results = index.query(query) """ - DISTANCE_ID: str = "vector_distance" - def __init__( self, vectors: Union[bytes, List[bytes], List[float], List[List[float]]], @@ -340,58 +299,69 @@ def __init__( self._dtypes = dtypes self._num_results = num_results - if len(vectors) == 0 or len(vector_field_names) == 0 or len(weights) == 0: + if any([len(x) == 0 for x in [vectors, vector_field_names, weights, dtypes]]): raise ValueError( f"""The number of vectors and vector field names must be equal. - If weights are specified their number must match the number of vectors and vector field names also. 
- Length of vectors list: {len(vectors) = } - Length of vector_field_names list: {len(vector_field_names) = } - Length of weights list: {len(weights) = } - """ + If weights or dtypes are specified their number must match the number of vectors and vector field names also. + Length of vectors list: {len(vectors) = } + Length of vector_field_names list: {len(vector_field_names) = } + Length of weights list: {len(weights) = } + length of dtypes list: {len(dtypes) = } + """ ) if isinstance(vectors, bytes) or isinstance(vectors[0], float): self._vectors = [vectors] else: - self._vectors = vectors + self._vectors = vectors # type: ignore + if isinstance(vector_field_names, str): self._vector_field_names = [vector_field_names] else: self._vector_field_names = vector_field_names + if len(weights) == 1: self._weights = weights * len(vectors) else: self._weights = weights + if len(dtypes) == 1: self._dtypes = dtypes * len(vectors) else: self._dtypes = dtypes - if (len(self._vectors) != len(self._vector_field_names)) or ( - len(self._vectors) != len(self._weights) + num_vectors = len(self._vectors) + if any( + [ + len(x) != num_vectors # type: ignore + for x in [self._vector_field_names, self._weights, self._dtypes] + ] ): raise ValueError( f"""The number of vectors and vector field names must be equal. - If weights are specified their number must match the number of vectors and vector field names also. - Length of vectors list: {len(self._vectors) = } - Length of vector_field_names list: {len(self._vector_field_names) = } - Length of weights list: {len(self._weights) = } - """ + If weights or dtypes are specified their number must match the number of vectors and vector field names also. 
+ Length of vectors list: {len(self._vectors) = } + Length of vector_field_names list: {len(self._vector_field_names) = } + Length of weights list: {len(self._weights) = } + Length of dtypes list: {len(self._dtypes) = } + """ ) query_string = self._build_query_string() super().__init__(query_string) + # calculate the respective vector similarities + for i in range(len(vectors)): + self.apply(**{f"score_{i}": f"(2 - @distance_{i})/2"}) + # construct the scoring string based on the vector similarity scores and weights combined_scores = [] for i, w in enumerate(self._weights): combined_scores.append(f"@score_{i} * {w}") combined_score_string = " + ".join(combined_scores) - combined_score_string = f"'({combined_score_string})'" self.apply(combined_score=combined_score_string) - # self.add_scores() self.sort_by(Desc("@combined_score"), max=num_results) # type: ignore self.dialect(dialect) if return_fields: @@ -405,43 +375,34 @@ def params(self) -> Dict[str, Any]: Dict[str, Any]: The parameters for the aggregation. 
""" params = {} - for i, (vector, vector_field, dtype) in enumerate(zip( - self._vectors, self._vector_field_names, self._dtypes - )): + for i, (vector, dtype) in enumerate(zip(self._vectors, self._dtypes)): if isinstance(vector, list): - vector = array_to_buffer(vector, dtype=dtype) + vector = array_to_buffer(vector, dtype=dtype) # type: ignore params[f"vector_{i}"] = vector return params def _build_query_string(self) -> str: """Build the full query string for text search with optional filtering.""" - filter_expression = self._filter_expression - if isinstance(self._filter_expression, FilterExpression): - filter_expression = str(self._filter_expression) - # base KNN query - knn_queries = [] range_queries = [] - for i, (vector, field) in enumerate(zip(self._vectors, self._vector_field_names)): - knn_queries.append(f"[KNN {self._num_results} @{field} $vector_{i} AS distance_{i}]") - range_queries.append(f"@{field}:[VECTOR_RANGE 2.0 $vector_{i}]=>{{$YIELD_DISTANCE_AS: distance_{i}}}") + for i, (vector, field) in enumerate( + zip(self._vectors, self._vector_field_names) + ): + range_queries.append( + f"@{field}:[VECTOR_RANGE 2.0 $vector_{i}]=>{{$YIELD_DISTANCE_AS: distance_{i}}}" + ) - knn_query = " | ".join(knn_queries) ## knn_queries format doesn't work - knn_query = " | ".join(range_queries) + range_query = " | ".join(range_queries) - # calculate the respective vector similarities - apply_string = "" - for i, (vector, field_name, weight) in enumerate( - zip(self._vectors, self._vector_field_names, self._weights) - ): - apply_string += f'APPLY "(2 - @distance_{i})/2" AS score_{i} ' + filter_expression = self._filter_expression + if isinstance(self._filter_expression, FilterExpression): + filter_expression = str(self._filter_expression) - return ( - f"{knn_query} {filter_expression} {apply_string}" - if filter_expression - else f"{knn_query} {apply_string}" - ) + if filter_expression: + return f"({range_query}) AND ({filter_expression})" + else: + return 
f"{range_query}" def __str__(self) -> str: """Return the string representation of the query.""" diff --git a/tests/conftest.py b/tests/conftest.py index b6b27746..7fc9dd9c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -233,6 +233,89 @@ def sample_datetimes(): } +@pytest.fixture +def OG(sample_datetimes): + return [ + { + "user": "john", + "age": 18, + "job": "engineer", + "description": "engineers conduct trains that ride on train tracks", + "last_updated": sample_datetimes["low"].timestamp(), + "credit_score": "high", + "location": "-122.4194,37.7749", + "user_embedding": [0.1, 0.1, 0.5], + "image_embedding": [0.1, 0.1, 0.1, 0.1, 0.1], + }, + { + "user": "mary", + "age": 14, + "job": "doctor", + "description": "a medical professional who treats diseases and helps people stay healthy", + "last_updated": sample_datetimes["low"].timestamp(), + "credit_score": "low", + "location": "-122.4194,37.7749", + "user_embedding": [0.1, 0.1, 0.5], + "image_embedding": [0.1, 0.2, 0.3, 0.4, 0.5], + }, + { + "user": "nancy", + "age": 94, + "job": "doctor", + "description": "a research scientist specializing in cancers and diseases of the lungs", + "last_updated": sample_datetimes["mid"].timestamp(), + "credit_score": "high", + "location": "-122.4194,37.7749", + "user_embedding": [0.7, 0.1, 0.5], + "image_embedding": [0.1, 0.1, 0.3, 0.3, 0.5], + }, + { + "user": "tyler", + "age": 100, + "job": "engineer", + "description": "a software developer with expertise in mathematics and computer science", + "last_updated": sample_datetimes["mid"].timestamp(), + "credit_score": "high", + "location": "-110.0839,37.3861", + "user_embedding": [0.1, 0.4, 0.5], + "image_embedding": [-0.1, -0.2, -0.3, -0.4, -0.5], + }, + { + "user": "tim", + "age": 12, + "job": "dermatologist", + "description": "a medical professional specializing in diseases of the skin", + "last_updated": sample_datetimes["mid"].timestamp(), + "credit_score": "high", + "location": "-110.0839,37.3861", + 
"user_embedding": [0.4, 0.4, 0.5], + "image_embedding": [-0.1, 0.0, 0.6, 0.0, -0.9], + }, + { + "user": "taimur", + "age": 15, + "job": "CEO", + "description": "high stress, but financially rewarding position at the head of a company", + "last_updated": sample_datetimes["high"].timestamp(), + "credit_score": "low", + "location": "-110.0839,37.3861", + "user_embedding": [0.6, 0.1, 0.5], + "image_embedding": [1.1, 1.2, -0.3, -4.1, 5.0], + }, + { + "user": "joe", + "age": 35, + "job": "dentist", + "description": "like the tooth fairy because they'll take your teeth, but you have to pay them!", + "last_updated": sample_datetimes["high"].timestamp(), + "credit_score": "medium", + "location": "-110.0839,37.3861", + "user_embedding": [-0.1, -0.1, -0.5], + "image_embedding": [-0.8, 2.0, 3.1, 1.5, -1.6], + }, + ] + + @pytest.fixture def sample_data(sample_datetimes): return [ @@ -309,6 +392,96 @@ def sample_data(sample_datetimes): ] +@pytest.fixture +def multi_vector_data(sample_datetimes): + return [ + { + "user": "john", + "age": 18, + "job": "engineer", + "description": "engineers conduct trains that ride on train tracks", + "last_updated": sample_datetimes["low"].timestamp(), + "credit_score": "high", + "location": "-122.4194,37.7749", + "user_embedding": [0.1, 0.1, 0.5], + "image_embedding": [0.1, 0.1, 0.1, 0.1, 0.1], + "audio_embedding": [34, 18.5, -6.0, -12, 115, 96.5], + }, + { + "user": "mary", + "age": 14, + "job": "doctor", + "description": "a medical professional who treats diseases and helps people stay healthy", + "last_updated": sample_datetimes["low"].timestamp(), + "credit_score": "low", + "location": "-122.4194,37.7749", + "user_embedding": [0.1, 0.1, 0.5], + "image_embedding": [0.1, 0.2, 0.3, 0.4, 0.5], + "audio_embedding": [0.0, -1.06, 4.55, -1.93, 0.0, 1.53], + }, + { + "user": "nancy", + "age": 94, + "job": "doctor", + "description": "a research scientist specializing in cancers and diseases of the lungs", + "last_updated": 
sample_datetimes["mid"].timestamp(), + "credit_score": "high", + "location": "-122.4194,37.7749", + "user_embedding": [0.7, 0.1, 0.5], + "image_embedding": [0.1, 0.1, 0.3, 0.3, 0.5], + "audio_embedding": [2.75, -0.33, -3.01, -0.52, 5.59, -2.30], + }, + { + "user": "tyler", + "age": 100, + "job": "engineer", + "description": "a software developer with expertise in mathematics and computer science", + "last_updated": sample_datetimes["mid"].timestamp(), + "credit_score": "high", + "location": "-110.0839,37.3861", + "user_embedding": [0.1, 0.4, 0.5], + "image_embedding": [-0.1, -0.2, -0.3, -0.4, -0.5], + "audio_embedding": [1.11, -6.73, 5.41, 1.04, 3.92, 0.73], + }, + { + "user": "tim", + "age": 12, + "job": "dermatologist", + "description": "a medical professional specializing in diseases of the skin", + "last_updated": sample_datetimes["mid"].timestamp(), + "credit_score": "high", + "location": "-110.0839,37.3861", + "user_embedding": [0.4, 0.4, 0.5], + "image_embedding": [-0.1, 0.0, 0.6, 0.0, -0.9], + "audio_embedding": [0.03, -2.67, -2.08, 4.57, -2.33, 0.0], + }, + { + "user": "taimur", + "age": 15, + "job": "CEO", + "description": "high stress, but financially rewarding position at the head of a company", + "last_updated": sample_datetimes["high"].timestamp(), + "credit_score": "low", + "location": "-110.0839,37.3861", + "user_embedding": [0.6, 0.1, 0.5], + "image_embedding": [1.1, 1.2, -0.3, -4.1, 5.0], + "audio_embedding": [0.68, 0.26, 2.08, 2.96, 0.01, 5.13], + }, + { + "user": "joe", + "age": 35, + "job": "dentist", + "description": "like the tooth fairy because they'll take your teeth, but you have to pay them!", + "last_updated": sample_datetimes["high"].timestamp(), + "credit_score": "medium", + "location": "-110.0839,37.3861", + "user_embedding": [-0.1, -0.1, -0.5], + "image_embedding": [-0.8, 2.0, 3.1, 1.5, -1.6], + "audio_embedding": [0.91, 7.10, -2.14, -0.52, -6.08, -5.53], + }, + ] + + def pytest_addoption(parser: pytest.Parser) -> None: 
parser.addoption( "--run-api-tests", diff --git a/tests/integration/test_aggregation.py b/tests/integration/test_aggregation.py index 3561b1de..f1ff7d0b 100644 --- a/tests/integration/test_aggregation.py +++ b/tests/integration/test_aggregation.py @@ -1,14 +1,15 @@ import pytest from redisvl.index import SearchIndex -from redisvl.query import HybridQuery +from redisvl.query import HybridQuery, MultiVectorQuery from redisvl.query.filter import FilterExpression, Geo, GeoRadius, Num, Tag, Text from redisvl.redis.utils import array_to_buffer from tests.conftest import skip_if_redis_version_below @pytest.fixture -def index(sample_data, redis_url, worker_id): +def index(multi_vector_data, redis_url, worker_id): + index = SearchIndex.from_dict( { "index": { @@ -33,6 +34,26 @@ def index(sample_data, redis_url, worker_id): "datatype": "float32", }, }, + { + "name": "image_embedding", + "type": "vector", + "attrs": { + "dims": 5, + "distance_metric": "cosine", + "algorithm": "flat", + "datatype": "float32", + }, + }, + { + "name": "audio_embedding", + "type": "vector", + "attrs": { + "dims": 6, + "distance_metric": "cosine", + "algorithm": "hnsw", + "datatype": "bfloat16", + }, + }, ], }, redis_url=redis_url, @@ -46,9 +67,12 @@ def hash_preprocess(item: dict) -> dict: return { **item, "user_embedding": array_to_buffer(item["user_embedding"], "float32"), + "image_embedding": array_to_buffer(item["image_embedding"], "float32"), + "audio_embedding": array_to_buffer(item["audio_embedding"], "bfloat16"), } - index.load(sample_data, preprocess=hash_preprocess) + ### TODO get sample data that has two vector fields + index.load(multi_vector_data, preprocess=hash_preprocess) # run the test yield index @@ -57,7 +81,7 @@ def hash_preprocess(item: dict) -> dict: index.delete(drop=True) -def test_aggregation_query(index): +def test_hybrid_query(index): skip_if_redis_version_below(index.client, "7.2.0") text = "a medical professional with expertise in lung cancer" @@ -136,7 +160,7 @@ def 
test_empty_query_string(): ) -def test_aggregation_query_with_filter(index): +def test_hybrid_query_with_filter(index): skip_if_redis_version_below(index.client, "7.2.0") text = "a medical professional with expertise in lung cancer" @@ -162,7 +186,7 @@ def test_aggregation_query_with_filter(index): assert int(result["age"]) > 30 -def test_aggregation_query_with_geo_filter(index): +def test_hybrid_query_with_geo_filter(index): skip_if_redis_version_below(index.client, "7.2.0") text = "a medical professional with expertise in lung cancer" @@ -188,7 +212,7 @@ def test_aggregation_query_with_geo_filter(index): @pytest.mark.parametrize("alpha", [0.1, 0.5, 0.9]) -def test_aggregate_query_alpha(index, alpha): +def test_hybrid_query_alpha(index, alpha): skip_if_redis_version_below(index.client, "7.2.0") text = "a medical professional with expertise in lung cancer" @@ -215,7 +239,7 @@ def test_aggregate_query_alpha(index, alpha): ) # allow for small floating point error -def test_aggregate_query_stopwords(index): +def test_hybrid_query_stopwords(index): skip_if_redis_version_below(index.client, "7.2.0") text = "a medical professional with expertise in lung cancer" @@ -249,7 +273,7 @@ def test_aggregate_query_stopwords(index): ) # allow for small floating point error -def test_aggregate_query_with_text_filter(index): +def test_hybrid_query_with_text_filter(index): skip_if_redis_version_below(index.client, "7.2.0") text = "a medical professional with expertise in lung cancer" @@ -292,3 +316,270 @@ def test_aggregate_query_with_text_filter(index): for result in results: assert "medical" in result[text_field].lower() assert "research" not in result[text_field].lower() + + +def test_multivector_query(index): + skip_if_redis_version_below(index.client, "7.2.0") + + vectors = [[0.1, 0.1, 0.5], [0.3, 0.4, 0.7, 0.2, -0.3]] + vector_fields = ["user_embedding", "image_embedding"] + return_fields = ["user", "credit_score", "age", "job", "location", "description"] + + multi_query = 
MultiVectorQuery( + vectors=vectors, + vector_field_names=vector_fields, + return_fields=return_fields, + ) + + results = index.query(multi_query) + assert isinstance(results, list) + assert len(results) == 7 + for doc in results: + assert doc["user"] in [ + "john", + "derrick", + "nancy", + "tyler", + "tim", + "taimur", + "joe", + "mary", + ] + assert int(doc["age"]) in [18, 14, 94, 100, 12, 15, 35] + assert doc["job"] in ["engineer", "doctor", "dermatologist", "CEO", "dentist"] + assert doc["credit_score"] in ["high", "low", "medium"] + + multi_query = MultiVectorQuery( + vectors=vectors, + vector_field_names=vector_fields, + num_results=3, + ) + + results = index.query(multi_query) + assert len(results) == 3 + assert ( + results[0]["combined_score"] + >= results[1]["combined_score"] + >= results[2]["combined_score"] + ) + + +def test_multivector_query_with_filter(index): + skip_if_redis_version_below(index.client, "7.2.0") + + text_field = "description" + vectors = [[0.1, 0.1, 0.5], [0.3, 0.4, 0.7, 0.2, -0.3]] + vector_fields = ["user_embedding", "image_embedding"] + filter_expression = Text(text_field) == ("medical") + + # make sure we can still apply filters to the same text field we are querying + multi_query = MultiVectorQuery( + vectors=vectors, + vector_field_names=vector_fields, + filter_expression=filter_expression, + return_fields=["job", "description"], + ) + + results = index.query(multi_query) + assert len(results) == 2 + for result in results: + assert "medical" in result[text_field].lower() + + filter_expression = (Text(text_field) == ("medical")) & ( + (Text(text_field) != ("research")) + ) + multi_query = MultiVectorQuery( + vectors=vectors, + vector_field_names=vector_fields, + filter_expression=filter_expression, + return_fields=["description"], + ) + + results = index.query(multi_query) + assert len(results) == 2 + for result in results: + assert "medical" in result[text_field].lower() + assert "research" not in result[text_field].lower() + + 
filter_expression = (Num("age") > 30) & ((Num("age") < 30)) + multi_query = MultiVectorQuery( + vectors=vectors, + vector_field_names=vector_fields, + filter_expression=filter_expression, + return_fields=["description"], + ) + + results = index.query(multi_query) + assert len(results) == 0 + + +def test_multivector_query_with_geo_filter(index): + skip_if_redis_version_below(index.client, "7.2.0") + + vectors = [[0.2, 0.4, 0.1], [0.1, 0.8, 0.3, -0.2, 0.3]] + vector_fields = ["user_embedding", "image_embedding"] + return_fields = ["user", "credit_score", "age", "job", "location", "description"] + filter_expression = Geo("location") == GeoRadius(-122.4194, 37.7749, 1000, "m") + + multi_query = MultiVectorQuery( + vectors=vectors, + vector_field_names=vector_fields, + filter_expression=filter_expression, + return_fields=return_fields, + ) + + results = index.query(multi_query) + assert len(results) == 3 + for result in results: + assert result["location"] is not None + + +def test_multivector_query_weights(index): + skip_if_redis_version_below( + index.client, "7.2.0" + ) ## TODO figure out min version for 'case()' + + vectors = [[0.1, 0.2, 0.5], [0.3, 0.4, 0.7, 0.2, -0.3]] + vector_fields = ["user_embedding", "image_embedding"] + return_fields = [ + "distance_0", + "distance_1", + "score_0", + "score_1", + "user_embedding", + "image_embedding", + ] + + # changing the weights does indeed change the result order + multi_query_1 = MultiVectorQuery( + vectors=vectors, + vector_field_names=vector_fields, + return_fields=return_fields, + weights=[0.2, 0.9], + ) + results_1 = index.query(multi_query_1) + + multi_query_2 = MultiVectorQuery( + vectors=vectors, + vector_field_names=vector_fields, + return_fields=return_fields, + weights=[0.5, 0.1], + ) + results_2 = index.query(multi_query_2) + + assert results_1 != results_2 + + for i in range(1, len(results_1)): + assert results_1[i]["combined_score"] <= results_1[i - 1]["combined_score"] + + for i in range(1, 
len(results_2)): + assert results_2[i]["combined_score"] <= results_2[i - 1]["combined_score"] + + # weights can be negative, 0.0, or greater than 1.0 + weights = [-5.2, 0.0] + multi_query = MultiVectorQuery( + vectors=vectors, + vector_field_names=vector_fields, + return_fields=return_fields, + weights=weights, + ) + + results = index.query(multi_query) + assert results + for r in results: + score = float(r["score_0"]) * weights[0] + assert ( + float(r["combined_score"]) - score <= 0.0001 + ) # allow for small floating point error + + # verify we're doing the combined score math correctly + weights = [-1.322, 0.851] + multi_query = MultiVectorQuery( + vectors=vectors, + vector_field_names=vector_fields, + return_fields=return_fields, + weights=weights, + ) + + results = index.query(multi_query) + assert results + for r in results: + score = float(r["score_0"]) * weights[0] + float(r["score_1"]) * weights[1] + assert ( + float(r["combined_score"]) - score <= 0.0001 + ) # allow for small floating point error + + # raise error if wrong number of weights are passed + with pytest.raises(ValueError): + _ = MultiVectorQuery( + vectors=vectors, + vector_field_names=vector_fields, + return_fields=return_fields, + weights=[], + ) + + with pytest.raises(ValueError): + _ = MultiVectorQuery( + vectors=vectors, + vector_field_names=vector_fields, + return_fields=return_fields, + weights=[1.2, 0.23, 0.52], + ) + + +def test_multivector_query_datatypes(index): + skip_if_redis_version_below(index.client, "7.2.0") + + vectors = [[0.1, 0.2, 0.5], [1.2, 0.3, -0.4, 0.7, 0.2, -0.3]] + vector_fields = ["user_embedding", "audio_embedding"] + return_fields = [ + "distance_0", + "distance_1", + "score_0", + "score_1", + "user_embedding", + "audio_embedding", + ] + + # changing the weights does indeed change the result order + multi_query = MultiVectorQuery( + vectors=vectors, + vector_field_names=vector_fields, + return_fields=return_fields, + dtypes=["float32", "bfloat16"], + ) + results 
= index.query(multi_query) + + for i in range(1, len(results)): + assert results[i]["combined_score"] <= results[i - 1]["combined_score"] + + # verify we're doing the combined score math correctly + weights = [-1.322, 0.851] + multi_query = MultiVectorQuery( + vectors=vectors, + vector_field_names=vector_fields, + return_fields=return_fields, + dtypes=["float32", "bfloat16"], + weights=weights, + ) + + results = index.query(multi_query) + assert results + for r in results: + score = float(r["score_0"]) * weights[0] + float(r["score_1"]) * weights[1] + assert ( + float(r["combined_score"]) - score <= 0.0001 + ) # allow for small floating point error + + # raise error if wrong number of datatypes are passed + with pytest.raises(ValueError): + _ = MultiVectorQuery( + vectors=vectors, + vector_field_names=vector_fields, + return_fields=return_fields, + dtypes=["float32", "float32", "float64"], + ) + + +def test_multivector_query_broadcasting(index): + pass diff --git a/tests/unit/test_aggregation_types.py b/tests/unit/test_aggregation_types.py index 5e2f2493..b684e9d0 100644 --- a/tests/unit/test_aggregation_types.py +++ b/tests/unit/test_aggregation_types.py @@ -141,7 +141,6 @@ def test_hybrid_query_with_string_filter(): ) # Check that filter is stored correctly - print("hybrid_query.filter ===", hybrid_query.filter) assert hybrid_query._filter_expression == string_filter # Check that the generated query string includes both text search and filter @@ -198,26 +197,25 @@ def test_hybrid_query_with_string_filter(): def test_aggregate_multi_vector_query(): # test we require vectors and field names - with pytest.raises(ValueError): + with pytest.raises(TypeError): _ = MultiVectorQuery() with pytest.raises(ValueError): _ = MultiVectorQuery(vectors=[sample_vector], vector_field_names=[]) with pytest.raises(ValueError): - _ = MultiVectorQuery(vectors=[], vector_field_names=["field 1"]) + _ = MultiVectorQuery(vectors=[], vector_field_names=["field_1"]) # test we can initialize 
with a single vector and single field name multivector_query = MultiVectorQuery( - vectors=[sample_vector], vector_field_names=["field 1"] + vectors=[sample_vector], vector_field_names=["field_1"] ) - assert query.query is not None # check default properties assert multivector_query._vectors == [sample_vector] - assert multivector_query._vector_field_names == ["field 1"] + assert multivector_query._vector_field_names == ["field_1"] assert multivector_query._filter_expression == None - assert multivector_query._weights == 1.0 + assert multivector_query._weights == [1.0] assert multivector_query._num_results == 10 assert multivector_query._loadfields == [] assert multivector_query._dialect == 2 @@ -225,24 +223,26 @@ def test_aggregate_multi_vector_query(): # test we can initialize with mutliple vectors and field names multivector_query = MultiVectorQuery( vectors=[sample_vector, sample_vector_2, sample_vector_3, sample_vector_4], - vector_field_names=["field 1", "field 2", "field 3", "field 4"], - weights=[0.2, 0.5, 0.6, 0, 1], - dtypes=[], + vector_field_names=["field_1", "field_2", "field_3", "field_4"], + weights=[0.2, 0.5, 0.6, 0.1], + dtypes=["float32", "float32", "float32", "float32"], ) assert len(multivector_query._vectors) == 4 assert len(multivector_query._vector_field_names) == 4 assert len(multivector_query._weights) == 4 + assert len(multivector_query._dtypes) == 4 # test defaults can be overwritten + filter_expression = Tag("user group") == ["group A", "group C"] multivector_query = MultiVectorQuery( vectors=[sample_vector, sample_vector_2, sample_vector_3, sample_vector_4], - vector_field_names=["field 1", "field 2", "field 3", "field 4"], - filter_expression=(Tag("user group") == ["group A", "group C"]), - weights=[0.2, 0.5, 0.6, 0, 1], + vector_field_names=["field_1", "field_2", "field_3", "field_4"], + filter_expression=filter_expression, + weights=[0.2, 0.5, 0.6, 0.1], dtypes=["float32", "float32", "float64", "bfloat16"], num_results=5, - 
return_fields=["field 1", "user name", "address"], + return_fields=["field_1", "user name", "address"], dialect=4, ) @@ -253,15 +253,15 @@ def test_aggregate_multi_vector_query(): sample_vector_4, ] assert multivector_query._vector_field_names == [ - "field 1", - "field 2", - "field 3", - "field 4", + "field_1", + "field_2", + "field_3", + "field_4", ] - assert multivector_query._weights == [0.2, 0.5, 0.6, 0, 1] - assert multivector_query._filter_expression == Tag("user group") + assert multivector_query._weights == [0.2, 0.5, 0.6, 0.1] + assert multivector_query._filter_expression == filter_expression assert multivector_query._num_results == 5 - assert multivector_query._loadfields == ["field 1", "user name", "address"] + assert multivector_query._loadfields == ["field_1", "user name", "address"] assert multivector_query._dialect == 4 From 0518dc40d6e6de4e0a3921befd64a55e569cbd8a Mon Sep 17 00:00:00 2001 From: Justin Cechmanek Date: Thu, 9 Oct 2025 14:18:18 -0700 Subject: [PATCH 04/12] cleans up unit test --- tests/unit/test_aggregation_types.py | 55 +++++++++++----------------- 1 file changed, 21 insertions(+), 34 deletions(-) diff --git a/tests/unit/test_aggregation_types.py b/tests/unit/test_aggregation_types.py index b684e9d0..442a338c 100644 --- a/tests/unit/test_aggregation_types.py +++ b/tests/unit/test_aggregation_types.py @@ -91,6 +91,7 @@ def test_aggregate_hybrid_query(): stopwords=["the", "a", "of"], ) assert hybrid_query.stopwords == set(["the", "a", "of"]) + hybrid_query = HybridQuery( sample_text, text_field_name, @@ -195,7 +196,7 @@ def test_hybrid_query_with_string_filter(): assert "AND" not in query_string_wildcard -def test_aggregate_multi_vector_query(): +def test_multi_vector_query(): # test we require vectors and field names with pytest.raises(TypeError): _ = MultiVectorQuery() @@ -265,51 +266,37 @@ def test_aggregate_multi_vector_query(): assert multivector_query._dialect == 4 -def test_aggregate_multi_vector_query_broadcasting(): - # if a 
single vector and multiple fields is passed we search with the same vector over all fields - multivector_query = MultiVectorQuery( - vectors=[sample_vector], - vector_field_names=["text embedding", "image embedding"], - ) - assert multi_vector_query.query == "" - - # vector being broadcast doesn't need to be in a list - multivector_query = MultiVectorQuery( - vectors=sample_vector, vector_field_names=["text embedding", "image embedding"] - ) - assert multi_vector_query.query == "" - - # if multiple vectors are passed and a single field name we search with all vectors on that field - multivector_query = MultiVectorQuery( +def test_multi_vector_query_broadcasting(): + # if a single weight is passed it is applied to all similarity scores + field_1 = "text embedding" + field_2 = "image embedding" + weight = 0.2 + multi_vector_query = MultiVectorQuery( vectors=[sample_vector_2, sample_vector_3], - vector_field_names=["text embedding"], + vector_field_names=[field_1, field_2], + weights=[weight], ) - assert multi_vector_query.query == "" - # vector field name does not need to be in a list if only one is provided - multivector_query = MultiVectorQuery( - vectors=[sample_vector_2, sample_vector_3], vector_field_names="text embedding" + assert ( + str(multi_vector_query) + == f"@{field_1}:[VECTOR_RANGE 2.0 $vector_0]=>{{$YIELD_DISTANCE_AS: distance_0}} | @{field_2}:[VECTOR_RANGE 2.0 $vector_1]=>{{$YIELD_DISTANCE_AS: distance_1}} SCORER TFIDF DIALECT 2 APPLY (2 - @distance_0)/2 AS score_0 APPLY (2 - @distance_1)/2 AS score_1 APPLY @score_0 * {weight} + @score_1 * {weight} AS combined_score SORTBY 2 @combined_score DESC MAX 10" ) - assert multi_vector_query.query == "" - # if a single weight is passed it is applied to all similarity scores - multivector_query = MultiVectorQuery( + # if a single dtype is passed it is applied to all vectors + multi_vector_query = MultiVectorQuery( vectors=[sample_vector_2, sample_vector_3], vector_field_names=["text embedding", "image 
embedding"], - weights=[0.2], + dtypes=["float16"], ) - assert multi_vector_query.query == "" - # weight does not need to be in a list if only one is provided - multivector_query = MultiVectorQuery( - vectors=[sample_vector_2, sample_vector_3], - vector_field_names=["text embedding", "image embedding"], - weights=0.2, + assert multi_vector_query._dtypes == ["float16", "float16"] + assert ( + str(multi_vector_query) + == f"@{field_1}:[VECTOR_RANGE 2.0 $vector_0]=>{{$YIELD_DISTANCE_AS: distance_0}} | @{field_2}:[VECTOR_RANGE 2.0 $vector_1]=>{{$YIELD_DISTANCE_AS: distance_1}} SCORER TFIDF DIALECT 2 APPLY (2 - @distance_0)/2 AS score_0 APPLY (2 - @distance_1)/2 AS score_1 APPLY @score_0 * 1.0 + @score_1 * 1.0 AS combined_score SORTBY 2 @combined_score DESC MAX 10" ) - assert multi_vector_query.query == "" -def test_aggregate_multi_vector_query_errors(): +def test_multi_vector_query_errors(): # test an error is raised if the number of vectors and number of fields don't match with pytest.raises(ValueError): _ = MultiVectorQuery( From 7c85122deea2076755613218011d3f69ba090532 Mon Sep 17 00:00:00 2001 From: Justin Cechmanek Date: Thu, 9 Oct 2025 15:14:38 -0700 Subject: [PATCH 05/12] skips test if redis version not new enough --- tests/integration/test_aggregation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_aggregation.py b/tests/integration/test_aggregation.py index f1ff7d0b..d4658660 100644 --- a/tests/integration/test_aggregation.py +++ b/tests/integration/test_aggregation.py @@ -582,4 +582,5 @@ def test_multivector_query_datatypes(index): def test_multivector_query_broadcasting(index): + skip_if_redis_version_below(index.client, "7.2.0") pass From 6493304e93fd5e31a5a4f2988003d9a733410388 Mon Sep 17 00:00:00 2001 From: Justin Cechmanek Date: Thu, 9 Oct 2025 16:06:57 -0700 Subject: [PATCH 06/12] tests hnsw multi vector indices only on supported search module versions --- tests/conftest.py | 83 --------------------------- 
tests/integration/test_aggregation.py | 64 ++++++++++++++++++++- 2 files changed, 61 insertions(+), 86 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 99baf110..692ce77d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -232,89 +232,6 @@ def sample_datetimes(): } -@pytest.fixture -def OG(sample_datetimes): - return [ - { - "user": "john", - "age": 18, - "job": "engineer", - "description": "engineers conduct trains that ride on train tracks", - "last_updated": sample_datetimes["low"].timestamp(), - "credit_score": "high", - "location": "-122.4194,37.7749", - "user_embedding": [0.1, 0.1, 0.5], - "image_embedding": [0.1, 0.1, 0.1, 0.1, 0.1], - }, - { - "user": "mary", - "age": 14, - "job": "doctor", - "description": "a medical professional who treats diseases and helps people stay healthy", - "last_updated": sample_datetimes["low"].timestamp(), - "credit_score": "low", - "location": "-122.4194,37.7749", - "user_embedding": [0.1, 0.1, 0.5], - "image_embedding": [0.1, 0.2, 0.3, 0.4, 0.5], - }, - { - "user": "nancy", - "age": 94, - "job": "doctor", - "description": "a research scientist specializing in cancers and diseases of the lungs", - "last_updated": sample_datetimes["mid"].timestamp(), - "credit_score": "high", - "location": "-122.4194,37.7749", - "user_embedding": [0.7, 0.1, 0.5], - "image_embedding": [0.1, 0.1, 0.3, 0.3, 0.5], - }, - { - "user": "tyler", - "age": 100, - "job": "engineer", - "description": "a software developer with expertise in mathematics and computer science", - "last_updated": sample_datetimes["mid"].timestamp(), - "credit_score": "high", - "location": "-110.0839,37.3861", - "user_embedding": [0.1, 0.4, 0.5], - "image_embedding": [-0.1, -0.2, -0.3, -0.4, -0.5], - }, - { - "user": "tim", - "age": 12, - "job": "dermatologist", - "description": "a medical professional specializing in diseases of the skin", - "last_updated": sample_datetimes["mid"].timestamp(), - "credit_score": "high", - "location": 
"-110.0839,37.3861", - "user_embedding": [0.4, 0.4, 0.5], - "image_embedding": [-0.1, 0.0, 0.6, 0.0, -0.9], - }, - { - "user": "taimur", - "age": 15, - "job": "CEO", - "description": "high stress, but financially rewarding position at the head of a company", - "last_updated": sample_datetimes["high"].timestamp(), - "credit_score": "low", - "location": "-110.0839,37.3861", - "user_embedding": [0.6, 0.1, 0.5], - "image_embedding": [1.1, 1.2, -0.3, -4.1, 5.0], - }, - { - "user": "joe", - "age": 35, - "job": "dentist", - "description": "like the tooth fairy because they'll take your teeth, but you have to pay them!", - "last_updated": sample_datetimes["high"].timestamp(), - "credit_score": "medium", - "location": "-110.0839,37.3861", - "user_embedding": [-0.1, -0.1, -0.5], - "image_embedding": [-0.8, 2.0, 3.1, 1.5, -1.6], - }, - ] - - @pytest.fixture def sample_data(sample_datetimes): return [ diff --git a/tests/integration/test_aggregation.py b/tests/integration/test_aggregation.py index d4658660..32782457 100644 --- a/tests/integration/test_aggregation.py +++ b/tests/integration/test_aggregation.py @@ -50,7 +50,7 @@ def index(multi_vector_data, redis_url, worker_id): "attrs": { "dims": 6, "distance_metric": "cosine", - "algorithm": "hnsw", + "algorithm": "flat", "datatype": "bfloat16", }, }, @@ -581,6 +581,64 @@ def test_multivector_query_datatypes(index): ) -def test_multivector_query_broadcasting(index): +def test_multivector_query_mixed_index(index): + # test that we can do multi vector queries on indices with both a 'flat' and 'hnsw' index skip_if_redis_version_below(index.client, "7.2.0") - pass + try: + index.schema.remove_field("audio_embedding") + index.schema.add_field( + { + "name": "audio_embedding", + "type": "vector", + "attrs": { + "dims": 6, + "distance_metric": "cosine", + "algorithm": "hnsw", + "datatype": "bfloat16", + }, + }, + ) + + except: + pytest.skip("Required Redis modules not available or version too low") + + vectors = [[0.1, 0.2, 0.5], 
[1.2, 0.3, -0.4, 0.7, 0.2, -0.3]] + vector_fields = ["user_embedding", "audio_embedding"] + return_fields = [ + "distance_0", + "distance_1", + "score_0", + "score_1", + "user_embedding", + "audio_embedding", + ] + + # changing the weights does indeed change the result order + multi_query = MultiVectorQuery( + vectors=vectors, + vector_field_names=vector_fields, + return_fields=return_fields, + dtypes=["float32", "bfloat16"], + ) + results = index.query(multi_query) + + for i in range(1, len(results)): + assert results[i]["combined_score"] <= results[i - 1]["combined_score"] + + # verify we're doing the combined score math correctly + weights = [-1.322, 0.851] + multi_query = MultiVectorQuery( + vectors=vectors, + vector_field_names=vector_fields, + return_fields=return_fields, + dtypes=["float32", "bfloat16"], + weights=weights, + ) + + results = index.query(multi_query) + assert results + for r in results: + score = float(r["score_0"]) * weights[0] + float(r["score_1"]) * weights[1] + assert ( + float(r["combined_score"]) - score <= 0.0001 + ) # allow for small floating point error From 406f4202e5fb739a9251322e3ed5e866983de2de Mon Sep 17 00:00:00 2001 From: Justin Cechmanek Date: Thu, 9 Oct 2025 17:02:53 -0700 Subject: [PATCH 07/12] switches test to float64 --- tests/integration/test_aggregation.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/integration/test_aggregation.py b/tests/integration/test_aggregation.py index 32782457..38f1f5fb 100644 --- a/tests/integration/test_aggregation.py +++ b/tests/integration/test_aggregation.py @@ -51,7 +51,7 @@ def index(multi_vector_data, redis_url, worker_id): "dims": 6, "distance_metric": "cosine", "algorithm": "flat", - "datatype": "bfloat16", + "datatype": "float64", }, }, ], @@ -68,7 +68,7 @@ def hash_preprocess(item: dict) -> dict: **item, "user_embedding": array_to_buffer(item["user_embedding"], "float32"), "image_embedding": array_to_buffer(item["image_embedding"], 
"float32"), - "audio_embedding": array_to_buffer(item["audio_embedding"], "bfloat16"), + "audio_embedding": array_to_buffer(item["audio_embedding"], "float64"), } ### TODO get sample data that has two vector fields @@ -546,7 +546,7 @@ def test_multivector_query_datatypes(index): vectors=vectors, vector_field_names=vector_fields, return_fields=return_fields, - dtypes=["float32", "bfloat16"], + dtypes=["float32", "float64"], ) results = index.query(multi_query) @@ -559,7 +559,7 @@ def test_multivector_query_datatypes(index): vectors=vectors, vector_field_names=vector_fields, return_fields=return_fields, - dtypes=["float32", "bfloat16"], + dtypes=["float32", "float64"], weights=weights, ) @@ -594,7 +594,7 @@ def test_multivector_query_mixed_index(index): "dims": 6, "distance_metric": "cosine", "algorithm": "hnsw", - "datatype": "bfloat16", + "datatype": "float64", }, }, ) @@ -618,7 +618,7 @@ def test_multivector_query_mixed_index(index): vectors=vectors, vector_field_names=vector_fields, return_fields=return_fields, - dtypes=["float32", "bfloat16"], + dtypes=["float32", "float64"], ) results = index.query(multi_query) @@ -631,7 +631,7 @@ def test_multivector_query_mixed_index(index): vectors=vectors, vector_field_names=vector_fields, return_fields=return_fields, - dtypes=["float32", "bfloat16"], + dtypes=["float32", "float64"], weights=weights, ) From e23b652c974a8c97669eeddc2dab2814422873db Mon Sep 17 00:00:00 2001 From: Justin Cechmanek Date: Fri, 10 Oct 2025 17:45:08 -0700 Subject: [PATCH 08/12] refactors MultiVectorQuery to accept Vector objects --- redisvl/query/__init__.py | 8 +- redisvl/query/aggregate.py | 126 ++++++++++++------------ tests/integration/test_aggregation.py | 126 ++++++++++++------------ tests/unit/test_aggregation_types.py | 136 ++++++++++++-------------- 4 files changed, 193 insertions(+), 203 deletions(-) diff --git a/redisvl/query/__init__.py b/redisvl/query/__init__.py index 67c29d2b..8cae93b2 100644 --- a/redisvl/query/__init__.py +++ 
b/redisvl/query/__init__.py @@ -1,4 +1,9 @@ -from redisvl.query.aggregate import AggregationQuery, HybridQuery, MultiVectorQuery +from redisvl.query.aggregate import ( + AggregationQuery, + HybridQuery, + MultiVectorQuery, + Vector, +) from redisvl.query.query import ( BaseQuery, BaseVectorQuery, @@ -22,4 +27,5 @@ "AggregationQuery", "HybridQuery", "MultiVectorQuery", + "Vector", ] diff --git a/redisvl/query/aggregate.py b/redisvl/query/aggregate.py index d0a4273b..d3e89a25 100644 --- a/redisvl/query/aggregate.py +++ b/redisvl/query/aggregate.py @@ -1,9 +1,11 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Union +from pydantic import BaseModel, field_validator from redis.commands.search.aggregation import AggregateRequest, Desc from redisvl.query.filter import FilterExpression from redisvl.redis.utils import array_to_buffer +from redisvl.schema.fields import VectorDataType from redisvl.utils.token_escaper import TokenEscaper from redisvl.utils.utils import lazy_import @@ -11,6 +13,29 @@ nltk_stopwords = lazy_import("nltk.corpus.stopwords") +class Vector(BaseModel): + """ + Simple object containing the necessary arguments to perform a multi vector query. + """ + + vector: Union[List[float], bytes] + field_name: str + dtype: str = "float32" + weight: float = 1.0 + + @field_validator("dtype") + @classmethod + def validate_dtype(cls, dtype: str) -> str: + try: + VectorDataType(dtype.upper()) + except ValueError: + raise ValueError( + f"Invalid data type: {dtype}. Supported types are: {[t.lower() for t in VectorDataType]}" + ) + + return dtype + + class AggregationQuery(AggregateRequest): """ Base class for aggregation queries used to create aggregation queries for Redis. @@ -241,17 +266,33 @@ class MultiVectorQuery(AggregationQuery): .. 
code-block:: python - from redisvl.query import MultiVectorQuery + from redisvl.query import MultiVectorQuery, Vector from redisvl.index import SearchIndex index = SearchIndex.from_yaml("path/to/index.yaml") + vector_1 = Vector( + vector=[0.1, 0.2, 0.3], + field_name="text_vector", + dtype="float32", + weight=0.7, + ) + vector_2 = Vector( + vector=[0.5, 0.5], + field_name="image_vector", + dtype="bfloat16", + weight=0.2, + ) + vector_3 = Vector( + vector=[0.1, 0.2, 0.3], + field_name="text_vector", + dtype="float64", + weight=0.5, + ) + query = MultiVectorQuery( - vectors=[[0.1, 0.2, 0.3], [0.5, 0.5], [0.1, 0.1, 0.1, 0.1]], - vector_field_names=["text_vector", "image_vector", "feature_vector"] + vectors=[vector_1, vector_2, vector_3], filter_expression=None, - weights=[0.7, 0.2, 0.5], - dtypes=["float32", "bfloat16", "float64"], num_results=10, return_fields=["field1", "field2"], dialect=2, @@ -260,14 +301,13 @@ class MultiVectorQuery(AggregationQuery): results = index.query(query) """ + _vectors: List[Vector] + def __init__( self, - vectors: Union[bytes, List[bytes], List[float], List[List[float]]], - vector_field_names: Union[str, List[str]], - weights: List[float] = [1.0], + vectors: Union[Vector, List[Vector]], return_fields: Optional[List[str]] = None, filter_expression: Optional[Union[str, FilterExpression]] = None, - dtypes: List[str] = ["float32"], num_results: int = 10, return_score: bool = False, dialect: int = 2, @@ -276,87 +316,39 @@ def __init__( Instantiates a MultiVectorQuery object. Args: - vectors (Union[bytes, List[bytes], List[float], List[List[float]]): The vectors to perform vector similarity search. - vector_field_names (Union[str, List[str]]): The vector field names to search in. - weights (List[float]): The weights of the vector similarity. - Documents will be scored as: - score = (w1) * score1 + (w2) * score2 + (w3) * score3 + ... 
- Defaults to [1.0], which corresponds to equal weighting + vectors (Union[Vector, List[Vector]]): The Vectors to perform vector similarity search. return_fields (Optional[List[str]], optional): The fields to return. Defaults to None. filter_expression (Optional[Union[str, FilterExpression]]): The filter expression to use. Defaults to None. - dtypes (List[str]): The data types of the vectors. Defaults to ["float32"] for all vectors. num_results (int, optional): The number of results to return. Defaults to 10. return_score (bool): Whether to return the combined vector similarity score. Defaults to False. dialect (int, optional): The Redis dialect version. Defaults to 2. - - Raises: - ValueError: The number of vectors, vector field names, and weights do not agree. """ self._filter_expression = filter_expression - self._dtypes = dtypes self._num_results = num_results - if any([len(x) == 0 for x in [vectors, vector_field_names, weights, dtypes]]): - raise ValueError( - f"""The number of vectors and vector field names must be equal. - If weights or dtypes are specified their number must match the number of vectors and vector field names also. 
- Length of vectors list: {len(vectors) = } - Length of vector_field_names list: {len(vector_field_names) = } - Length of weights list: {len(weights) = } - length of dtypes list: {len(dtypes) = } - """ - ) - - if isinstance(vectors, bytes) or isinstance(vectors[0], float): + if isinstance(vectors, Vector): self._vectors = [vectors] else: self._vectors = vectors # type: ignore - if isinstance(vector_field_names, str): - self._vector_field_names = [vector_field_names] - else: - self._vector_field_names = vector_field_names - - if len(weights) == 1: - self._weights = weights * len(vectors) - else: - self._weights = weights - - if len(dtypes) == 1: - self._dtypes = dtypes * len(vectors) - else: - self._dtypes = dtypes - - num_vectors = len(self._vectors) - if any( - [ - len(x) != num_vectors # type: ignore - for x in [self._vector_field_names, self._weights, self._dtypes] - ] - ): - raise ValueError( - f"""The number of vectors and vector field names must be equal. - If weights or dtypes are specified their number must match the number of vectors and vector field names also. - Length of vectors list: {len(self._vectors) = } - Length of vector_field_names list: {len(self._vector_field_names) = } - Length of weights list: {len(self._weights) = } - Length of dtypes list: {len(self._dtypes) = } - """ + if not all([isinstance(v, Vector) for v in self._vectors]): + raise TypeError( + "vector arugment must be a Vector object or list of Vector objects." 
) query_string = self._build_query_string() super().__init__(query_string) # calculate the respective vector similarities - for i in range(len(vectors)): + for i in range(len(self._vectors)): self.apply(**{f"score_{i}": f"(2 - @distance_{i})/2"}) # construct the scoring string based on the vector similarity scores and weights combined_scores = [] - for i, w in enumerate(self._weights): + for i, w in enumerate([v.weight for v in self._vectors]): combined_scores.append(f"@score_{i} * {w}") combined_score_string = " + ".join(combined_scores) @@ -375,7 +367,9 @@ def params(self) -> Dict[str, Any]: Dict[str, Any]: The parameters for the aggregation. """ params = {} - for i, (vector, dtype) in enumerate(zip(self._vectors, self._dtypes)): + for i, (vector, dtype) in enumerate( + [(v.vector, v.dtype) for v in self._vectors] + ): if isinstance(vector, list): vector = array_to_buffer(vector, dtype=dtype) # type: ignore params[f"vector_{i}"] = vector @@ -387,7 +381,7 @@ def _build_query_string(self) -> str: # base KNN query range_queries = [] for i, (vector, field) in enumerate( - zip(self._vectors, self._vector_field_names) + [(v.vector, v.field_name) for v in self._vectors] ): range_queries.append( f"@{field}:[VECTOR_RANGE 2.0 $vector_{i}]=>{{$YIELD_DISTANCE_AS: distance_{i}}}" diff --git a/tests/integration/test_aggregation.py b/tests/integration/test_aggregation.py index 38f1f5fb..f08815a6 100644 --- a/tests/integration/test_aggregation.py +++ b/tests/integration/test_aggregation.py @@ -1,7 +1,7 @@ import pytest from redisvl.index import SearchIndex -from redisvl.query import HybridQuery, MultiVectorQuery +from redisvl.query import HybridQuery, MultiVectorQuery, Vector from redisvl.query.filter import FilterExpression, Geo, GeoRadius, Num, Tag, Text from redisvl.redis.utils import array_to_buffer from tests.conftest import skip_if_redis_version_below @@ -71,7 +71,6 @@ def hash_preprocess(item: dict) -> dict: "audio_embedding": array_to_buffer(item["audio_embedding"], 
"float64"), } - ### TODO get sample data that has two vector fields index.load(multi_vector_data, preprocess=hash_preprocess) # run the test @@ -321,13 +320,16 @@ def test_hybrid_query_with_text_filter(index): def test_multivector_query(index): skip_if_redis_version_below(index.client, "7.2.0") - vectors = [[0.1, 0.1, 0.5], [0.3, 0.4, 0.7, 0.2, -0.3]] + vector_vals = [[0.1, 0.1, 0.5], [0.3, 0.4, 0.7, 0.2, -0.3]] vector_fields = ["user_embedding", "image_embedding"] + vectors = [] + for vector, field in zip(vector_vals, vector_fields): + vectors.append(Vector(vector=vector, field_name=field)) + return_fields = ["user", "credit_score", "age", "job", "location", "description"] multi_query = MultiVectorQuery( vectors=vectors, - vector_field_names=vector_fields, return_fields=return_fields, ) @@ -351,7 +353,6 @@ def test_multivector_query(index): multi_query = MultiVectorQuery( vectors=vectors, - vector_field_names=vector_fields, num_results=3, ) @@ -368,14 +369,17 @@ def test_multivector_query_with_filter(index): skip_if_redis_version_below(index.client, "7.2.0") text_field = "description" - vectors = [[0.1, 0.1, 0.5], [0.3, 0.4, 0.7, 0.2, -0.3]] + vector_vals = [[0.1, 0.1, 0.5], [0.3, 0.4, 0.7, 0.2, -0.3]] vector_fields = ["user_embedding", "image_embedding"] filter_expression = Text(text_field) == ("medical") + vectors = [] + for vector, field in zip(vector_vals, vector_fields): + vectors.append(Vector(vector=vector, field_name=field)) + # make sure we can still apply filters to the same text field we are querying multi_query = MultiVectorQuery( vectors=vectors, - vector_field_names=vector_fields, filter_expression=filter_expression, return_fields=["job", "description"], ) @@ -390,7 +394,6 @@ def test_multivector_query_with_filter(index): ) multi_query = MultiVectorQuery( vectors=vectors, - vector_field_names=vector_fields, filter_expression=filter_expression, return_fields=["description"], ) @@ -404,7 +407,6 @@ def test_multivector_query_with_filter(index): 
filter_expression = (Num("age") > 30) & ((Num("age") < 30)) multi_query = MultiVectorQuery( vectors=vectors, - vector_field_names=vector_fields, filter_expression=filter_expression, return_fields=["description"], ) @@ -416,14 +418,17 @@ def test_multivector_query_with_filter(index): def test_multivector_query_with_geo_filter(index): skip_if_redis_version_below(index.client, "7.2.0") - vectors = [[0.2, 0.4, 0.1], [0.1, 0.8, 0.3, -0.2, 0.3]] + vector_vals = [[0.2, 0.4, 0.1], [0.1, 0.8, 0.3, -0.2, 0.3]] vector_fields = ["user_embedding", "image_embedding"] return_fields = ["user", "credit_score", "age", "job", "location", "description"] filter_expression = Geo("location") == GeoRadius(-122.4194, 37.7749, 1000, "m") + vectors = [] + for vector, field in zip(vector_vals, vector_fields): + vectors.append(Vector(vector=vector, field_name=field)) + multi_query = MultiVectorQuery( vectors=vectors, - vector_field_names=vector_fields, filter_expression=filter_expression, return_fields=return_fields, ) @@ -435,11 +440,9 @@ def test_multivector_query_with_geo_filter(index): def test_multivector_query_weights(index): - skip_if_redis_version_below( - index.client, "7.2.0" - ) ## TODO figure out min version for 'case()' + skip_if_redis_version_below(index.client, "7.2.0") - vectors = [[0.1, 0.2, 0.5], [0.3, 0.4, 0.7, 0.2, -0.3]] + vector_vals = [[0.1, 0.2, 0.5], [0.3, 0.4, 0.7, 0.2, -0.3]] vector_fields = ["user_embedding", "image_embedding"] return_fields = [ "distance_0", @@ -450,20 +453,25 @@ def test_multivector_query_weights(index): "image_embedding", ] + vectors = [] + for vector, field in zip(vector_vals, vector_fields): + vectors.append(Vector(vector=vector, field_name=field)) + # changing the weights does indeed change the result order multi_query_1 = MultiVectorQuery( vectors=vectors, - vector_field_names=vector_fields, return_fields=return_fields, - weights=[0.2, 0.9], ) results_1 = index.query(multi_query_1) + weights = [0.2, 0.9] + vectors = [] + for vector, field, 
weight in zip(vector_vals, vector_fields, weights): + vectors.append(Vector(vector=vector, field_name=field, weight=weight)) + multi_query_2 = MultiVectorQuery( vectors=vectors, - vector_field_names=vector_fields, return_fields=return_fields, - weights=[0.5, 0.1], ) results_2 = index.query(multi_query_2) @@ -477,11 +485,13 @@ def test_multivector_query_weights(index): # weights can be negative, 0.0, or greater than 1.0 weights = [-5.2, 0.0] + vectors = [] + for vector, field, weight in zip(vector_vals, vector_fields, weights): + vectors.append(Vector(vector=vector, field_name=field, weight=weight)) + multi_query = MultiVectorQuery( vectors=vectors, - vector_field_names=vector_fields, return_fields=return_fields, - weights=weights, ) results = index.query(multi_query) @@ -494,11 +504,13 @@ def test_multivector_query_weights(index): # verify we're doing the combined score math correctly weights = [-1.322, 0.851] + vectors = [] + for vector, field, weight in zip(vector_vals, vector_fields, weights): + vectors.append(Vector(vector=vector, field_name=field, weight=weight)) + multi_query = MultiVectorQuery( vectors=vectors, - vector_field_names=vector_fields, return_fields=return_fields, - weights=weights, ) results = index.query(multi_query) @@ -509,29 +521,13 @@ def test_multivector_query_weights(index): float(r["combined_score"]) - score <= 0.0001 ) # allow for small floating point error - # raise error if wrong number of weights are passed - with pytest.raises(ValueError): - _ = MultiVectorQuery( - vectors=vectors, - vector_field_names=vector_fields, - return_fields=return_fields, - weights=[], - ) - - with pytest.raises(ValueError): - _ = MultiVectorQuery( - vectors=vectors, - vector_field_names=vector_fields, - return_fields=return_fields, - weights=[1.2, 0.23, 0.52], - ) - def test_multivector_query_datatypes(index): skip_if_redis_version_below(index.client, "7.2.0") - vectors = [[0.1, 0.2, 0.5], [1.2, 0.3, -0.4, 0.7, 0.2, -0.3]] + vector_vals = [[0.1, 0.2, 0.5], 
[1.2, 0.3, -0.4, 0.7, 0.2, -0.3]] vector_fields = ["user_embedding", "audio_embedding"] + dtypes = ["float32", "float64"] return_fields = [ "distance_0", "distance_1", @@ -541,12 +537,13 @@ def test_multivector_query_datatypes(index): "audio_embedding", ] - # changing the weights does indeed change the result order + vectors = [] + for vector, field, dtype in zip(vector_vals, vector_fields, dtypes): + vectors.append(Vector(vector=vector, field_name=field, dtype=dtype)) + multi_query = MultiVectorQuery( vectors=vectors, - vector_field_names=vector_fields, return_fields=return_fields, - dtypes=["float32", "float64"], ) results = index.query(multi_query) @@ -555,12 +552,17 @@ def test_multivector_query_datatypes(index): # verify we're doing the combined score math correctly weights = [-1.322, 0.851] + vectors = [] + for vector, field, weight, dtype in zip( + vector_vals, vector_fields, weights, dtypes + ): + vectors.append( + Vector(vector=vector, field_name=field, weight=weight, dtype=dtype) + ) + multi_query = MultiVectorQuery( vectors=vectors, - vector_field_names=vector_fields, return_fields=return_fields, - dtypes=["float32", "float64"], - weights=weights, ) results = index.query(multi_query) @@ -571,15 +573,6 @@ def test_multivector_query_datatypes(index): float(r["combined_score"]) - score <= 0.0001 ) # allow for small floating point error - # raise error if wrong number of datatypes are passed - with pytest.raises(ValueError): - _ = MultiVectorQuery( - vectors=vectors, - vector_field_names=vector_fields, - return_fields=return_fields, - dtypes=["float32", "float32", "float64"], - ) - def test_multivector_query_mixed_index(index): # test that we can do multi vector queries on indices with both a 'flat' and 'hnsw' index @@ -602,8 +595,9 @@ def test_multivector_query_mixed_index(index): except: pytest.skip("Required Redis modules not available or version too low") - vectors = [[0.1, 0.2, 0.5], [1.2, 0.3, -0.4, 0.7, 0.2, -0.3]] + vector_vals = [[0.1, 0.2, 0.5], 
[1.2, 0.3, -0.4, 0.7, 0.2, -0.3]] vector_fields = ["user_embedding", "audio_embedding"] + dtypes = ["float32", "float64"] return_fields = [ "distance_0", "distance_1", @@ -613,12 +607,13 @@ def test_multivector_query_mixed_index(index): "audio_embedding", ] - # changing the weights does indeed change the result order + vectors = [] + for vector, field, dtype in zip(vector_vals, vector_fields, dtypes): + vectors.append(Vector(vector=vector, field_name=field, dtype=dtype)) + multi_query = MultiVectorQuery( vectors=vectors, - vector_field_names=vector_fields, return_fields=return_fields, - dtypes=["float32", "float64"], ) results = index.query(multi_query) @@ -627,12 +622,17 @@ def test_multivector_query_mixed_index(index): # verify we're doing the combined score math correctly weights = [-1.322, 0.851] + vectors = [] + for vector, field, dtype, weight in zip( + vector_vals, vector_fields, dtypes, weights + ): + vectors.append( + Vector(vector=vector, field_name=field, dtype=dtype, weight=weight) + ) + multi_query = MultiVectorQuery( vectors=vectors, - vector_field_names=vector_fields, return_fields=return_fields, - dtypes=["float32", "float64"], - weights=weights, ) results = index.query(multi_query) diff --git a/tests/unit/test_aggregation_types.py b/tests/unit/test_aggregation_types.py index 442a338c..4d3b18e2 100644 --- a/tests/unit/test_aggregation_types.py +++ b/tests/unit/test_aggregation_types.py @@ -4,7 +4,7 @@ from redis.commands.search.result import Result from redisvl.index.index import process_results -from redisvl.query.aggregate import HybridQuery, MultiVectorQuery +from redisvl.query.aggregate import HybridQuery, MultiVectorQuery, Vector from redisvl.query.filter import Tag # Sample data for testing @@ -197,130 +197,120 @@ def test_hybrid_query_with_string_filter(): def test_multi_vector_query(): - # test we require vectors and field names + # test we require Vector objects with pytest.raises(TypeError): _ = MultiVectorQuery() - with 
pytest.raises(ValueError): - _ = MultiVectorQuery(vectors=[sample_vector], vector_field_names=[]) + with pytest.raises(TypeError): + _ = MultiVectorQuery(vector=[sample_vector]) - with pytest.raises(ValueError): - _ = MultiVectorQuery(vectors=[], vector_field_names=["field_1"]) + with pytest.raises(TypeError): + _ = MultiVectorQuery(vectors=[[0.1, 0.1, 0.1], "field_1"]) # test we can initialize with a single vector and single field name multivector_query = MultiVectorQuery( - vectors=[sample_vector], vector_field_names=["field_1"] + Vector(vector=sample_vector, field_name="field_1") ) # check default properties - assert multivector_query._vectors == [sample_vector] - assert multivector_query._vector_field_names == ["field_1"] + assert multivector_query._vectors == [ + Vector(vector=sample_vector, field_name="field_1") + ] + assert multivector_query._vectors[0].field_name == "field_1" + assert multivector_query._vectors[0].weight == 1.0 + assert multivector_query._vectors[0].dtype == "float32" assert multivector_query._filter_expression == None - assert multivector_query._weights == [1.0] assert multivector_query._num_results == 10 assert multivector_query._loadfields == [] assert multivector_query._dialect == 2 - # test we can initialize with mutliple vectors and field names - multivector_query = MultiVectorQuery( - vectors=[sample_vector, sample_vector_2, sample_vector_3, sample_vector_4], - vector_field_names=["field_1", "field_2", "field_3", "field_4"], - weights=[0.2, 0.5, 0.6, 0.1], - dtypes=["float32", "float32", "float32", "float32"], - ) + # test we can initialize with mutliple Vectors + vectors = [sample_vector, sample_vector_2, sample_vector_3, sample_vector_4] + vector_field_names = ["field_1", "field_2", "field_3", "field_4"] + weights = [0.2, 0.5, 0.6, 0.1] + dtypes = ["float32", "float32", "float32", "float32"] + + args = [] + for vec, field, weight, dtype in zip(vectors, vector_field_names, weights, dtypes): + args.append(Vector(vector=vec, 
field_name=field, weight=weight, dtype=dtype)) + + multivector_query = MultiVectorQuery(vectors=args) assert len(multivector_query._vectors) == 4 - assert len(multivector_query._vector_field_names) == 4 - assert len(multivector_query._weights) == 4 - assert len(multivector_query._dtypes) == 4 + assert multivector_query._vectors == args # test defaults can be overwritten filter_expression = Tag("user group") == ["group A", "group C"] + multivector_query = MultiVectorQuery( - vectors=[sample_vector, sample_vector_2, sample_vector_3, sample_vector_4], - vector_field_names=["field_1", "field_2", "field_3", "field_4"], + vectors=args, filter_expression=filter_expression, - weights=[0.2, 0.5, 0.6, 0.1], - dtypes=["float32", "float32", "float64", "bfloat16"], num_results=5, return_fields=["field_1", "user name", "address"], dialect=4, ) - assert multivector_query._vectors == [ - sample_vector, - sample_vector_2, - sample_vector_3, - sample_vector_4, - ] - assert multivector_query._vector_field_names == [ - "field_1", - "field_2", - "field_3", - "field_4", - ] - assert multivector_query._weights == [0.2, 0.5, 0.6, 0.1] assert multivector_query._filter_expression == filter_expression assert multivector_query._num_results == 5 assert multivector_query._loadfields == ["field_1", "user name", "address"] assert multivector_query._dialect == 4 -def test_multi_vector_query_broadcasting(): +def test_multi_vector_query_string(): # if a single weight is passed it is applied to all similarity scores field_1 = "text embedding" field_2 = "image embedding" - weight = 0.2 + weight_1 = 0.2 + weight_2 = 0.7 multi_vector_query = MultiVectorQuery( - vectors=[sample_vector_2, sample_vector_3], - vector_field_names=[field_1, field_2], - weights=[weight], + vectors=[ + Vector(vector=sample_vector_2, field_name=field_1, weight=weight_1), + Vector(vector=sample_vector_3, field_name=field_2, weight=weight_2), + ] ) assert ( str(multi_vector_query) - == f"@{field_1}:[VECTOR_RANGE 2.0 
$vector_0]=>{{$YIELD_DISTANCE_AS: distance_0}} | @{field_2}:[VECTOR_RANGE 2.0 $vector_1]=>{{$YIELD_DISTANCE_AS: distance_1}} SCORER TFIDF DIALECT 2 APPLY (2 - @distance_0)/2 AS score_0 APPLY (2 - @distance_1)/2 AS score_1 APPLY @score_0 * {weight} + @score_1 * {weight} AS combined_score SORTBY 2 @combined_score DESC MAX 10" + == f"@{field_1}:[VECTOR_RANGE 2.0 $vector_0]=>{{$YIELD_DISTANCE_AS: distance_0}} | @{field_2}:[VECTOR_RANGE 2.0 $vector_1]=>{{$YIELD_DISTANCE_AS: distance_1}} SCORER TFIDF DIALECT 2 APPLY (2 - @distance_0)/2 AS score_0 APPLY (2 - @distance_1)/2 AS score_1 APPLY @score_0 * {weight_1} + @score_1 * {weight_2} AS combined_score SORTBY 2 @combined_score DESC MAX 10" ) - # if a single dtype is passed it is applied to all vectors - multi_vector_query = MultiVectorQuery( - vectors=[sample_vector_2, sample_vector_3], - vector_field_names=["text embedding", "image embedding"], - dtypes=["float16"], - ) - - assert multi_vector_query._dtypes == ["float16", "float16"] - assert ( - str(multi_vector_query) - == f"@{field_1}:[VECTOR_RANGE 2.0 $vector_0]=>{{$YIELD_DISTANCE_AS: distance_0}} | @{field_2}:[VECTOR_RANGE 2.0 $vector_1]=>{{$YIELD_DISTANCE_AS: distance_1}} SCORER TFIDF DIALECT 2 APPLY (2 - @distance_0)/2 AS score_0 APPLY (2 - @distance_1)/2 AS score_1 APPLY @score_0 * 1.0 + @score_1 * 1.0 AS combined_score SORTBY 2 @combined_score DESC MAX 10" - ) +def test_vector_object_validation(): + # test an error is raised if none of the field names are present + with pytest.raises(ValueError): + _ = Vector() -def test_multi_vector_query_errors(): - # test an error is raised if the number of vectors and number of fields don't match with pytest.raises(ValueError): - _ = MultiVectorQuery( - vectors=[sample_vector, sample_vector_2, sample_vector_3], - vector_field_names=["text embedding", "image embedding"], + _ = Vector( + vector=[], + field_name=[], ) + # test an error is raised if the type of vector or fields are incorrect + # no list of list of floats with 
pytest.raises(ValueError): - _ = MultiVectorQuery( - vectors=[sample_vector, sample_vector_2], - vector_field_names=["text embedding", "image embedding", "features"], + _ = Vector( + vector=[sample_vector, sample_vector_2, sample_vector_3], + field_name="text embedding", ) - # test an error is raised if the number of weights is incorrect + # no list as field name with pytest.raises(ValueError): - _ = MultiVectorQuery( - vectors=[sample_vector, sample_vector_2], - vector_field_names=["text embedding", "image embedding"], - weights=[0.1, 0.2, 0.3], + _ = Vector( + vector=sample_vector, + field_name=["text embedding", "image embedding", "features"], ) - # test an error is raised if none of the field names are present + # dtype must be one of the supported values with pytest.raises(ValueError): - _ = MultiVectorQuery( - vectors=[], - vector_field_names=[], - ) + _ = Vector(vector=sample_vector, field_name="text embedding", dtype="float") + + with pytest.raises(ValueError): + _ = Vector(vector=sample_vector, field_name="text embedding", dtype="normal") + + with pytest.raises(ValueError): + _ = Vector(vector=sample_vector, field_name="text embedding", dtype="") + + for dtype in ["bfloat16", "float16", "float32", "float64", "int8", "uint8"]: + vec = Vector(vector=sample_vector, field_name="text embedding", dtype=dtype) + assert isinstance(vec, Vector) From 2d58f6a3c06e1425c3b40673a00055de04ffa4d5 Mon Sep 17 00:00:00 2001 From: Justin Cechmanek Date: Tue, 14 Oct 2025 14:35:28 -0700 Subject: [PATCH 09/12] updates sphynx docs to include Vector class --- docs/api/index.md | 1 + docs/api/query.rst | 14 ++++++++++++++ docs/api/vector.rst | 17 +++++++++++++++++ 3 files changed, 32 insertions(+) create mode 100644 docs/api/vector.rst diff --git a/docs/api/index.md b/docs/api/index.md index 5b7b6261..f7c1c661 100644 --- a/docs/api/index.md +++ b/docs/api/index.md @@ -15,6 +15,7 @@ Reference documentation for the RedisVL API. 
schema searchindex +vector query filter vectorizer diff --git a/docs/api/query.rst b/docs/api/query.rst index fa92230e..c2ba04f9 100644 --- a/docs/api/query.rst +++ b/docs/api/query.rst @@ -88,3 +88,17 @@ CountQuery :inherited-members: :show-inheritance: :exclude-members: add_filter,get_args,highlight,return_field,summarize + + + +MultiVectorQuery +========== + +.. currentmodule:: redisvl.query + + +.. autoclass:: MultiVectorQuery + :members: + :inherited-members: + :show-inheritance: + :exclude-members: add_filter,get_args,highlight,return_field,summarize diff --git a/docs/api/vector.rst b/docs/api/vector.rst new file mode 100644 index 00000000..9d28d9cc --- /dev/null +++ b/docs/api/vector.rst @@ -0,0 +1,17 @@ + +***** +Vector +***** + +The Vector class in RedisVL is a container that encapsulates a numerical vector, its datatype, corresponding index field name, and optional importance weight. It is used when constructing multi-vector queries using the MultiVectorQuery class. + + +Vector +=========== + +.. currentmodule:: redisvl.query + + +.. autoclass:: Vector + :members: + :exclude-members: From f29c8f8d89b3f406416c5e9021977584e03f05bb Mon Sep 17 00:00:00 2001 From: Justin Cechmanek Date: Tue, 14 Oct 2025 16:23:54 -0700 Subject: [PATCH 10/12] fixes typo --- redisvl/query/aggregate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/redisvl/query/aggregate.py b/redisvl/query/aggregate.py index d3e89a25..77f78b3f 100644 --- a/redisvl/query/aggregate.py +++ b/redisvl/query/aggregate.py @@ -336,7 +336,7 @@ def __init__( if not all([isinstance(v, Vector) for v in self._vectors]): raise TypeError( - "vector arugment must be a Vector object or list of Vector objects." + "vector argument must be a Vector object or list of Vector objects." 
) query_string = self._build_query_string() From a544f397ac24b617f5f33f467b38432e48e0c12d Mon Sep 17 00:00:00 2001 From: Justin Cechmanek Date: Tue, 14 Oct 2025 16:49:00 -0700 Subject: [PATCH 11/12] removes unused parameter --- redisvl/query/aggregate.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/redisvl/query/aggregate.py b/redisvl/query/aggregate.py index 77f78b3f..a3a31e05 100644 --- a/redisvl/query/aggregate.py +++ b/redisvl/query/aggregate.py @@ -309,7 +309,6 @@ def __init__( return_fields: Optional[List[str]] = None, filter_expression: Optional[Union[str, FilterExpression]] = None, num_results: int = 10, - return_score: bool = False, dialect: int = 2, ): """ @@ -321,8 +320,6 @@ def __init__( filter_expression (Optional[Union[str, FilterExpression]]): The filter expression to use. Defaults to None. num_results (int, optional): The number of results to return. Defaults to 10. - return_score (bool): Whether to return the combined vector similarity score. - Defaults to False. dialect (int, optional): The Redis dialect version. Defaults to 2. 
""" From 951b075c4caeee49ed10ebbdf2772434fb208b15 Mon Sep 17 00:00:00 2001 From: Justin Cechmanek <165097110+justin-cechmanek@users.noreply.github.com> Date: Tue, 14 Oct 2025 16:50:43 -0700 Subject: [PATCH 12/12] fixes typo Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tests/unit/test_aggregation_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_aggregation_types.py b/tests/unit/test_aggregation_types.py index 4d3b18e2..f2b6be86 100644 --- a/tests/unit/test_aggregation_types.py +++ b/tests/unit/test_aggregation_types.py @@ -224,7 +224,7 @@ def test_multi_vector_query(): assert multivector_query._loadfields == [] assert multivector_query._dialect == 2 - # test we can initialize with mutliple Vectors + # test we can initialize with multiple Vectors vectors = [sample_vector, sample_vector_2, sample_vector_3, sample_vector_4] vector_field_names = ["field_1", "field_2", "field_3", "field_4"] weights = [0.2, 0.5, 0.6, 0.1]