diff --git a/docs/reference/query-dsl/script-score-query.asciidoc b/docs/reference/query-dsl/script-score-query.asciidoc index 6b7411667f286..7eae5e54ddfae 100644 --- a/docs/reference/query-dsl/script-score-query.asciidoc +++ b/docs/reference/query-dsl/script-score-query.asciidoc @@ -11,7 +11,6 @@ a function to be used to compute a new score for each document returned by the query. For more information on scripting see <>. - Here is an example of using `script_score` to assign each matched document a score equal to the number of likes divided by 10: @@ -32,7 +31,6 @@ GET /_search } -------------------------------------------------- // CONSOLE -// TEST[setup:twitter] NOTE: The values returned from `script_score` cannot be negative. In general, Lucene requires the scores produced by queries to be non-negative in order to @@ -76,140 +74,6 @@ to be the most efficient by using the internal mechanisms. -------------------------------------------------- // NOTCONSOLE -[role="xpack"] -[testenv="basic"] -[[vector-functions]] -===== Functions for vector fields - -experimental[] - -These functions are used for -for <> and -<> fields. - -NOTE: During vector functions' calculation, all matched documents are -linearly scanned. Thus, expect the query time grow linearly -with the number of matched documents. For this reason, we recommend -to limit the number of matched documents with a `query` parameter. - -For dense_vector fields, `cosineSimilarity` calculates the measure of -cosine similarity between a given query vector and document vectors. - -[source,js] --------------------------------------------------- -{ - "query": { - "script_score": { - "query": { - "match_all": {} - }, - "script": { - "source": "cosineSimilarity(params.query_vector, doc['my_dense_vector']) + 1.0", <1> - "params": { - "query_vector": [4, 3.4, -0.2] <2> - } - } - } - } -} --------------------------------------------------- -// NOTCONSOLE -<1> The script adds 1.0 to the cosine similarity to prevent the score from being negative. -<2> To take advantage of the script optimizations, provide a query vector as a script parameter. - -Similarly, for sparse_vector fields, `cosineSimilaritySparse` calculates cosine similarity -between a given query vector and document vectors. - -[source,js] --------------------------------------------------- -{ - "query": { - "script_score": { - "query": { - "match_all": {} - }, - "script": { - "source": "cosineSimilaritySparse(params.query_vector, doc['my_sparse_vector']) + 1.0", - "params": { - "query_vector": {"2": 0.5, "10" : 111.3, "50": -1.3, "113": 14.8, "4545": 156.0} - } - } - } - } -} --------------------------------------------------- -// NOTCONSOLE - -For dense_vector fields, `dotProduct` calculates the measure of -dot product between a given query vector and document vectors. - -[source,js] --------------------------------------------------- -{ - "query": { - "script_score": { - "query": { - "match_all": {} - }, - "script": { - "source": """ - double value = dotProduct(params.query_vector, doc['my_vector']); - return sigmoid(1, Math.E, -value); <1> - """, - "params": { - "query_vector": [4, 3.4, -0.2] - } - } - } - } -} --------------------------------------------------- -// NOTCONSOLE - -<1> Using the standard sigmoid function prevents scores from being negative. - -Similarly, for sparse_vector fields, `dotProductSparse` calculates dot product -between a given query vector and document vectors. 
- -[source,js] --------------------------------------------------- -{ - "query": { - "script_score": { - "query": { - "match_all": {} - }, - "script": { - "source": """ - double value = dotProductSparse(params.query_vector, doc['my_sparse_vector']); - return sigmoid(1, Math.E, -value); - """, - "params": { - "query_vector": {"2": 0.5, "10" : 111.3, "50": -1.3, "113": 14.8, "4545": 156.0} - } - } - } - } -} --------------------------------------------------- -// NOTCONSOLE - -NOTE: If a document doesn't have a value for a vector field on which -a vector function is executed, an error will be thrown. - -You can check if a document has a value for the field `my_vector` by -`doc['my_vector'].size() == 0`. Your overall script can look like this: - -[source,js] --------------------------------------------------- -"source": "doc['my_vector'].size() == 0 ? 0 : cosineSimilarity(params.queryVector, doc['my_vector'])" --------------------------------------------------- -// NOTCONSOLE - -NOTE: If a document's dense vector field has a number of dimensions -different from the query's vector, an error will be thrown. - - [[random-score-function]] ===== Random score function `random_score` function generates scores that are uniformly distributed @@ -323,6 +187,9 @@ You can read more about decay functions NOTE: Decay functions on dates are limited to dates in the default format and default time zone. Also calculations with `now` are not supported. +===== Functions for vector fields +<> are accessible through +the `script_score` query. ==== Faster alternatives Script Score Query calculates the score for every hit (matching document). @@ -422,5 +289,4 @@ through a script: Script Score query has equivalent <> that can be used in script. - - +include::{es-repo-dir}/vectors/vector-functions.asciidoc[] diff --git a/docs/reference/vectors/vector-functions.asciidoc b/docs/reference/vectors/vector-functions.asciidoc new file mode 100644 index 0000000000000..d08af2d03bfab --- /dev/null +++ b/docs/reference/vectors/vector-functions.asciidoc @@ -0,0 +1,279 @@ +[role="xpack"] +[testenv="basic"] +[[vector-functions]] +===== Functions for vector fields + +experimental[] + +These functions are used +for <> and +<> fields. + +NOTE: During vector functions' calculation, all matched documents are +linearly scanned. Thus, expect the query time to grow linearly +with the number of matched documents. For this reason, we recommend +limiting the number of matched documents with a `query` parameter. + +Let's create an index with the following mapping and index a couple +of documents into it. + +[source,js] +-------------------------------------------------- +PUT my_index +{ + "mappings": { + "properties": { + "my_dense_vector": { + "type": "dense_vector", + "dims": 3 + }, + "my_sparse_vector" : { + "type" : "sparse_vector" + } + } + } +} + +PUT my_index/_doc/1 +{ + "my_dense_vector": [0.5, 10, 6], + "my_sparse_vector": {"2": 1.5, "15" : 2, "50": -1.1, "4545": 1.1} +} + +PUT my_index/_doc/2 +{ + "my_dense_vector": [-0.5, 10, 10], + "my_sparse_vector": {"2": 2.5, "10" : 1.3, "55": -2.3, "113": 1.6} +} + +-------------------------------------------------- +// CONSOLE +// TESTSETUP + +For dense_vector fields, `cosineSimilarity` calculates the measure of +cosine similarity between a given query vector and document vectors.
+ +[source,js] +-------------------------------------------------- +GET my_index/_search +{ + "query": { + "script_score": { + "query": { + "match_all": {} + }, + "script": { + "source": "cosineSimilarity(params.query_vector, doc['my_dense_vector']) + 1.0", <1> + "params": { + "query_vector": [4, 3.4, -0.2] <2> + } + } + } + } +} +-------------------------------------------------- +// CONSOLE +<1> The script adds 1.0 to the cosine similarity to prevent the score from being negative. +<2> To take advantage of the script optimizations, provide a query vector as a script parameter. + +NOTE: If a document's dense vector field has a number of dimensions +different from the query's vector, an error will be thrown. + +Similarly, for sparse_vector fields, `cosineSimilaritySparse` calculates cosine similarity +between a given query vector and document vectors. + +[source,js] +-------------------------------------------------- +GET my_index/_search +{ + "query": { + "script_score": { + "query": { + "match_all": {} + }, + "script": { + "source": "cosineSimilaritySparse(params.query_vector, doc['my_sparse_vector']) + 1.0", + "params": { + "query_vector": {"2": 0.5, "10" : 111.3, "50": -1.3, "113": 14.8, "4545": 156.0} + } + } + } + } +} +-------------------------------------------------- +// CONSOLE + +For dense_vector fields, `dotProduct` calculates the +dot product between a given query vector and document vectors. + +[source,js] +-------------------------------------------------- +GET my_index/_search +{ + "query": { + "script_score": { + "query": { + "match_all": {} + }, + "script": { + "source": """ + double value = dotProduct(params.query_vector, doc['my_dense_vector']); + return sigmoid(1, Math.E, -value); <1> + """, + "params": { + "query_vector": [4, 3.4, -0.2] + } + } + } + } +} +-------------------------------------------------- +// CONSOLE + +<1> Using the standard sigmoid function prevents scores from being negative. + +Similarly, for sparse_vector fields, `dotProductSparse` calculates the dot product +between a given query vector and document vectors. + +[source,js] +-------------------------------------------------- +GET my_index/_search +{ + "query": { + "script_score": { + "query": { + "match_all": {} + }, + "script": { + "source": """ + double value = dotProductSparse(params.query_vector, doc['my_sparse_vector']); + return sigmoid(1, Math.E, -value); + """, + "params": { + "query_vector": {"2": 0.5, "10" : 111.3, "50": -1.3, "113": 14.8, "4545": 156.0} + } + } + } + } +} +-------------------------------------------------- +// CONSOLE + +For dense_vector fields, `l1norm` calculates L^1^ distance +(Manhattan distance) between a given query vector and +document vectors. + +[source,js] +-------------------------------------------------- +GET my_index/_search +{ + "query": { + "script_score": { + "query": { + "match_all": {} + }, + "script": { + "source": "1 / (1 + l1norm(params.queryVector, doc['my_dense_vector']))", <1> + "params": { + "queryVector": [4, 3.4, -0.2] + } + } + } + } +} +-------------------------------------------------- +// CONSOLE + +<1> Unlike `cosineSimilarity`, which represents similarity, `l1norm` and +`l2norm` shown below represent distances or differences. This means that +the more similar the vectors are, the lower the scores produced by +the `l1norm` and `l2norm` functions will be. +Thus, as we need more similar vectors to score higher, +we reversed the output from `l1norm` and `l2norm`.
Also, to avoid +division by 0 when a document vector matches the query exactly, +we added `1` in the denominator. + +For sparse_vector fields, `l1normSparse` calculates L^1^ distance +between a given query vector and document vectors. + +[source,js] +-------------------------------------------------- +GET my_index/_search +{ + "query": { + "script_score": { + "query": { + "match_all": {} + }, + "script": { + "source": "1 / (1 + l1normSparse(params.queryVector, doc['my_sparse_vector']))", + "params": { + "queryVector": {"2": 0.5, "10" : 111.3, "50": -1.3, "113": 14.8, "4545": 156.0} + } + } + } + } +} +-------------------------------------------------- +// CONSOLE + +For dense_vector fields, `l2norm` calculates L^2^ distance +(Euclidean distance) between a given query vector and +document vectors. + +[source,js] +-------------------------------------------------- +GET my_index/_search +{ + "query": { + "script_score": { + "query": { + "match_all": {} + }, + "script": { + "source": "1 / (1 + l2norm(params.queryVector, doc['my_dense_vector']))", + "params": { + "queryVector": [4, 3.4, -0.2] + } + } + } + } +} +-------------------------------------------------- +// CONSOLE + +Similarly, for sparse_vector fields, `l2normSparse` calculates L^2^ distance +between a given query vector and document vectors. + +[source,js] +-------------------------------------------------- +GET my_index/_search +{ + "query": { + "script_score": { + "query": { + "match_all": {} + }, + "script": { + "source": "1 / (1 + l2normSparse(params.queryVector, doc['my_sparse_vector']))", + "params": { + "queryVector": {"2": 0.5, "10" : 111.3, "50": -1.3, "113": 14.8, "4545": 156.0} + } + } + } + } +} +-------------------------------------------------- +// CONSOLE + +NOTE: If a document doesn't have a value for a vector field on which +a vector function is executed, an error will be thrown. + +You can check if a document has a value for the field `my_vector` by +`doc['my_vector'].size() == 0`. Your overall script can look like this: + +[source,js] +-------------------------------------------------- +"source": "doc['my_vector'].size() == 0 ? 
0 : cosineSimilarity(params.queryVector, doc['my_vector'])" +-------------------------------------------------- +// NOTCONSOLE diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/test/vectors/15_dense_vector_l1l2.yml b/x-pack/plugin/src/test/resources/rest-api-spec/test/vectors/15_dense_vector_l1l2.yml new file mode 100644 index 0000000000000..5845c17f5a080 --- /dev/null +++ b/x-pack/plugin/src/test/resources/rest-api-spec/test/vectors/15_dense_vector_l1l2.yml @@ -0,0 +1,102 @@ +setup: + - skip: + features: headers + version: " - 7.3.99" + reason: "l1norm and l2norm functions were added from 7.4" + + - do: + indices.create: + include_type_name: false + index: test-index + body: + settings: + number_of_replicas: 0 + mappings: + properties: + my_dense_vector: + type: dense_vector + dims: 5 + - do: + index: + index: test-index + id: 1 + body: + my_dense_vector: [230.0, 300.33, -34.8988, 15.555, -200.0] + + - do: + index: + index: test-index + id: 2 + body: + my_dense_vector: [-0.5, 100.0, -13, 14.8, -156.0] + + - do: + index: + index: test-index + id: 3 + body: + my_dense_vector: [0.5, 111.3, -13.0, 14.8, -156.0] + + - do: + indices.refresh: {} + + +--- +"L1 norm": + - do: + headers: + Content-Type: application/json + search: + rest_total_hits_as_int: true + body: + query: + script_score: + query: {match_all: {} } + script: + source: "l1norm(params.query_vector, doc['my_dense_vector'])" + params: + query_vector: [0.5, 111.3, -13.0, 14.8, -156.0] + + - match: {hits.total: 3} + + - match: {hits.hits.0._id: "1"} + - gte: {hits.hits.0._score: 485.18} + - lte: {hits.hits.0._score: 485.19} + + - match: {hits.hits.1._id: "2"} + - gte: {hits.hits.1._score: 12.29} + - lte: {hits.hits.1._score: 12.30} + + - match: {hits.hits.2._id: "3"} + - gte: {hits.hits.2._score: 0.00} + - lte: {hits.hits.2._score: 0.01} + +--- +"L2 norm": + - do: + headers: + Content-Type: application/json + search: + rest_total_hits_as_int: true + body: + query: + script_score: + query: {match_all: {} } + script: + source: "l2norm(params.query_vector, doc['my_dense_vector'])" + params: + query_vector: [0.5, 111.3, -13.0, 14.8, -156.0] + + - match: {hits.total: 3} + + - match: {hits.hits.0._id: "1"} + - gte: {hits.hits.0._score: 301.36} + - lte: {hits.hits.0._score: 301.37} + + - match: {hits.hits.1._id: "2"} + - gte: {hits.hits.1._score: 11.34} + - lte: {hits.hits.1._score: 11.35} + + - match: {hits.hits.2._id: "3"} + - gte: {hits.hits.2._score: 0.00} + - lte: {hits.hits.2._score: 0.01} diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/test/vectors/35_sparse_vector_l1l2.yml b/x-pack/plugin/src/test/resources/rest-api-spec/test/vectors/35_sparse_vector_l1l2.yml new file mode 100644 index 0000000000000..05d210df7578a --- /dev/null +++ b/x-pack/plugin/src/test/resources/rest-api-spec/test/vectors/35_sparse_vector_l1l2.yml @@ -0,0 +1,101 @@ +setup: + - skip: + features: headers + version: " - 7.3.99" + reason: "l1norm and l2norm functions were added from 7.4" + + - do: + indices.create: + include_type_name: false + index: test-index + body: + settings: + number_of_replicas: 0 + mappings: + properties: + my_sparse_vector: + type: sparse_vector + - do: + index: + index: test-index + id: 1 + body: + my_sparse_vector: {"2": 230.0, "10" : 300.33, "50": -34.8988, "113": 15.555, "4545": -200.0} + + - do: + index: + index: test-index + id: 2 + body: + my_sparse_vector: {"2": -0.5, "10" : 100.0, "50": -13, "113": 14.8, "4545": -156.0} + + - do: + index: + index: test-index + id: 3 + body: + my_sparse_vector: {"2": 0.5, "10" : 
111.3, "50": -13.0, "113": 14.8, "4545": -156.0} + + - do: + indices.refresh: {} + +--- +"L1 norm": + - do: + headers: + Content-Type: application/json + search: + rest_total_hits_as_int: true + body: + query: + script_score: + query: {match_all: {} } + script: + source: "l1normSparse(params.query_vector, doc['my_sparse_vector'])" + params: + query_vector: {"2": 0.5, "10" : 111.3, "50": -13.0, "113": 14.8, "4545": -156.0} + + - match: {hits.total: 3} + + - match: {hits.hits.0._id: "1"} + - gte: {hits.hits.0._score: 485.18} + - lte: {hits.hits.0._score: 485.19} + + - match: {hits.hits.1._id: "2"} + - gte: {hits.hits.1._score: 12.29} + - lte: {hits.hits.1._score: 12.30} + + - match: {hits.hits.2._id: "3"} + - gte: {hits.hits.2._score: 0.00} + - lte: {hits.hits.2._score: 0.01} + + +--- +"L2 norm": + - do: + headers: + Content-Type: application/json + search: + rest_total_hits_as_int: true + body: + query: + script_score: + query: {match_all: {} } + script: + source: "l2normSparse(params.query_vector, doc['my_sparse_vector'])" + params: + query_vector: {"2": 0.5, "10" : 111.3, "50": -13.0, "113": 14.8, "4545": -156.0} + + - match: {hits.total: 3} + + - match: {hits.hits.0._id: "1"} + - gte: {hits.hits.0._score: 301.36} + - lte: {hits.hits.0._score: 301.37} + + - match: {hits.hits.1._id: "2"} + - gte: {hits.hits.1._score: 11.34} + - lte: {hits.hits.1._score: 11.35} + + - match: {hits.hits.2._id: "3"} + - gte: {hits.hits.2._score: 0.00} + - lte: {hits.hits.2._score: 0.01} diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/test/vectors/40_sparse_vector_special_cases.yml b/x-pack/plugin/src/test/resources/rest-api-spec/test/vectors/40_sparse_vector_special_cases.yml index 7137afef0f40c..396d144aecee5 100644 --- a/x-pack/plugin/src/test/resources/rest-api-spec/test/vectors/40_sparse_vector_special_cases.yml +++ b/x-pack/plugin/src/test/resources/rest-api-spec/test/vectors/40_sparse_vector_special_cases.yml @@ -220,3 +220,139 @@ setup: params: query_vector: [0.5, 111] - match: { error.root_cause.0.type: "script_exception" } + +--- +"Query vector has different dimensions from documents' vectors": +- do: + index: + index: test-index + id: 1 + body: + my_sparse_vector: {"1": 10} + +- do: + index: + index: test-index + id: 2 + body: + my_sparse_vector: {"1": 10, "10" : 10.5} + +- do: + index: + index: test-index + id: 3 + body: + my_sparse_vector: {"1": 10, "10" : 10.5, "100": 100.5} + +- do: + indices.refresh: {} + +- do: + headers: + Content-Type: application/json + search: + rest_total_hits_as_int: true + body: + query: + script_score: + query: {match_all: {} } + script: + source: "dotProductSparse(params.query_vector, doc['my_sparse_vector'])" + params: + query_vector: {"1": 10, "5": 5} + +- match: {hits.total: 3} + +- match: {hits.hits.0._id: "1"} +- gte: {hits.hits.0._score: 99.99} +- lte: {hits.hits.0._score: 100.01} + +- match: {hits.hits.1._id: "2"} +- gte: {hits.hits.1._score: 99.99} +- lte: {hits.hits.1._score: 100.01} + +- match: {hits.hits.2._id: "3"} +- gte: {hits.hits.2._score: 99.99} +- lte: {hits.hits.2._score: 100.01} + + +- do: + headers: + Content-Type: application/json + search: + rest_total_hits_as_int: true + body: + query: + script_score: + query: {match_all: {} } + script: + source: "cosineSimilaritySparse(params.query_vector, doc['my_sparse_vector'])" + params: + query_vector: {"1": 10, "5" : 5} + +- match: {hits.total: 3} + +- match: {hits.hits.0._id: "1"} +- gte: {hits.hits.0._score: 0.894} +- lte: {hits.hits.0._score: 0.895} + +- match: {hits.hits.1._id: "2"} +-
gte: {hits.hits.1._score: 0.61} +- lte: {hits.hits.1._score: 0.62} + +- match: {hits.hits.2._id: "3"} +- gte: {hits.hits.2._score: 0.08} +- lte: {hits.hits.2._score: 0.09} + +- do: + headers: + Content-Type: application/json + search: + rest_total_hits_as_int: true + body: + query: + script_score: + query: {match_all: {} } + script: + source: "l1normSparse(params.query_vector, doc['my_sparse_vector'])" + params: + query_vector: {"1": 10, "5": 5} + +- match: {hits.total: 3} + +- match: {hits.hits.0._id: "3"} +- match: {hits.hits.0._score: 116} + +- match: {hits.hits.1._id: "2"} +- match: {hits.hits.1._score: 15.5} + +- match: {hits.hits.2._id: "1"} +- match: {hits.hits.2._score: 5} + +- do: + headers: + Content-Type: application/json + search: + rest_total_hits_as_int: true + body: + query: + script_score: + query: {match_all: {} } + script: + source: "l2normSparse(params.query_vector, doc['my_sparse_vector'])" + params: + query_vector: {"1": 10, "5": 5} + +- match: {hits.total: 3} + +- match: {hits.hits.0._id: "3"} +- gte: {hits.hits.0._score: 101.17} +- lte: {hits.hits.0._score: 101.18} + +- match: {hits.hits.1._id: "2"} +- gte: {hits.hits.1._score: 11.62} +- lte: {hits.hits.1._score: 11.63} + +- match: {hits.hits.2._id: "1"} +- gte: {hits.hits.2._score: 5.0} +- lte: {hits.hits.2._score: 5.0} diff --git a/x-pack/plugin/vectors/src/main/java/org/elasticsearch/xpack/vectors/query/ScoreScriptUtils.java b/x-pack/plugin/vectors/src/main/java/org/elasticsearch/xpack/vectors/query/ScoreScriptUtils.java index 34a1ae2c12a3c..10631aba4ce2d 100644 --- a/x-pack/plugin/vectors/src/main/java/org/elasticsearch/xpack/vectors/query/ScoreScriptUtils.java +++ b/x-pack/plugin/vectors/src/main/java/org/elasticsearch/xpack/vectors/query/ScoreScriptUtils.java @@ -20,6 +20,52 @@ public class ScoreScriptUtils { //**************FUNCTIONS FOR DENSE VECTORS + /** + * Calculate l1 norm - Manhattan distance + * between a query's dense vector and documents' dense vectors + * + * @param queryVector the query vector parsed as {@code List} from json + * @param dvs VectorScriptDocValues representing encoded documents' vectors + */ + public static double l1norm(List queryVector, VectorScriptDocValues.DenseVectorScriptDocValues dvs){ + BytesRef value = dvs.getEncodedValue(); + float[] docVector = VectorEncoderDecoder.decodeDenseVector(value); + if (queryVector.size() != docVector.length) { + throw new IllegalArgumentException("Can't calculate l1norm! The number of dimensions of the query vector [" + + queryVector.size() + "] is different from the documents' vectors [" + docVector.length + "]."); + } + Iterator queryVectorIter = queryVector.iterator(); + double l1norm = 0; + for (int dim = 0; dim < docVector.length; dim++){ + l1norm += Math.abs(queryVectorIter.next().doubleValue() - docVector[dim]); + } + return l1norm; + } + + /** + * Calculate l2 norm - Euclidean distance + * between a query's dense vector and documents' dense vectors + * + * @param queryVector the query vector parsed as {@code List} from json + * @param dvs VectorScriptDocValues representing encoded documents' vectors + */ + public static double l2norm(List queryVector, VectorScriptDocValues.DenseVectorScriptDocValues dvs){ + BytesRef value = dvs.getEncodedValue(); + float[] docVector = VectorEncoderDecoder.decodeDenseVector(value); + if (queryVector.size() != docVector.length) { + throw new IllegalArgumentException("Can't calculate l2norm! 
The number of dimensions of the query vector [" + + queryVector.size() + "] is different from the documents' vectors [" + docVector.length + "]."); + } + Iterator queryVectorIter = queryVector.iterator(); + double l2norm = 0; + for (int dim = 0; dim < docVector.length; dim++){ + double diff = queryVectorIter.next().doubleValue() - docVector[dim]; + l2norm += diff * diff; + } + return Math.sqrt(l2norm); + } + + /** * Calculate a dot product between a query's dense vector and documents' dense vectors * @@ -92,25 +138,17 @@ private static double intDotProduct(List v1, float[] v2){ //**************FUNCTIONS FOR SPARSE VECTORS - /** - * Calculate a dot product between a query's sparse vector and documents' sparse vectors - * - * DotProductSparse is implemented as a class to use - * painless script caching to prepare queryVector - * only once per script execution for all documents. - * A user will call `dotProductSparse(params.queryVector, doc['my_vector'])` - */ - public static final class DotProductSparse { + public static class VectorSparseFunctions { final double[] queryValues; final int[] queryDims; // prepare queryVector once per script execution // queryVector represents a map of dimensions to values - public DotProductSparse(Map queryVector) { + public VectorSparseFunctions(Map queryVector) { //break vector into two arrays dims and values int n = queryVector.size(); - queryDims = new int[n]; queryValues = new double[n]; + queryDims = new int[n]; int i = 0; for (Map.Entry dimValue : queryVector.entrySet()) { try { @@ -124,6 +162,115 @@ public DotProductSparse(Map queryVector) { // Sort dimensions in the ascending order and sort values in the same order as their corresponding dimensions sortSparseDimsDoubleValues(queryDims, queryValues, n); } + } + + /** + * Calculate l1 norm - Manhattan distance + * between a query's sparse vector and documents' sparse vectors + * + * L1NormSparse is implemented as a class to use + * painless script caching to prepare queryVector + * only once per script execution for all documents. 
+ * A user will call `l1normSparse(params.queryVector, doc['my_vector'])` + */ + public static final class L1NormSparse extends VectorSparseFunctions { + public L1NormSparse(Map queryVector) { + super(queryVector); + } + + public double l1normSparse(VectorScriptDocValues.SparseVectorScriptDocValues dvs) { + BytesRef value = dvs.getEncodedValue(); + int[] docDims = VectorEncoderDecoder.decodeSparseVectorDims(value); + float[] docValues = VectorEncoderDecoder.decodeSparseVector(value); + int queryIndex = 0; + int docIndex = 0; + double l1norm = 0; + while (queryIndex < queryDims.length && docIndex < docDims.length) { + if (queryDims[queryIndex] == docDims[docIndex]) { + l1norm += Math.abs(queryValues[queryIndex] - docValues[docIndex]); + queryIndex++; + docIndex++; + } else if (queryDims[queryIndex] > docDims[docIndex]) { + l1norm += Math.abs(docValues[docIndex]); // 0 for missing query dim + docIndex++; + } else { + l1norm += Math.abs(queryValues[queryIndex]); // 0 for missing doc dim + queryIndex++; + } + } + while (queryIndex < queryDims.length) { + l1norm += Math.abs(queryValues[queryIndex]); // 0 for missing doc dim + queryIndex++; + } + while (docIndex < docDims.length) { + l1norm += Math.abs(docValues[docIndex]); // 0 for missing query dim + docIndex++; + } + return l1norm; + } + } + + /** + * Calculate l2 norm - Euclidean distance + * between a query's sparse vector and documents' sparse vectors + * + * L2NormSparse is implemented as a class to use + * painless script caching to prepare queryVector + * only once per script execution for all documents. + * A user will call `l2normSparse(params.queryVector, doc['my_vector'])` + */ + public static final class L2NormSparse extends VectorSparseFunctions { + public L2NormSparse(Map queryVector) { + super(queryVector); + } + + public double l2normSparse(VectorScriptDocValues.SparseVectorScriptDocValues dvs) { + BytesRef value = dvs.getEncodedValue(); + int[] docDims = VectorEncoderDecoder.decodeSparseVectorDims(value); + float[] docValues = VectorEncoderDecoder.decodeSparseVector(value); + int queryIndex = 0; + int docIndex = 0; + double l2norm = 0; + while (queryIndex < queryDims.length && docIndex < docDims.length) { + if (queryDims[queryIndex] == docDims[docIndex]) { + double diff = queryValues[queryIndex] - docValues[docIndex]; + l2norm += diff * diff; + queryIndex++; + docIndex++; + } else if (queryDims[queryIndex] > docDims[docIndex]) { + double diff = docValues[docIndex]; // 0 for missing query dim + l2norm += diff * diff; + docIndex++; + } else { + double diff = queryValues[queryIndex]; // 0 for missing doc dim + l2norm += diff * diff; + queryIndex++; + } + } + while (queryIndex < queryDims.length) { + l2norm += queryValues[queryIndex] * queryValues[queryIndex]; // 0 for missing doc dims + queryIndex++; + } + while (docIndex < docDims.length) { + l2norm += docValues[docIndex]* docValues[docIndex]; // 0 for missing query dims + docIndex++; + } + return Math.sqrt(l2norm); + } + } + + /** + * Calculate a dot product between a query's sparse vector and documents' sparse vectors + * + * DotProductSparse is implemented as a class to use + * painless script caching to prepare queryVector + * only once per script execution for all documents. 
+ * A user will call `dotProductSparse(params.queryVector, doc['my_vector'])` + */ + public static final class DotProductSparse extends VectorSparseFunctions { + public DotProductSparse(Map queryVector) { + super(queryVector); + } public double dotProductSparse(VectorScriptDocValues.SparseVectorScriptDocValues dvs) { BytesRef value = dvs.getEncodedValue(); @@ -141,32 +288,16 @@ public double dotProductSparse(VectorScriptDocValues.SparseVectorScriptDocValues * only once per script execution for all documents. * A user will call `cosineSimilaritySparse(params.queryVector, doc['my_vector'])` */ - public static final class CosineSimilaritySparse { - final double[] queryValues; - final int[] queryDims; + public static final class CosineSimilaritySparse extends VectorSparseFunctions { final double queryVectorMagnitude; - // prepare queryVector once per script execution public CosineSimilaritySparse(Map queryVector) { - //break vector into two arrays dims and values - int n = queryVector.size(); - queryValues = new double[n]; - queryDims = new int[n]; + super(queryVector); double dotProduct = 0; - int i = 0; - for (Map.Entry dimValue : queryVector.entrySet()) { - try { - queryDims[i] = Integer.parseInt(dimValue.getKey()); - } catch (final NumberFormatException e) { - throw new IllegalArgumentException("Failed to parse a query vector dimension, it must be an integer!", e); - } - queryValues[i] = dimValue.getValue().doubleValue(); + for (int i = 0; i< queryDims.length; i++) { dotProduct += queryValues[i] * queryValues[i]; - i++; } this.queryVectorMagnitude = Math.sqrt(dotProduct); - // Sort dimensions in the ascending order and sort values in the same order as their corresponding dimensions - sortSparseDimsDoubleValues(queryDims, queryValues, n); } public double cosineSimilaritySparse(VectorScriptDocValues.SparseVectorScriptDocValues dvs) { diff --git a/x-pack/plugin/vectors/src/main/resources/org/elasticsearch/xpack/vectors/query/whitelist.txt b/x-pack/plugin/vectors/src/main/resources/org/elasticsearch/xpack/vectors/query/whitelist.txt index d385744e88fd5..252d4356f9ca1 100644 --- a/x-pack/plugin/vectors/src/main/resources/org/elasticsearch/xpack/vectors/query/whitelist.txt +++ b/x-pack/plugin/vectors/src/main/resources/org/elasticsearch/xpack/vectors/query/whitelist.txt @@ -11,8 +11,12 @@ class org.elasticsearch.xpack.vectors.query.VectorScriptDocValues$SparseVectorSc } static_import { + double l1norm(List, VectorScriptDocValues.DenseVectorScriptDocValues) from_class org.elasticsearch.xpack.vectors.query.ScoreScriptUtils + double l2norm(List, VectorScriptDocValues.DenseVectorScriptDocValues) from_class org.elasticsearch.xpack.vectors.query.ScoreScriptUtils double cosineSimilarity(List, VectorScriptDocValues.DenseVectorScriptDocValues) bound_to org.elasticsearch.xpack.vectors.query.ScoreScriptUtils$CosineSimilarity double dotProduct(List, VectorScriptDocValues.DenseVectorScriptDocValues) from_class org.elasticsearch.xpack.vectors.query.ScoreScriptUtils + double l1normSparse(Map, VectorScriptDocValues.SparseVectorScriptDocValues) bound_to org.elasticsearch.xpack.vectors.query.ScoreScriptUtils$L1NormSparse + double l2normSparse(Map, VectorScriptDocValues.SparseVectorScriptDocValues) bound_to org.elasticsearch.xpack.vectors.query.ScoreScriptUtils$L2NormSparse double dotProductSparse(Map, VectorScriptDocValues.SparseVectorScriptDocValues) bound_to org.elasticsearch.xpack.vectors.query.ScoreScriptUtils$DotProductSparse double cosineSimilaritySparse(Map, 
VectorScriptDocValues.SparseVectorScriptDocValues) bound_to org.elasticsearch.xpack.vectors.query.ScoreScriptUtils$CosineSimilaritySparse } \ No newline at end of file diff --git a/x-pack/plugin/vectors/src/test/java/org/elasticsearch/xpack/vectors/query/ScoreScriptUtilsTests.java b/x-pack/plugin/vectors/src/test/java/org/elasticsearch/xpack/vectors/query/ScoreScriptUtilsTests.java index 699a9b09fb537..f9bb87ece0ca8 100644 --- a/x-pack/plugin/vectors/src/test/java/org/elasticsearch/xpack/vectors/query/ScoreScriptUtilsTests.java +++ b/x-pack/plugin/vectors/src/test/java/org/elasticsearch/xpack/vectors/query/ScoreScriptUtilsTests.java @@ -12,6 +12,8 @@ import org.elasticsearch.xpack.vectors.query.ScoreScriptUtils.CosineSimilarity; import org.elasticsearch.xpack.vectors.query.ScoreScriptUtils.DotProductSparse; import org.elasticsearch.xpack.vectors.query.ScoreScriptUtils.CosineSimilaritySparse; +import org.elasticsearch.xpack.vectors.query.ScoreScriptUtils.L1NormSparse; +import org.elasticsearch.xpack.vectors.query.ScoreScriptUtils.L2NormSparse; import java.util.Arrays; import java.util.HashMap; @@ -20,6 +22,9 @@ import static org.elasticsearch.xpack.vectors.mapper.VectorEncoderDecoderTests.mockEncodeDenseVector; import static org.elasticsearch.xpack.vectors.query.ScoreScriptUtils.dotProduct; +import static org.elasticsearch.xpack.vectors.query.ScoreScriptUtils.l1norm; +import static org.elasticsearch.xpack.vectors.query.ScoreScriptUtils.l2norm; + import static org.hamcrest.Matchers.containsString; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -35,12 +40,20 @@ public void testDenseVectorFunctions() { // test dotProduct double result = dotProduct(queryVector, dvs); - assertEquals("dotProduct result is not equal to the expected value!", 65425.62, result, 0.1); + assertEquals("dotProduct result is not equal to the expected value!", 65425.626, result, 0.001); // test cosineSimilarity CosineSimilarity cosineSimilarity = new CosineSimilarity(queryVector); double result2 = cosineSimilarity.cosineSimilarity(dvs); - assertEquals("cosineSimilarity result is not equal to the expected value!", 0.78, result2, 0.1); + assertEquals("cosineSimilarity result is not equal to the expected value!", 0.790, result2, 0.001); + + // test l1Norm + double result3 = l1norm(queryVector, dvs); + assertEquals("l1norm result is not equal to the expected value!", 485.184, result3, 0.001); + + // test l2norm + double result4 = l2norm(queryVector, dvs); + assertEquals("l2norm result is not equal to the expected value!", 301.361, result4, 0.001); // test dotProduct fails when queryVector has wrong number of dims List invalidQueryVector = Arrays.asList(0.5, 111.3); @@ -52,6 +65,13 @@ public void testDenseVectorFunctions() { e = expectThrows(IllegalArgumentException.class, () -> cosineSimilarity2.cosineSimilarity(dvs)); assertThat(e.getMessage(), containsString("dimensions of the query vector [2] is different from the documents' vectors [5]")); + // test l1norm fails when queryVector has wrong number of dims + e = expectThrows(IllegalArgumentException.class, () -> l1norm(invalidQueryVector, dvs)); + assertThat(e.getMessage(), containsString("dimensions of the query vector [2] is different from the documents' vectors [5]")); + + // test l2norm fails when queryVector has wrong number of dims + e = expectThrows(IllegalArgumentException.class, () -> l2norm(invalidQueryVector, dvs)); + assertThat(e.getMessage(), containsString("dimensions of the query vector [2] is different from the documents' 
vectors [5]")); } public void testSparseVectorFunctions() { @@ -71,11 +91,95 @@ public void testSparseVectorFunctions() { // test dotProduct DotProductSparse docProductSparse = new DotProductSparse(queryVector); double result = docProductSparse.dotProductSparse(dvs); - assertEquals("dotProductSparse result is not equal to the expected value!", 65425.62, result, 0.1); + assertEquals("dotProductSparse result is not equal to the expected value!", 65425.626, result, 0.001); + + // test cosineSimilarity + CosineSimilaritySparse cosineSimilaritySparse = new CosineSimilaritySparse(queryVector); + double result2 = cosineSimilaritySparse.cosineSimilaritySparse(dvs); + assertEquals("cosineSimilaritySparse result is not equal to the expected value!", 0.790, result2, 0.001); + + // test l1norm + L1NormSparse l1Norm = new L1NormSparse(queryVector); + double result3 = l1Norm.l1normSparse(dvs); + assertEquals("l1normSparse result is not equal to the expected value!", 485.184, result3, 0.001); + + // test l2norm + L2NormSparse l2Norm = new L2NormSparse(queryVector); + double result4 = l2Norm.l2normSparse(dvs); + assertEquals("l2normSparse result is not equal to the expected value!", 301.361, result4, 0.001); + } + + public void testSparseVectorMissingDimensions1() { + // Document vector's biggest dimension > query vector's biggest dimension + int[] docVectorDims = {2, 10, 50, 113, 4545, 4546}; + float[] docVectorValues = {230.0f, 300.33f, -34.8988f, 15.555f, -200.0f, 11.5f}; + BytesRef encodedDocVector = VectorEncoderDecoder.encodeSparseVector(docVectorDims, docVectorValues, docVectorDims.length); + VectorScriptDocValues.SparseVectorScriptDocValues dvs = mock(VectorScriptDocValues.SparseVectorScriptDocValues.class); + when(dvs.getEncodedValue()).thenReturn(encodedDocVector); + Map queryVector = new HashMap() {{ + put("2", 0.5); + put("10", 111.3); + put("50", -13.0); + put("113", 14.8); + put("114", -20.5); + put("4545", -156.0); + }}; + + // test dotProduct + DotProductSparse docProductSparse = new DotProductSparse(queryVector); + double result = docProductSparse.dotProductSparse(dvs); + assertEquals("dotProductSparse result is not equal to the expected value!", 65425.626, result, 0.001); + + // test cosineSimilarity + CosineSimilaritySparse cosineSimilaritySparse = new CosineSimilaritySparse(queryVector); + double result2 = cosineSimilaritySparse.cosineSimilaritySparse(dvs); + assertEquals("cosineSimilaritySparse result is not equal to the expected value!", 0.786, result2, 0.001); + + // test l1norm + L1NormSparse l1Norm = new L1NormSparse(queryVector); + double result3 = l1Norm.l1normSparse(dvs); + assertEquals("l1normSparse result is not equal to the expected value!", 517.184, result3, 0.001); + + // test l2norm + L2NormSparse l2Norm = new L2NormSparse(queryVector); + double result4 = l2Norm.l2normSparse(dvs); + assertEquals("l2normSparse result is not equal to the expected value!", 302.277, result4, 0.001); + } + + public void testSparseVectorMissingDimensions2() { + // Document vector's biggest dimension < query vector's biggest dimension + int[] docVectorDims = {2, 10, 50, 113, 4545, 4546}; + float[] docVectorValues = {230.0f, 300.33f, -34.8988f, 15.555f, -200.0f, 11.5f}; + BytesRef encodedDocVector = VectorEncoderDecoder.encodeSparseVector(docVectorDims, docVectorValues, docVectorDims.length); + VectorScriptDocValues.SparseVectorScriptDocValues dvs = mock(VectorScriptDocValues.SparseVectorScriptDocValues.class); + when(dvs.getEncodedValue()).thenReturn(encodedDocVector); + Map queryVector = new 
HashMap() {{ + put("2", 0.5); + put("10", 111.3); + put("50", -13.0); + put("113", 14.8); + put("4545", -156.0); + put("4548", -20.5); + }}; + + // test dotProduct + DotProductSparse docProductSparse = new DotProductSparse(queryVector); + double result = docProductSparse.dotProductSparse(dvs); + assertEquals("dotProductSparse result is not equal to the expected value!", 65425.626, result, 0.001); // test cosineSimilarity CosineSimilaritySparse cosineSimilaritySparse = new CosineSimilaritySparse(queryVector); double result2 = cosineSimilaritySparse.cosineSimilaritySparse(dvs); - assertEquals("cosineSimilaritySparse result is not equal to the expected value!", 0.78, result2, 0.1); + assertEquals("cosineSimilaritySparse result is not equal to the expected value!", 0.786, result2, 0.001); + + // test l1norm + L1NormSparse l1Norm = new L1NormSparse(queryVector); + double result3 = l1Norm.l1normSparse(dvs); + assertEquals("l1normSparse result is not equal to the expected value!", 517.184, result3, 0.001); + + // test l2norm + L2NormSparse l2Norm = new L2NormSparse(queryVector); + double result4 = l2Norm.l2normSparse(dvs); + assertEquals("l2normSparse result is not equal to the expected value!", 302.277, result4, 0.001); } }
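A quick sanity check on the expected scores used above: the l1norm/l2norm bounds asserted for document 1 in 15_dense_vector_l1l2.yml (and the 485.184/301.361 values in ScoreScriptUtilsTests) follow directly from the Manhattan and Euclidean formulas. The standalone snippet below is not part of this change; it only reproduces those numbers.

[source,java]
--------------------------------------------------
// Standalone check (not part of this change): reproduces the expected l1norm/l2norm
// scores asserted for document 1 in 15_dense_vector_l1l2.yml and ScoreScriptUtilsTests.
public class DenseNormCheck {
    public static void main(String[] args) {
        double[] query = {0.5, 111.3, -13.0, 14.8, -156.0};
        double[] doc1  = {230.0, 300.33, -34.8988, 15.555, -200.0};
        double l1 = 0.0, l2 = 0.0;
        for (int dim = 0; dim < query.length; dim++) {
            double diff = query[dim] - doc1[dim];
            l1 += Math.abs(diff); // Manhattan distance: sum of absolute differences
            l2 += diff * diff;    // squared differences; Euclidean distance is the square root of their sum
        }
        // Prints: l1norm = 485.184, l2norm = 301.361
        System.out.printf("l1norm = %.3f, l2norm = %.3f%n", l1, Math.sqrt(l2));
    }
}
--------------------------------------------------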
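The sparse variants walk two dimension-sorted arrays in parallel and count a dimension that is present in only one of the two vectors as zero on the other side; that is where the 517.184 and 302.277 expectations in testSparseVectorMissingDimensions1/2 come from. The sketch below is a simplified standalone illustration under those assumptions (plain sorted arrays stand in for the encoded doc values; it is not the PR's L1NormSparse/L2NormSparse classes).

[source,java]
--------------------------------------------------
// Standalone sketch (not part of this change): dimension-sorted "merge walk" over a
// sparse query vector and a sparse document vector, treating a dimension that is
// missing on one side as zero there, as in testSparseVectorMissingDimensions1/2.
public class SparseNormCheck {
    public static void main(String[] args) {
        int[]    queryDims = {2, 10, 50, 113, 114, 4545};
        double[] queryVals = {0.5, 111.3, -13.0, 14.8, -20.5, -156.0};
        int[]    docDims   = {2, 10, 50, 113, 4545, 4546};
        double[] docVals   = {230.0, 300.33, -34.8988, 15.555, -200.0, 11.5};

        double l1 = 0.0, l2 = 0.0;
        int q = 0, d = 0;
        while (q < queryDims.length && d < docDims.length) {
            double diff;
            if (queryDims[q] == docDims[d]) {
                diff = queryVals[q++] - docVals[d++]; // dimension present in both vectors
            } else if (queryDims[q] < docDims[d]) {
                diff = queryVals[q++];                // dimension missing from the document
            } else {
                diff = docVals[d++];                  // dimension missing from the query
            }
            l1 += Math.abs(diff);
            l2 += diff * diff;
        }
        for (; q < queryDims.length; q++) { l1 += Math.abs(queryVals[q]); l2 += queryVals[q] * queryVals[q]; }
        for (; d < docDims.length; d++)   { l1 += Math.abs(docVals[d]);   l2 += docVals[d] * docVals[d];     }

        // Prints: l1normSparse = 517.184, l2normSparse = 302.277
        System.out.printf("l1normSparse = %.3f, l2normSparse = %.3f%n", l1, Math.sqrt(l2));
    }
}
--------------------------------------------------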