Skip to content

Commit 756c42f

Browse files
Add dims parameter to dense_vector mapping (#43444) (#43895)
Typically, dense vectors of both documents and queries must have the same number of dimensions. A different number of dimensions among documents or query vectors indicates an error. This PR enforces that all vectors for the same field have the same number of dimensions. It also enforces that query vectors have the same number of dimensions as the document vectors.
1 parent fb825a6 commit 756c42f

File tree

11 files changed

+221
-98
lines changed

11 files changed

+221
-98
lines changed

docs/reference/mapping/types/dense-vector.asciidoc

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,7 @@ experimental[]
77

88
A `dense_vector` field stores dense vectors of float values.
99
The maximum number of dimensions that can be in a vector should
10-
not exceed 1024. The number of dimensions can be
11-
different across documents. A `dense_vector` field is
12-
a single-valued field.
10+
not exceed 1024. A `dense_vector` field is a single-valued field.
1311

1412
These vectors can be used for <<vector-functions,document scoring>>.
1513
For example, a document score can represent a distance between
@@ -24,7 +22,8 @@ PUT my_index
2422
"mappings": {
2523
"properties": {
2624
"my_vector": {
27-
"type": "dense_vector"
25+
"type": "dense_vector",
26+
"dims": 3 <1>
2827
},
2928
"my_text" : {
3029
"type" : "keyword"
@@ -42,13 +41,14 @@ PUT my_index/_doc/1
4241
PUT my_index/_doc/2
4342
{
4443
"my_text" : "text2",
45-
"my_vector" : [-0.5, 10, 10, 4]
44+
"my_vector" : [-0.5, 10, 10]
4645
}
4746
4847
--------------------------------------------------
4948
// CONSOLE
5049

50+
<1> `dims` — the number of dimensions in the vector; this parameter is required.
51+
5152
Internally, each document's dense vector is encoded as a binary
5253
doc value. Its size in bytes is equal to
53-
`4 * NUMBER_OF_DIMENSIONS`, where `NUMBER_OF_DIMENSIONS` -
54-
number of the vector's dimensions.
54+
`4 * dims`, where `dims` is the number of the vector's dimensions.

docs/reference/migration/migrate_7_3.asciidoc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ coming[7.3.0]
2020

2121
[[breaking_73_mapping_changes]]
2222
=== Mapping changes
23+
The `dense_vector` field now requires the `dims` parameter, which specifies the number of
24+
dimensions for document and query vectors for this field.
2325

2426
[float]
2527
==== Defining multi-fields within multi-fields

docs/reference/query-dsl/script-score-query.asciidoc

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -186,8 +186,7 @@ a vector function is executed, 0 is returned as a result
186186
for this document.
187187

188188
NOTE: If a document's dense vector field has a number of dimensions
189-
different from the query's vector, 0 is used for missing dimensions
190-
in the calculations of vector functions.
189+
different from the query's vector, an error will be thrown.
191190

192191

193192
[[random-score-function]]

x-pack/plugin/src/test/resources/rest-api-spec/test/vectors/10_dense_vector_basic.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ setup:
22
- skip:
33
features: headers
44
version: " - 7.2.99"
5-
reason: "dense_vector functions were introduced in 7.3.0"
5+
reason: "dense_vector dims parameter was added from 7.3"
66

77
- do:
88
indices.create:
@@ -15,6 +15,7 @@ setup:
1515
properties:
1616
my_dense_vector:
1717
type: dense_vector
18+
dims: 5
1819
- do:
1920
index:
2021
index: test-index

x-pack/plugin/src/test/resources/rest-api-spec/test/vectors/20_dense_vector_special_cases.yml

Lines changed: 62 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ setup:
22
- skip:
33
features: headers
44
version: " - 7.2.99"
5-
reason: "dense_vector functions were introduced in 7.3.0"
5+
reason: "dense_vector dims parameter was added from 7.3"
66

77
- do:
88
indices.create:
@@ -17,31 +17,36 @@ setup:
1717
properties:
1818
my_dense_vector:
1919
type: dense_vector
20+
dims: 3
2021

2122

2223
---
23-
"Vectors of different dimensions and data types":
24-
# document vectors of different dimensions
24+
"Indexing of Dense vectors should error when dims don't match defined in the mapping":
25+
2526
- do:
27+
catch: bad_request
2628
index:
2729
index: test-index
2830
id: 1
2931
body:
30-
my_dense_vector: [10]
32+
my_dense_vector: [10, 2]
33+
- match: { error.type: "mapper_parsing_exception" }
3134

35+
---
36+
"Vectors of mixed integers and floats":
3237
- do:
3338
index:
3439
index: test-index
35-
id: 2
40+
id: 1
3641
body:
37-
my_dense_vector: [10, 10.5]
42+
my_dense_vector: [10, 10, 10]
3843

3944
- do:
4045
index:
4146
index: test-index
42-
id: 3
47+
id: 2
4348
body:
44-
my_dense_vector: [10, 10.5, 100.5]
49+
my_dense_vector: [10.9, 10.9, 10.9]
4550

4651
- do:
4752
indices.refresh: {}
@@ -59,14 +64,13 @@ setup:
5964
script:
6065
source: "cosineSimilarity(params.query_vector, doc['my_dense_vector'])"
6166
params:
62-
query_vector: [10]
67+
query_vector: [10, 10, 10]
6368

64-
- match: {hits.total: 3}
69+
- match: {hits.total: 2}
6570
- match: {hits.hits.0._id: "1"}
6671
- match: {hits.hits.1._id: "2"}
67-
- match: {hits.hits.2._id: "3"}
6872

69-
# query vector of type double
73+
# query vector of type float
7074
- do:
7175
headers:
7276
Content-Type: application/json
@@ -79,12 +83,52 @@ setup:
7983
script:
8084
source: "cosineSimilarity(params.query_vector, doc['my_dense_vector'])"
8185
params:
82-
query_vector: [10.0]
86+
query_vector: [10.0, 10.0, 10.0]
8387

84-
- match: {hits.total: 3}
88+
- match: {hits.total: 2}
8589
- match: {hits.hits.0._id: "1"}
8690
- match: {hits.hits.1._id: "2"}
87-
- match: {hits.hits.2._id: "3"}
91+
92+
93+
---
94+
"Functions with query vectors with dims different from docs vectors should error":
95+
- do:
96+
index:
97+
index: test-index
98+
id: 1
99+
body:
100+
my_dense_vector: [1, 2, 3]
101+
102+
- do:
103+
indices.refresh: {}
104+
105+
- do:
106+
catch: bad_request
107+
search:
108+
rest_total_hits_as_int: true
109+
body:
110+
query:
111+
script_score:
112+
query: {match_all: {} }
113+
script:
114+
source: "cosineSimilarity(params.query_vector, doc['my_dense_vector'])"
115+
params:
116+
query_vector: [1, 2, 3, 4]
117+
- match: { error.root_cause.0.type: "script_exception" }
118+
119+
- do:
120+
catch: bad_request
121+
search:
122+
rest_total_hits_as_int: true
123+
body:
124+
query:
125+
script_score:
126+
query: {match_all: {} }
127+
script:
128+
source: "dotProduct(params.query_vector, doc['my_dense_vector'])"
129+
params:
130+
query_vector: [1, 2, 3, 4]
131+
- match: { error.root_cause.0.type: "script_exception" }
88132

89133
---
90134
"Distance functions for documents missing vector field should return 0":
@@ -93,7 +137,7 @@ setup:
93137
index: test-index
94138
id: 1
95139
body:
96-
my_dense_vector: [10]
140+
my_dense_vector: [10, 10, 10]
97141

98142
- do:
99143
index:
@@ -117,7 +161,7 @@ setup:
117161
script:
118162
source: "cosineSimilarity(params.query_vector, doc['my_dense_vector'])"
119163
params:
120-
query_vector: [10.0]
164+
query_vector: [10.0, 10.0, 10.0]
121165

122166
- match: {hits.total: 2}
123167
- match: {hits.hits.0._id: "1"}
@@ -148,5 +192,5 @@ setup:
148192
script:
149193
source: "dotProductSparse(params.query_vector, doc['my_dense_vector'])"
150194
params:
151-
query_vector: {"2": 0.5, "10" : 111.3}
195+
query_vector: {"2": 0.5, "10" : 111.3, "3": 44}
152196
- match: { error.root_cause.0.type: "script_exception" }

x-pack/plugin/vectors/src/main/java/org/elasticsearch/xpack/vectors/mapper/DenseVectorFieldMapper.java

Lines changed: 54 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,11 @@
1212
import org.apache.lucene.index.IndexableField;
1313
import org.apache.lucene.search.DocValuesFieldExistsQuery;
1414
import org.apache.lucene.search.Query;
15-
import org.apache.lucene.util.ArrayUtil;
1615
import org.apache.lucene.util.BytesRef;
1716
import org.elasticsearch.common.settings.Settings;
17+
import org.elasticsearch.common.xcontent.XContentBuilder;
1818
import org.elasticsearch.common.xcontent.XContentParser.Token;
19+
import org.elasticsearch.common.xcontent.support.XContentMapValues;
1920
import org.elasticsearch.index.fielddata.IndexFieldData;
2021
import org.elasticsearch.index.mapper.ArrayValueMapperParser;
2122
import org.elasticsearch.index.mapper.FieldMapper;
@@ -56,12 +57,28 @@ public static class Defaults {
5657
}
5758

5859
public static class Builder extends FieldMapper.Builder<Builder, DenseVectorFieldMapper> {
60+
private int dims = 0;
5961

6062
public Builder(String name) {
6163
super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE);
6264
builder = this;
6365
}
6466

67+
public Builder dims(int dims) {
68+
if ((dims > MAX_DIMS_COUNT) || (dims < 1)) {
69+
throw new MapperParsingException("The number of dimensions for field [" + name +
70+
"] should be in the range [1, " + MAX_DIMS_COUNT + "]");
71+
}
72+
this.dims = dims;
73+
return this;
74+
}
75+
76+
@Override
77+
protected void setupFieldType(BuilderContext context) {
78+
super.setupFieldType(context);
79+
fieldType().setDims(dims);
80+
}
81+
6582
@Override
6683
public DenseVectorFieldType fieldType() {
6784
return (DenseVectorFieldType) super.fieldType();
@@ -80,11 +97,17 @@ public static class TypeParser implements Mapper.TypeParser {
8097
@Override
8198
public Mapper.Builder<?,?> parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException {
8299
DenseVectorFieldMapper.Builder builder = new DenseVectorFieldMapper.Builder(name);
83-
return builder;
100+
Object dimsField = node.remove("dims");
101+
if (dimsField == null) {
102+
throw new MapperParsingException("The [dims] property must be specified for field [" + name + "].");
103+
}
104+
int dims = XContentMapValues.nodeIntegerValue(dimsField);
105+
return builder.dims(dims);
84106
}
85107
}
86108

87109
public static final class DenseVectorFieldType extends MappedFieldType {
110+
private int dims;
88111

89112
public DenseVectorFieldType() {}
90113

@@ -96,6 +119,14 @@ public DenseVectorFieldType clone() {
96119
return new DenseVectorFieldType(this);
97120
}
98121

122+
int dims() {
123+
return dims;
124+
}
125+
126+
void setDims(int dims) {
127+
this.dims = dims;
128+
}
129+
99130
@Override
100131
public String typeName() {
101132
return CONTENT_TYPE;
@@ -145,28 +176,30 @@ public void parse(ParseContext context) throws IOException {
145176
if (context.externalValueSet()) {
146177
throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "] can't be used in multi-fields");
147178
}
179+
int dims = fieldType().dims(); //number of vector dimensions
148180

149181
// encode array of floats as array of integers and store into buf
150182
// this code is here and not in the VectorEncoderDecoder so as not to create extra arrays
151-
byte[] buf = new byte[0];
183+
byte[] buf = new byte[dims * INT_BYTES];
152184
int offset = 0;
153185
int dim = 0;
154186
for (Token token = context.parser().nextToken(); token != Token.END_ARRAY; token = context.parser().nextToken()) {
187+
if (dim++ >= dims) {
188+
throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "] of doc [" +
189+
context.sourceToParse().id() + "] has exceeded the number of dimensions [" + dims + "] defined in mapping");
190+
}
155191
ensureExpectedToken(Token.VALUE_NUMBER, token, context.parser()::getTokenLocation);
156192
float value = context.parser().floatValue(true);
157-
if (buf.length < (offset + INT_BYTES)) {
158-
buf = ArrayUtil.grow(buf, (offset + INT_BYTES));
159-
}
160193
int intValue = Float.floatToIntBits(value);
161-
buf[offset] = (byte) (intValue >> 24);
162-
buf[offset+1] = (byte) (intValue >> 16);
163-
buf[offset+2] = (byte) (intValue >> 8);
164-
buf[offset+3] = (byte) intValue;
165-
offset += INT_BYTES;
166-
if (dim++ >= MAX_DIMS_COUNT) {
167-
throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() +
168-
"] has exceeded the maximum allowed number of dimensions of [" + MAX_DIMS_COUNT + "]");
169-
}
194+
buf[offset++] = (byte) (intValue >> 24);
195+
buf[offset++] = (byte) (intValue >> 16);
196+
buf[offset++] = (byte) (intValue >> 8);
197+
buf[offset++] = (byte) intValue;
198+
}
199+
if (dim != dims) {
200+
throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "] of doc [" +
201+
context.sourceToParse().id() + "] has number of dimensions [" + dim +
202+
"] less than defined in the mapping [" + dims +"]");
170203
}
171204
BinaryDocValuesField field = new BinaryDocValuesField(fieldType().name(), new BytesRef(buf, 0, offset));
172205
if (context.doc().getByKey(fieldType().name()) != null) {
@@ -176,6 +209,12 @@ public void parse(ParseContext context) throws IOException {
176209
context.doc().addWithKey(fieldType().name(), field);
177210
}
178211

212+
@Override
213+
protected void doXContentBody(XContentBuilder builder, boolean includeDefaults, Params params) throws IOException {
214+
super.doXContentBody(builder, includeDefaults, params);
215+
builder.field("dims", fieldType().dims());
216+
}
217+
179218
@Override
180219
protected void parseCreateField(ParseContext context, List<IndexableField> fields) {
181220
throw new AssertionError("parse is implemented directly");

x-pack/plugin/vectors/src/main/java/org/elasticsearch/xpack/vectors/mapper/VectorEncoderDecoder.java

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -162,12 +162,11 @@ public static float[] decodeDenseVector(BytesRef vectorBR) {
162162
float[] vector = new float[dimCount];
163163
int offset = vectorBR.offset;
164164
for (int dim = 0; dim < dimCount; dim++) {
165-
int intValue = ((vectorBR.bytes[offset] & 0xFF) << 24) |
166-
((vectorBR.bytes[offset+1] & 0xFF) << 16) |
167-
((vectorBR.bytes[offset+2] & 0xFF) << 8) |
168-
(vectorBR.bytes[offset+3] & 0xFF);
165+
int intValue = ((vectorBR.bytes[offset++] & 0xFF) << 24) |
166+
((vectorBR.bytes[offset++] & 0xFF) << 16) |
167+
((vectorBR.bytes[offset++] & 0xFF) << 8) |
168+
(vectorBR.bytes[offset++] & 0xFF);
169169
vector[dim] = Float.intBitsToFloat(intValue);
170-
offset = offset + INT_BYTES;
171170
}
172171
return vector;
173172
}

0 commit comments

Comments
 (0)