-
Notifications
You must be signed in to change notification settings - Fork 25.6k
Description
Elasticsearch version (bin/elasticsearch --version):
7.1.1 (also reproduced on 6.7.2).
FYI, the bug does not exist in 6.5.4.
Plugins installed: No plugins.
JVM version (java -version):
openjdk version "1.8.0_212"
OpenJDK Runtime Environment (build 1.8.0_212-b04)
OpenJDK 64-Bit Server VM (build 25.212-b04, mixed mode)
OS version (uname -a if on a Unix-like system):
Linux bbdev6.local 3.10.0-957.12.1.el7.x86_64 #1 SMP Mon Apr 29 14:59:59 UTC 2019 x86_64 x86_64 x86_64 GNU/Linux
centos-release-7-6.1810.2.el7.centos.x86_64
Description of the problem including expected versus actual behavior:
When a synonym_graph filter is used together with a hunspell filter for Hebrew, and a match_phrase query is run against a field analyzed that way, the score of the query is for some reason much larger than expected, because terms from unrelated documents are pulled into the query.
Steps to reproduce:
Just copy/paste these commands (written in Kibana Console syntax rather than as raw curl calls):
DELETE test
{}
PUT test
{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0,
"analysis": {
"filter": {
"synonym_graph": {
"type": "synonym_graph",
"synonyms": [
],
"tokenizer": "keyword"
},
"he_IL": {
"locale": "he_IL",
"type": "hunspell",
"dedup": "true"
}
},
"analyzer": {
"hebrew_synonym": {
"filter": [
"synonym_graph",
"he_IL"
],
"tokenizer": "standard"
}
}
}
},
"mappings": {
"properties": {
"content": {
"fields": {
"language": {
"type": "text",
"analyzer": "hebrew_synonym"
}
},
"type": "text",
"analyzer": "standard"
}
}
}
}
POST test/_doc/1
{
"content": "מבואנוס"
}
POST test/_doc/2
{
"content": "מבואו"
}
POST test/_doc/3
{
"content": "מבוא לספר הזוהר"
}
POST test/_doc/4
{
"content": "מבוארות"
}
POST test/_doc/5
{
"content": "מבואה"
}
POST test/_doc/6
{
"content": "מבוארים"
}
POST test/_doc/7
{
"content": "בואקום"
}
POST test/_doc/8
{
"content": "בואינג"
}
POST test/_doc/9
{
"content": "בואהבת"
}
POST test/_doc/10
{
"content": "בואנו"
}
POST test/_doc/11
{
"content": "מבואסים"
}
POST test/_doc/12
{
"content": "בואם"
}
POST test/_doc/13
{
"content": "בואהבת"
}
GET test/_search
{
"explain": true,
"query": {
"match_phrase": {
"content.language": {
"query": "מבוא לספר הזוהר"
}
}
}
}
POST test/_close
PUT test/_settings
{
"analysis": {
"filter": {
"synonym_graph": {
"type": "synonym_graph",
"synonyms": [
"זוהר לעם,זוהר,ספר הזוהר,הזוהר"
],
"tokenizer": "keyword"
}
}
}
}
POST test/_open
GET test/_search
{
"explain": true,
"query": {
"match_phrase": {
"content.language": {
"query": "מבוא לספר הזוהר"
}
}
}
}
Provide logs (if relevant):
# DELETE test
{
"acknowledged" : true
}
# PUT test
{
"acknowledged" : true,
"shards_acknowledged" : true,
"index" : "test"
}
# POST test/_doc/1
{
"_index" : "test",
"_type" : "_doc",
"_id" : "1",
"_version" : 1,
"result" : "created",
"_shards" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"_seq_no" : 0,
"_primary_term" : 1
}
# POST test/_doc/2
{
"_index" : "test",
"_type" : "_doc",
"_id" : "2",
"_version" : 1,
"result" : "created",
"_shards" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"_seq_no" : 1,
"_primary_term" : 1
}
# POST test/_doc/3
{
"_index" : "test",
"_type" : "_doc",
"_id" : "3",
"_version" : 1,
"result" : "created",
"_shards" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"_seq_no" : 2,
"_primary_term" : 1
}
# POST test/_doc/4
{
"_index" : "test",
"_type" : "_doc",
"_id" : "4",
"_version" : 1,
"result" : "created",
"_shards" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"_seq_no" : 3,
"_primary_term" : 1
}
# POST test/_doc/5
{
"_index" : "test",
"_type" : "_doc",
"_id" : "5",
"_version" : 1,
"result" : "created",
"_shards" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"_seq_no" : 4,
"_primary_term" : 1
}
# POST test/_doc/6
{
"_index" : "test",
"_type" : "_doc",
"_id" : "6",
"_version" : 1,
"result" : "created",
"_shards" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"_seq_no" : 5,
"_primary_term" : 1
}
# POST test/_doc/7
{
"_index" : "test",
"_type" : "_doc",
"_id" : "7",
"_version" : 1,
"result" : "created",
"_shards" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"_seq_no" : 6,
"_primary_term" : 1
}
# POST test/_doc/8
{
"_index" : "test",
"_type" : "_doc",
"_id" : "8",
"_version" : 1,
"result" : "created",
"_shards" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"_seq_no" : 7,
"_primary_term" : 1
}
# POST test/_doc/9
{
"_index" : "test",
"_type" : "_doc",
"_id" : "9",
"_version" : 1,
"result" : "created",
"_shards" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"_seq_no" : 8,
"_primary_term" : 1
}
# POST test/_doc/10
{
"_index" : "test",
"_type" : "_doc",
"_id" : "10",
"_version" : 1,
"result" : "created",
"_shards" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"_seq_no" : 9,
"_primary_term" : 1
}
# POST test/_doc/11
{
"_index" : "test",
"_type" : "_doc",
"_id" : "11",
"_version" : 1,
"result" : "created",
"_shards" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"_seq_no" : 10,
"_primary_term" : 1
}
# POST test/_doc/12
{
"_index" : "test",
"_type" : "_doc",
"_id" : "12",
"_version" : 1,
"result" : "created",
"_shards" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"_seq_no" : 11,
"_primary_term" : 1
}
# POST test/_doc/13
{
"_index" : "test",
"_type" : "_doc",
"_id" : "13",
"_version" : 1,
"result" : "created",
"_shards" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"_seq_no" : 12,
"_primary_term" : 1
}
# GET test/_search
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 8.552871,
"hits" : [
{
"_shard" : "[test][0]",
"_node" : "76xQgsOsSz6OqfkSZmsVQw",
"_index" : "test",
"_type" : "_doc",
"_id" : "3",
"_score" : 8.552871,
"_source" : {
"content" : "מבוא לספר הזוהר"
},
"_explanation" : {
"value" : 8.552871,
"description" : """weight(content.language:"(מבוא בוא) ספר זוהר" in 2) [PerFieldSimilarity], result of:""",
"details" : [
.......
]
}
}
]
}
}
# POST test/_close
{
"acknowledged" : true
}
# PUT test/_settings
{
"acknowledged" : true
}
# POST test/_open
{
"acknowledged" : true,
"shards_acknowledged" : true
}
# GET test/_search
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 38.78237,
"hits" : [
{
"_shard" : "[test][0]",
"_node" : "76xQgsOsSz6OqfkSZmsVQw",
"_index" : "test",
"_type" : "_doc",
"_id" : "3",
"_score" : 38.78237,
"_source" : {
"content" : "מבוא לספר הזוהר"
},
"_explanation" : {
"value" : 38.78237,
"description" : "weight(spanNear([spanOr([spanOr([content.language:מבואנוס, content.language:מבואו, content.language:מבוארות, content.language:מבואה, content.language:מבואסים, content.language:מבוארים, content.language:מבוא]), spanOr([content.language:בוא, content.language:בואנו, content.language:בואה, content.language:בואם, content.language:בואינג, content.language:בואו, content.language:בואקום, content.language:בואהבת])]), content.language:ספר, spanOr([spanNear([content.language:זוהר, content.language:עם], 0, true), content.language:זוהר, spanNear([content.language:ספר, content.language:זוהר], 0, true), content.language:זוהר])], 0, true) in 2) [PerFieldSimilarity], result of:",
"details" : [
{
..........
]
}
}
]
}
}
In the logs above you can see two queries.
The first query was run while the synonym list was empty. The score is small (about 8.5) and the result is reasonable.
The second query was run after the synonym list was set to "זוהר לעם,זוהר,ספר הזוהר,הזוהר". The synonyms might legitimately add some value to the score, but the score is disproportionately large and, more interestingly, depends on terms from other documents that are related neither to the query nor to the synonyms (this can be seen in the explanation of the second query):
...
"description" : "weight(spanNear([spanOr([spanOr([content.language:מבואנוס, content.language:מבואו, content.language:מבוארות, content.language:מבואה, content.language:מבואסים, content.language:מבוארים, content.language:מבוא]), spanOr([content.language:בוא, content.language:בואנו, content.language:בואה, content.language:בואם, content.language:בואינג, content.language:בואו, content.language:בואקום, content.language:בואהבת])]), content.language:ספר, spanOr([spanNear([content.language:זוהר, content.language:עם], 0, true), content.language:זוהר, spanNear([content.language:ספר, content.language:זוהר], 0, true), content.language:זוהר])], 0, true) in 2) [PerFieldSimilarity], result of:"
...