Skip to content

Synonym graph causes strange score on match_phrase query #43308

@bbfsdev

Description

@bbfsdev

Elasticsearch version (bin/elasticsearch --version):
7.1.1 (also reproduced on 6.7.2).
FYI, the bug does not exist in 6.5.4.

Plugins installed: No plugins.

JVM version (java -version):
openjdk version "1.8.0_212"
OpenJDK Runtime Environment (build 1.8.0_212-b04)
OpenJDK 64-Bit Server VM (build 25.212-b04, mixed mode)

OS version (uname -a if on a Unix-like system):
Linux bbdev6.local 3.10.0-957.12.1.el7.x86_64 #1 SMP Mon Apr 29 14:59:59 UTC 2019 x86_64 x86_64 x86_64 GNU/Linux

centos-release-7-6.1810.2.el7.centos.x86_64

Description of the problem including expected versus actual behavior:
When a synonym_graph filter is used together with a hunspell filter for Hebrew and applied to a specific query that uses match_phrase, the score of the query is for some reason much larger, due to tokens from unrelated documents.

Steps to reproduce:
Just copy/paste these commands (Kibana Console syntax):

DELETE test
{}

PUT test
{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0,
    "analysis": {
      "filter": {
        "synonym_graph": {
          "type": "synonym_graph",
          "synonyms": [

          ],
          "tokenizer": "keyword"
        },
        "he_IL": {
          "locale": "he_IL",
          "type": "hunspell",
          "dedup": "true"
        }
      },
      "analyzer": {
        "hebrew_synonym": {
          "filter": [
            "synonym_graph",
            "he_IL"
          ],
          "tokenizer": "standard"
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "content": {
        "fields": {
          "language": {
            "type": "text",
            "analyzer": "hebrew_synonym"
          }
        },
        "type": "text",
        "analyzer": "standard"
      }
    }
  }
}

POST test/_doc/1
{
  "content": "מבואנוס"
}

POST test/_doc/2
{
  "content": "מבואו"
}

POST test/_doc/3
{
  "content": "מבוא לספר הזוהר"
}

POST test/_doc/4
{
  "content": "מבוארות"
}

POST test/_doc/5
{
  "content": "מבואה"
}

POST test/_doc/6
{
  "content": "מבוארים"
}

POST test/_doc/7
{
  "content": "בואקום"
}

POST test/_doc/8
{
  "content": "בואינג"
}

POST test/_doc/9
{
  "content": "בואהבת"
}

POST test/_doc/10
{
  "content": "בואנו"
}

POST test/_doc/11
{
  "content": "מבואסים"
}
POST test/_doc/12
{
  "content": "בואם"
}

POST test/_doc/13
{
  "content": "בואהבת"
}

GET test/_search
{
  "explain": true,
  "query": {
    "match_phrase": {
      "content.language": {
        "query": "מבוא לספר הזוהר"
      }
    }
  }
}

POST test/_close
PUT test/_settings
{
  "analysis": {
    "filter": {
      "synonym_graph": {
        "type": "synonym_graph",
        "synonyms": [
            "זוהר לעם,זוהר,ספר הזוהר,הזוהר"
        ],
        "tokenizer": "keyword"
      }
    }
  }
}
POST test/_open

GET test/_search
{
  "explain": true,
  "query": {
    "match_phrase": {
      "content.language": {
        "query": "מבוא לספר הזוהר"
      }
    }
  }
}

Provide logs (if relevant):

# DELETE test
{
  "acknowledged" : true
}


# PUT test
{
  "acknowledged" : true,
  "shards_acknowledged" : true,
  "index" : "test"
}


# POST test/_doc/1
{
  "_index" : "test",
  "_type" : "_doc",
  "_id" : "1",
  "_version" : 1,
  "result" : "created",
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "failed" : 0
  },
  "_seq_no" : 0,
  "_primary_term" : 1
}


# POST test/_doc/2
{
  "_index" : "test",
  "_type" : "_doc",
  "_id" : "2",
  "_version" : 1,
  "result" : "created",
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "failed" : 0
  },
  "_seq_no" : 1,
  "_primary_term" : 1
}


# POST test/_doc/3
{
  "_index" : "test",
  "_type" : "_doc",
  "_id" : "3",
  "_version" : 1,
  "result" : "created",
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "failed" : 0
  },
  "_seq_no" : 2,
  "_primary_term" : 1
}


# POST test/_doc/4
{
  "_index" : "test",
  "_type" : "_doc",
  "_id" : "4",
  "_version" : 1,
  "result" : "created",
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "failed" : 0
  },
  "_seq_no" : 3,
  "_primary_term" : 1
}


# POST test/_doc/5
{
  "_index" : "test",
  "_type" : "_doc",
  "_id" : "5",
  "_version" : 1,
  "result" : "created",
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "failed" : 0
  },
  "_seq_no" : 4,
  "_primary_term" : 1
}


# POST test/_doc/6
{
  "_index" : "test",
  "_type" : "_doc",
  "_id" : "6",
  "_version" : 1,
  "result" : "created",
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "failed" : 0
  },
  "_seq_no" : 5,
  "_primary_term" : 1
}


# POST test/_doc/7
{
  "_index" : "test",
  "_type" : "_doc",
  "_id" : "7",
  "_version" : 1,
  "result" : "created",
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "failed" : 0
  },
  "_seq_no" : 6,
  "_primary_term" : 1
}


# POST test/_doc/8
{
  "_index" : "test",
  "_type" : "_doc",
  "_id" : "8",
  "_version" : 1,
  "result" : "created",
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "failed" : 0
  },
  "_seq_no" : 7,
  "_primary_term" : 1
}


# POST test/_doc/9
{
  "_index" : "test",
  "_type" : "_doc",
  "_id" : "9",
  "_version" : 1,
  "result" : "created",
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "failed" : 0
  },
  "_seq_no" : 8,
  "_primary_term" : 1
}


# POST test/_doc/10
{
  "_index" : "test",
  "_type" : "_doc",
  "_id" : "10",
  "_version" : 1,
  "result" : "created",
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "failed" : 0
  },
  "_seq_no" : 9,
  "_primary_term" : 1
}


# POST test/_doc/11
{
  "_index" : "test",
  "_type" : "_doc",
  "_id" : "11",
  "_version" : 1,
  "result" : "created",
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "failed" : 0
  },
  "_seq_no" : 10,
  "_primary_term" : 1
}


# POST test/_doc/12
{
  "_index" : "test",
  "_type" : "_doc",
  "_id" : "12",
  "_version" : 1,
  "result" : "created",
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "failed" : 0
  },
  "_seq_no" : 11,
  "_primary_term" : 1
}


# POST test/_doc/13
{
  "_index" : "test",
  "_type" : "_doc",
  "_id" : "13",
  "_version" : 1,
  "result" : "created",
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "failed" : 0
  },
  "_seq_no" : 12,
  "_primary_term" : 1
}


# GET test/_search
{
  "took" : 0,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 1,
      "relation" : "eq"
    },
    "max_score" : 8.552871,
    "hits" : [
      {
        "_shard" : "[test][0]",
        "_node" : "76xQgsOsSz6OqfkSZmsVQw",
        "_index" : "test",
        "_type" : "_doc",
        "_id" : "3",
        "_score" : 8.552871,
        "_source" : {
          "content" : "מבוא לספר הזוהר"
        },
        "_explanation" : {
          "value" : 8.552871,
          "description" : """weight(content.language:"(מבוא בוא) ספר זוהר" in 2) [PerFieldSimilarity], result of:""",
          "details" : [
            .......
          ]
        }
      }
    ]
  }
}


# POST test/_close
{
  "acknowledged" : true
}


# PUT test/_settings
{
  "acknowledged" : true
}


# POST test/_open
{
  "acknowledged" : true,
  "shards_acknowledged" : true
}


# GET test/_search
{
  "took" : 2,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 1,
      "relation" : "eq"
    },
    "max_score" : 38.78237,
    "hits" : [
      {
        "_shard" : "[test][0]",
        "_node" : "76xQgsOsSz6OqfkSZmsVQw",
        "_index" : "test",
        "_type" : "_doc",
        "_id" : "3",
        "_score" : 38.78237,
        "_source" : {
          "content" : "מבוא לספר הזוהר"
        },
        "_explanation" : {
          "value" : 38.78237,
          "description" : "weight(spanNear([spanOr([spanOr([content.language:מבואנוס, content.language:מבואו, content.language:מבוארות, content.language:מבואה, content.language:מבואסים, content.language:מבוארים, content.language:מבוא]), spanOr([content.language:בוא, content.language:בואנו, content.language:בואה, content.language:בואם, content.language:בואינג, content.language:בואו, content.language:בואקום, content.language:בואהבת])]), content.language:ספר, spanOr([spanNear([content.language:זוהר, content.language:עם], 0, true), content.language:זוהר, spanNear([content.language:ספר, content.language:זוהר], 0, true), content.language:זוהר])], 0, true) in 2) [PerFieldSimilarity], result of:",
          "details" : [
            {
   ..........
          ]
        }
      }
    ]
  }
}

In the logs above you can see 2 queries.
The first query was run when the synonym list was empty. The score is small, i.e., 8.5, and the result is reasonable.
The second query was run when the synonym list was "זוהר לעם,זוהר,ספר הזוהר,הזוהר", which might add some value to the score, but the score is disproportionately large and, more interestingly, depends on other documents that are related neither to the query nor to the synonyms (this can be seen in the explanation of the second query):
...
"description" : "weight(spanNear([spanOr([spanOr([content.language:מבואנוס, content.language:מבואו, content.language:מבוארות, content.language:מבואה, content.language:מבואסים, content.language:מבוארים, content.language:מבוא]), spanOr([content.language:בוא, content.language:בואנו, content.language:בואה, content.language:בואם, content.language:בואינג, content.language:בואו, content.language:בואקום, content.language:בואהבת])]), content.language:ספר, spanOr([spanNear([content.language:זוהר, content.language:עם], 0, true), content.language:זוהר, spanNear([content.language:ספר, content.language:זוהר], 0, true), content.language:זוהר])], 0, true) in 2) [PerFieldSimilarity], result of:"
...

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions