55_初识搜索引擎_相关度评分TF&IDF算法独家解密

课程大纲

1、算法介绍

relevance score算法,简单来说,就是计算出,一个索引中的文本,与搜索文本,他们之间的关联匹配程度

Elasticsearch使用的是 term frequency/inverse document frequency算法,简称为TF/IDF算法

Term frequency:搜索文本中的各个词条在field文本中出现了多少次,出现次数越多,就越相关

搜索请求:hello world

doc1:hello you, and world is very good
doc2:hello, how are you

Inverse document frequency:搜索文本中的各个词条在整个索引的所有文档中出现了多少次,出现的次数越多,就越不相关

搜索请求:hello world

doc1:hello, today is very good
doc2:hi world, how are you

比如说,在index中有1万条document,hello这个单词在所有的document中,一共出现了1000次;world这个单词在所有的document中,一共出现了100次

doc2更相关

Field-length norm:field长度,field越长,相关度越弱

搜索请求:hello world

doc1:{ "title": "hello article", "content": "babaaba 1万个单词" }
doc2:{ "title": "my article", "content": "blablabala 1万个单词,hi world" }

hello world在整个index中出现的次数是一样多的

doc1更相关,title field更短

2、_score是如何被计算出来的

GET /test_index/test_type/_search?explain
{
"query": {
"match": {
"test_field": "test hello"
}
}
}

{
"took": 6,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 1.595089,
"hits": [
{
"_shard": "[test_index][2]",
"_node": "4onsTYVZTjGvIj9_spWz2w",
"_index": "test_index",
"_type": "test_type",
"_id": "20",
"_score": 1.595089,
"_source": {
"test_field": "test hello"
},
"_explanation": {
"value": 1.595089,
"description": "sum of:",
"details": [
{
"value": 1.595089,
"description": "sum of:",
"details": [
{
"value": 0.58279467,
"description": "weight(test_field:test in 0) [PerFieldSimilarity], result of:",
"details": [
{
"value": 0.58279467,
"description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
"details": [
{
"value": 0.6931472,
"description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
"details": [
{
"value": 2,
"description": "docFreq",
"details": []
},
{
"value": 4,
"description": "docCount",
"details": []
}
]
},
{
"value": 0.840795,
"description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
"details": [
{
"value": 1,
"description": "termFreq=1.0",
"details": []
},
{
"value": 1.2,
"description": "parameter k1",
"details": []
},
{
"value": 0.75,
"description": "parameter b",
"details": []
},
{
"value": 1.75,
"description": "avgFieldLength",
"details": []
},
{
"value": 2.56,
"description": "fieldLength",
"details": []
}
]
}
]
}
]
},
{
"value": 1.0122943,
"description": "weight(test_field:hello in 0) [PerFieldSimilarity], result of:",
"details": [
{
"value": 1.0122943,
"description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
"details": [
{
"value": 1.2039728,
"description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
"details": [
{
"value": 1,
"description": "docFreq",
"details": []
},
{
"value": 4,
"description": "docCount",
"details": []
}
]
},
{
"value": 0.840795,
"description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
"details": [
{
"value": 1,
"description": "termFreq=1.0",
"details": []
},
{
"value": 1.2,
"description": "parameter k1",
"details": []
},
{
"value": 0.75,
"description": "parameter b",
"details": []
},
{
"value": 1.75,
"description": "avgFieldLength",
"details": []
},
{
"value": 2.56,
"description": "fieldLength",
"details": []
}
]
}
]
}
]
}
]
},
{
"value": 0,
"description": "match on required clause, product of:",
"details": [
{
"value": 0,
"description": "# clause",
"details": []
},
{
"value": 1,
"description": ":, product of:",
"details": [
{
"value": 1,
"description": "boost",
"details": []
},
{
"value": 1,
"description": "queryNorm",
"details": []
}
]
}
]
}
]
}
},
{
"_shard": "[test_index][2]",
"_node": "4onsTYVZTjGvIj9_spWz2w",
"_index": "test_index",
"_type": "test_type",
"_id": "6",
"_score": 0.58279467,
"_source": {
"test_field": "tes test"
},
"_explanation": {
"value": 0.58279467,
"description": "sum of:",
"details": [
{
"value": 0.58279467,
"description": "sum of:",
"details": [
{
"value": 0.58279467,
"description": "weight(test_field:test in 0) [PerFieldSimilarity], result of:",
"details": [
{
"value": 0.58279467,
"description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
"details": [
{
"value": 0.6931472,
"description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
"details": [
{
"value": 2,
"description": "docFreq",
"details": []
},
{
"value": 4,
"description": "docCount",
"details": []
}
]
},
{
"value": 0.840795,
"description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
"details": [
{
"value": 1,
"description": "termFreq=1.0",
"details": []
},
{
"value": 1.2,
"description": "parameter k1",
"details": []
},
{
"value": 0.75,
"description": "parameter b",
"details": []
},
{
"value": 1.75,
"description": "avgFieldLength",
"details": []
},
{
"value": 2.56,
"description": "fieldLength",
"details": []
}
]
}
]
}
]
}
]
},
{
"value": 0,
"description": "match on required clause, product of:",
"details": [
{
"value": 0,
"description": "# clause",
"details": []
},
{
"value": 1,
"description": ":, product of:",
"details": [
{
"value": 1,
"description": "boost",
"details": []
},
{
"value": 1,
"description": "queryNorm",
"details": []
}
]
}
]
}
]
}
},
{
"_shard": "[test_index][3]",
"_node": "4onsTYVZTjGvIj9_spWz2w",
"_index": "test_index",
"_type": "test_type",
"_id": "7",
"_score": 0.5565415,
"_source": {
"test_field": "test client 2"
},
"_explanation": {
"value": 0.5565415,
"description": "sum of:",
"details": [
{
"value": 0.5565415,
"description": "sum of:",
"details": [
{
"value": 0.5565415,
"description": "weight(test_field:test in 0) [PerFieldSimilarity], result of:",
"details": [
{
"value": 0.5565415,
"description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
"details": [
{
"value": 0.6931472,
"description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
"details": [
{
"value": 1,
"description": "docFreq",
"details": []
},
{
"value": 2,
"description": "docCount",
"details": []
}
]
},
{
"value": 0.8029196,
"description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
"details": [
{
"value": 1,
"description": "termFreq=1.0",
"details": []
},
{
"value": 1.2,
"description": "parameter k1",
"details": []
},
{
"value": 0.75,
"description": "parameter b",
"details": []
},
{
"value": 2.5,
"description": "avgFieldLength",
"details": []
},
{
"value": 4,
"description": "fieldLength",
"details": []
}
]
}
]
}
]
}
]
},
{
"value": 0,
"description": "match on required clause, product of:",
"details": [
{
"value": 0,
"description": "# clause",
"details": []
},
{
"value": 1,
"description": "_type:test_type, product of:",
"details": [
{
"value": 1,
"description": "boost",
"details": []
},
{
"value": 1,
"description": "queryNorm",
"details": []
}
]
}
]
}
]
}
},
{
"_shard": "[test_index][1]",
"_node": "4onsTYVZTjGvIj9_spWz2w",
"_index": "test_index",
"_type": "test_type",
"_id": "8",
"_score": 0.25316024,
"_source": {
"test_field": "test client 2"
},
"_explanation": {
"value": 0.25316024,
"description": "sum of:",
"details": [
{
"value": 0.25316024,
"description": "sum of:",
"details": [
{
"value": 0.25316024,
"description": "weight(test_field:test in 0) [PerFieldSimilarity], result of:",
"details": [
{
"value": 0.25316024,
"description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
"details": [
{
"value": 0.2876821,
"description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
"details": [
{
"value": 1,
"description": "docFreq",
"details": []
},
{
"value": 1,
"description": "docCount",
"details": []
}
]
},
{
"value": 0.88,
"description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
"details": [
{
"value": 1,
"description": "termFreq=1.0",
"details": []
},
{
"value": 1.2,
"description": "parameter k1",
"details": []
},
{
"value": 0.75,
"description": "parameter b",
"details": []
},
{
"value": 3,
"description": "avgFieldLength",
"details": []
},
{
"value": 4,
"description": "fieldLength",
"details": []
}
]
}
]
}
]
}
]
},
{
"value": 0,
"description": "match on required clause, product of:",
"details": [
{
"value": 0,
"description": "# clause",
"details": []
},
{
"value": 1,
"description": ":, product of:",
"details": [
{
"value": 1,
"description": "boost",
"details": []
},
{
"value": 1,
"description": "queryNorm",
"details": []
}
]
}
]
}
]
}
}
]
}
}

3、分析一个document是如何被匹配上的

GET /test_index/test_type/6/_explain
{
"query": {
"match": {
"test_field": "test hello"
}
}
}

posted @ 2024-10-02 13:02  5765809  阅读(21)  评论(0)    收藏  举报