全文搜索两个最重要的方面是:
- 相关性(Relevance) 它是评价查询与其结果间的相关程度,并根据这种相关程度对结果排名的一种能力,这种计算方式可以是 TF/IDF 方法、地理位置邻近、模糊相似,或其他的某些算法。
- 分词(Analysis) 它是将文本块转换为有区别的、规范化的 token 的一个过程,目的是为了创建倒排索引以及查询倒排索引。
一、构造数据
1、数据库中当前数据
2、构建索引
PUT http://127.0.0.1:9200/study
# 请求数据
{
"settings": {
"index": {
"number_of_shards": "1",
"number_of_replicas": "0"
}
},
"mappings": {
"properties": {
"name": {
"type": "text"
},
"age": {
"type": "integer"
},
"mail": {
"type": "keyword"
},
"hobby": {
"type": "text",
"analyzer": "ik_max_word"
}
}
}
}
# 响应数据
{
"acknowledged": true,
"shards_acknowledged": true,
"index": "study"
}
3、添加数据
POST http://127.0.0.1:9200/study/_bulk
# 请求数据
{"index":{"_index":"study"}}
{"name":"张三","age": 20,"mail": "111@qq.com","hobby":"羽毛球、乒乓球、足球"}
{"index":{"_index":"study"}}
{"name":"李四","age": 21,"mail": "222@qq.com","hobby":"羽毛球、乒乓球、足球、篮球"}
{"index":{"_index":"study"}}
{"name":"王五","age": 22,"mail": "333@qq.com","hobby":"羽毛球、篮球、游泳、听音乐"}
{"index":{"_index":"study"}}
{"name":"赵六","age": 23,"mail": "444@qq.com","hobby":"跑步、游泳"}
{"index":{"_index":"study"}}
{"name":"孙七","age": 24,"mail": "555@qq.com","hobby":"听音乐、看电影"}
# 响应数据
{
"took": 16,
"errors": false,
"items": [
{
"index": {
"_index": "study",
"_type": "_doc",
"_id": "i6jJdoIBU4c5cKp3GGKx",
"_version": 1,
"result": "created",
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"_seq_no": 0,
"_primary_term": 1,
"status": 201
}
},
{
"index": {
"_index": "study",
"_type": "_doc",
"_id": "jKjJdoIBU4c5cKp3GGKx",
"_version": 1,
"result": "created",
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"_seq_no": 1,
"_primary_term": 1,
"status": 201
}
},
{
"index": {
"_index": "study",
"_type": "_doc",
"_id": "jajJdoIBU4c5cKp3GGKx",
"_version": 1,
"result": "created",
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"_seq_no": 2,
"_primary_term": 1,
"status": 201
}
},
{
"index": {
"_index": "study",
"_type": "_doc",
"_id": "jqjJdoIBU4c5cKp3GGKx",
"_version": 1,
"result": "created",
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"_seq_no": 3,
"_primary_term": 1,
"status": 201
}
},
{
"index": {
"_index": "study",
"_type": "_doc",
"_id": "j6jJdoIBU4c5cKp3GGKx",
"_version": 1,
"result": "created",
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"_seq_no": 4,
"_primary_term": 1,
"status": 201
}
}
]
}
二、全文搜索
2.1、单词搜索
POST http://127.0.0.1:9200/study/_search
# 请求数据
{
"query": {
"match": {
"hobby": "音乐"
}
},
"highlight": {
"fields": {
"hobby": {}
}
}
}
# 响应数据
{
"took": 67,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": 0.9395274,
"hits": [
{
"_index": "study",
"_type": "_doc",
"_id": "j6jJdoIBU4c5cKp3GGKx",
"_score": 0.9395274,
"_source": {
"name": "孙七",
"age": 24,
"mail": "555@qq.com",
"hobby": "听音乐、看电影"
},
"highlight": {
"hobby": [
"听<em>音乐</em>、看电影"
]
}
},
{
"_index": "study",
"_type": "_doc",
"_id": "jajJdoIBU4c5cKp3GGKx",
"_score": 0.77041256,
"_source": {
"name": "王五",
"age": 22,
"mail": "333@qq.com",
"hobby": "羽毛球、篮球、游泳、听音乐"
},
"highlight": {
"hobby": [
"羽毛球、篮球、游泳、听<em>音乐</em>"
]
}
}
]
}
}
过程说明:
1. 检查字段类型
爱好 hobby 字段是一个 text 类型(指定了 IK 分词器),这意味着查询字符串本身也应该被分词。
2. 分析查询字符串 。
将查询的字符串 “音乐” 传入 IK 分词器中,输出的结果是单个项 音乐。因为只有一个单词项,所以 match 查询执行的是单个底层 term 查询。
3. 查找匹配文档 。
用 term 查询在倒排索引中查找 “音乐”,然后获取一组包含该项的文档,本例的结果是文档:3(王五)、5(孙七)。
4. 为每个文档评分 。
用 term 查询计算每个文档相关度评分 _score ,这是一种将 词频(term frequency,即词 “音乐” 在相关文档的 hobby 字段中出现的频率,出现越频繁相关度越高)和 反向文档频率(inverse document frequency,即词 “音乐” 在所有文档的 hobby 字段中出现的频率,出现越频繁权重越低),以及字段的长度(即字段越短相关度越高)相结合的计算方式。
2.2、多词搜索
POST http://127.0.0.1:9200/study/_search
# 请求数据
{
"query": {
"match": {
"hobby": "音乐 篮球"
}
},
"highlight": {
"fields": {
"hobby": {}
}
}
}
# 响应数据
{
"took": 22,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": 1.5408251,
"hits": [
{
"_index": "study",
"_type": "_doc",
"_id": "jajJdoIBU4c5cKp3GGKx",
"_score": 1.5408251,
"_source": {
"name": "王五",
"age": 22,
"mail": "333@qq.com",
"hobby": "羽毛球、篮球、游泳、听音乐"
},
"highlight": {
"hobby": [
"羽毛球、<em>篮球</em>、游泳、听<em>音乐</em>"
]
}
},
{
"_index": "study",
"_type": "_doc",
"_id": "j6jJdoIBU4c5cKp3GGKx",
"_score": 0.9395274,
"_source": {
"name": "孙七",
"age": 24,
"mail": "555@qq.com",
"hobby": "听音乐、看电影"
},
"highlight": {
"hobby": [
"听<em>音乐</em>、看电影"
]
}
},
{
"_index": "study",
"_type": "_doc",
"_id": "jKjJdoIBU4c5cKp3GGKx",
"_score": 0.77041256,
"_source": {
"name": "李四",
"age": 21,
"mail": "222@qq.com",
"hobby": "羽毛球、乒乓球、足球、篮球"
},
"highlight": {
"hobby": [
"羽毛球、乒乓球、足球、<em>篮球</em>"
]
}
}
]
}
}
上面查询中只要是包含篮球或音乐的文档都被查询出来了(match 查询默认的逻辑关系是 "or")。但是这有时候不能达到我们的要求,我们大部分时候都是希望两个词同时被包含。这时候可以使用 elasticsearch 中的 operator 参数指定词之间的逻辑关系:"operator": "and"
POST http://127.0.0.1:9200/study/_search
# 请求数据
{
"query": {
"match": {
"hobby": {
"query": "音乐 篮球",
"operator": "and"
}
}
},
"highlight": {
"fields": {
"hobby": {}
}
}
}
# 响应数据
{
"took": 6,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 1.5408251,
"hits": [
{
"_index": "study",
"_type": "_doc",
"_id": "jajJdoIBU4c5cKp3GGKx",
"_score": 1.5408251,
"_source": {
"name": "王五",
"age": 22,
"mail": "333@qq.com",
"hobby": "羽毛球、篮球、游泳、听音乐"
},
"highlight": {
"hobby": [
"羽毛球、<em>篮球</em>、游泳、听<em>音乐</em>"
]
}
}
]
}
}
上面的测试分别选择了 "and" 和 "or" 两个极端情况,但是在真正的搜索中,我们往往不会使用这两个极端情况,这样就需要另外一种查询方式,即只需要符合一定的匹配度就可以查询到数据。elasticsearch 支持这种查询方式:使用 minimum_should_match 来指定匹配度,如下面示例中的 80%。百分比按查询字符串分词后的词项个数计算,结果向下取整,文档至少要匹配这么多个词项才会被返回。
POST http://127.0.0.1:9200/study/_search
# 请求数据
{
"query": {
"match": {
"hobby": {
"query": "游泳 羽毛球",
"minimum_should_match": "80%"
}
}
},
"highlight": {
"fields": {
"hobby": {}
}
}
}
# 响应数据
{
"took": 6,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 3,
"relation": "eq"
},
"max_score": 2.1933634,
"hits": [
{
"_index": "study",
"_type": "_doc",
"_id": "jajJdoIBU4c5cKp3GGKx",
"_score": 2.1933634,
"_source": {
"name": "王五",
"age": 22,
"mail": "333@qq.com",
"hobby": "羽毛球、篮球、游泳、听音乐"
},
"highlight": {
"hobby": [
"<em>羽毛球</em>、篮球、<em>游泳</em>、听音乐"
]
}
},
{
"_index": "study",
"_type": "_doc",
"_id": "i6jJdoIBU4c5cKp3GGKx",
"_score": 1.7171206,
"_source": {
"name": "张三",
"age": 20,
"mail": "111@qq.com",
"hobby": "羽毛球、乒乓球、足球"
},
"highlight": {
"hobby": [
"<em>羽毛球</em>、乒乓<em>球</em>、足球"
]
}
},
{
"_index": "study",
"_type": "_doc",
"_id": "jKjJdoIBU4c5cKp3GGKx",
"_score": 1.6262295,
"_source": {
"name": "李四",
"age": 21,
"mail": "222@qq.com",
"hobby": "羽毛球、乒乓球、足球、篮球"
},
"highlight": {
"hobby": [
"<em>羽毛球</em>、乒乓<em>球</em>、足球、篮球"
]
}
}
]
}
}
2.3、组合搜索
在搜索时除了上面的方法外,还可以使用 bool 查询将多个查询条件组合起来进行搜索。
POST http://127.0.0.1:9200/study/_search
# 请求数据
{
"query": {
"bool": {
"must": {
"match": {
"hobby": "篮球"
}
},
"must_not": {
"match": {
"hobby": "音乐"
}
},
"should": [
{
"match": {
"hobby": "游泳"
}
}
]
}
},
"highlight": {
"fields": {
"hobby": {}
}
}
}
# 响应数据
{
"took": 6,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 0.77041256,
"hits": [
{
"_index": "study",
"_type": "_doc",
"_id": "jKjJdoIBU4c5cKp3GGKx",
"_score": 0.77041256,
"_source": {
"name": "李四",
"age": 21,
"mail": "222@qq.com",
"hobby": "羽毛球、乒乓球、足球、篮球"
},
"highlight": {
"hobby": [
"羽毛球、乒乓球、足球、<em>篮球</em>"
]
}
}
]
}
}
注意:上面示例要求搜索结果必须包含篮球、不能包含音乐;如果还包含了游泳,那么它的相关度评分会更高。
评分的计算规则
- bool 查询会为每个文档计算相关度评分 _score , 即将所有匹配的 must 和 should 语句的分数 _score 求和。较早的版本还会再乘以协调因子(匹配的语句数除以语句总数),新版本中已不再这样做:例如上面示例中李四只匹配了 must 语句而没有匹配 should 语句,其得分 0.77041256 与直接用 match 查询 “篮球” 时的得分相同。
- must_not 语句不会影响评分; 它的作用只是将不相关的文档排除。
注意:默认情况下,should 中的内容不是必须匹配的;如果查询语句中没有 must,那么就会至少匹配其中一个。当然了,也可以通过 minimum_should_match 参数进行控制,该值可以是数字也可以是百分比。
POST http://127.0.0.1:9200/study/_search
# 请求数据
{
"query": {
"bool": {
"should": [
{
"match": {
"hobby": "游泳"
}
},
{
"match": {
"hobby": "篮球"
}
},
{
"match": {
"hobby": "音乐"
}
}
],
"minimum_should_match": 2
}
},
"highlight": {
"fields": {
"hobby": {}
}
}
}
# 响应数据
{
"took": 4,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 2.3112378,
"hits": [
{
"_index": "study",
"_type": "_doc",
"_id": "jajJdoIBU4c5cKp3GGKx",
"_score": 2.3112378,
"_source": {
"name": "王五",
"age": 22,
"mail": "333@qq.com",
"hobby": "羽毛球、篮球、游泳、听音乐"
},
"highlight": {
"hobby": [
"羽毛球、<em>篮球</em>、<em>游泳</em>、听<em>音乐</em>"
]
}
}
]
}
}
2.4、权重搜索
在一些情况下,可能会对某些词增加权重来影响该条数据的得分。
POST http://127.0.0.1:9200/study/_search
# 请求数据
{
"query": {
"bool": {
"must": {
"match": {
"hobby": {
"query": "游泳篮球",
"operator": "and"
}
}
},
"should": [
{
"match": {
"hobby": {
"query": "音乐",
"boost": 10
}
}
},
{
"match": {
"hobby": {
"query": "跑步",
"boost": 2
}
}
}
]
}
},
"highlight": {
"fields": {
"hobby": {}
}
}
}
# 响应数据
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 9.24495,
"hits": [
{
"_index": "study",
"_type": "_doc",
"_id": "jajJdoIBU4c5cKp3GGKx",
"_score": 9.24495,
"_source": {
"name": "王五",
"age": 22,
"mail": "333@qq.com",
"hobby": "羽毛球、篮球、游泳、听音乐"
},
"highlight": {
"hobby": [
"羽毛球、<em>篮球</em>、<em>游泳</em>、听<em>音乐</em>"
]
}
}
]
}
}