Elasticsearch打造全文搜索引擎(二)
一、Es的文档、索引的CURD操作
1. elasticsearch概念
- 集群:一个或多个节点组织在一起
- 节点:一个节点是集群中的一个服务器,有一个名字来标识,默认是一个随机的漫画角色的名字
- 分片:将索引划分为多份的能力,允许水平分割和扩展容量,多个分片相应请求,提高性能和吞吐量。
- 副本:创建分片的一份或多份的能力,在一个节点失败其余节点可以顶上。
elasticsearch | mysql |
index(索引) | 数据库 |
type(类型) | 表 |
document(文档) | 行 |
fields | 列 |
2.常用属性和类型
3.内置类型
4. CURD操作
- 索引的初始化操作
- 指定分片和副本的数量
- shards一旦设置不能修改
# 索引初始化
PUT lagou { "settings": { "index": { "number_of_shards": 5, # 分片 "number_of_replicas": 1 # 备份 } } } GET lagou/_settings GET _all/_settings GET .kibana,lagou/_settings GET _settings # 修改settings PUT lagou/_settings { "number_of_replicas": 2 } # 获取索引信息 GET _all GET lagou # 新建/保存文档 # 方式一 PUT lagou/job/1 { "title": "python爬虫分布式开发", "salary_min":15000, "city":"北京", "company":{ "name":"百度", "company_addr":"北京市软件园" }, "publish_date":"2019-06-15", "comments":15 } # 新建文档 # 方式二 POST lagou/job/ { "title": "python django 开发工程师", "salary_min":30000, "city":"上海", "company":{ "name":"美团科技", "company_addr":"北京市软件园A区" }, "publish_date":"2019-06-15", "comments":120 } GET lagou/job/1 GET lagou/job/1?_source=title GET lagou/job/1?_source=title,city GET lagou/job/1?_source # 修改文章 # 方式一 PUT lagou/job/1 { "title": "python爬虫分布式开发", "salary_min":18000, "city":"广州", "company":{ "name":"百度", "company_addr":"北京市软件园" }, "publish_date":"2019-06-15", "comments":15 } # 方式二:修改修改某一字段 POST lagou/job/1/_update { "doc": { "comments":20 } } # 删除 DELETE lagou/job/1 DELETE lagou/job DELETE lagou
二、mget和bulk操作
# 批量操作 数据准备 POST lagou/job1/1 { "title": "python django 开发工程师", "salary_min":30000, "city":"上海", "company":{ "name":"美团科技", "company_addr":"北京市软件园A区" }, "publish_date":"2019-06-15", "comments":120 } POST lagou/job1/2 { "title": "python django 开发工程师", "salary_min":30000, "city":"上海", "company":{ "name":"美团科技", "company_addr":"北京市软件园A区" }, "publish_date":"2019-06-15", "comments":120 } POST lagou/job2/1 { "title": "python django 开发工程师", "salary_min":30000, "city":"上海", "company":{ "name":"美团科技", "company_addr":"北京市软件园A区" }, "publish_date":"2019-06-15", "comments":120 } POST lagou/job2/2 { "title": "python django 开发工程师", "salary_min":30000, "city":"上海", "company":{ "name":"美团科技", "company_addr":"北京市软件园A区" }, "publish_date":"2019-06-15", "comments":120 } mget批量获取 GET _mget { "docs":[ {"_index":"lagou", "_type":"job1", "_id":1 }, {"_index":"lagou", "_type":"job2", "_id":2 } ] } GET lagou/_mget { "docs":[ { "_type":"job1", "_id":1 }, { "_type":"job2", "_id":2 } ] } GET lagou/job1/_mget { "docs":[ { "_id":1 }, { "_id":2 } ] } GET lagou/job1/_mget { "ids":[1,2] } bulk增删改查 POST _bulk {"index":{"_index":"lagou","_type":"job1","_id":"3"}} {"title": "python django 开发工程师","salary_min":30000,"city":"上海","company":{"name":"美团科技","company_addr":"北京市软件园A区"},"publish_date":"2019-06-15","comments":120} {"index":{"_index":"lagou","_type":"job2","_id":"3"}} {"title": "python django 开发工程师","salary_min":30000,"city":"上海","company":{"name":"美团科技","company_addr":"北京市软件园A区"},"publish_date":"2019-06-15","comments":120} POST _bulk {"create":{"_index":"lagou","_type":"job1","_id":"3"}} {"title": "python django 开发工程师","salary_min":30000,"city":"上海","company":{"name":"美团科技","company_addr":"北京市软件园A区"},"publish_date":"2019-06-15","comments":120} POST _bulk {"delete":{"_index":"lagou","_type":"job1","_id":"3"}} POST _bulk {"update":{"_index":"lagou","_type":"job1","_id":"3"}} {"doc":{"title": "python django 开发工程师","salary_min":30000,"city":"上海","company":{"name":"美团科技","company_addr":"北京市软件园A区"},"publish_date":"2019-06-15","comments":120}}
三、mapping映射和查询
1. mapping映射
2.倒排索引
3. 倒排索引待解决的问题
4. 查询
5. 操作
# mapping操作 PUT lagou1 { "mappings":{ "job":{ "properties":{ "title":{ "type":"text" }, "salary_min":{ "type":"integer" }, "city":{ "type":"keyword" }, "company":{ "properties":{ "name":{ "type":"text" }, "company_addr":{ "type":"text" }, "employee_count":{ "type":"integer" } } }, "publish_date":{ "type":"date", "format":"yyyy-MM-dd" }, "comments":{ "type":"integer" } } } } } PUT lagou1/job/1 { "title": "python爬虫分布式开发", "salary_min":15000, "city":"北京", "company":{ "name":"百度", "company_addr":"北京市软件园", "employee_count":50 }, "publish_date":"2019-06-15", "comments":15 } # get index mapping GET lagou1/_mapping GET lagou1/_mapping/job GET _all/_mapping/job # 查询 PUT lagou2 { "mappings": { "job":{ "properties":{ "title":{ "type": "text", "store":true, "analyzer": "ik_max_word" }, "company_name": { "type": "keyword", "store":true }, "desc":{ "type":"text" }, "add_time":{ "type":"date", "format":"yyyy-MM-dd" }, "comments":{ "type": "integer" } } } } } POST lagou2/job { "title":"python django 开发工程师" , "company_name":"美国科技有限公司", "desc":"对django的概念熟悉,熟悉python基础知识", "comments":20, "add_time":"2017-04-01" } POST lagou2/job { "title":"python scrapy redis 分布式爬虫基本" , "company_name":"百度科技有限公司", "desc":"对scrapy的概念熟悉,熟悉redis的基本操作", "comments":5, "add_time":"2017-04-15" } POST lagou2/job { "title":"Elasticsearch打造搜索引擎" , "company_name":"阿里巴巴科技有限公司", "desc":"熟悉数据结构算法,熟悉python的基本开发", "comments":15, "add_time":"2017-06-20" } POST lagou2/job { "title":"python打造推荐引擎系统" , "company_name":"阿里巴巴科技有限公司", "desc":"熟悉推荐引擎的原理以及算法、掌握C语言", "comments":60, "add_time":"2016-10-20" } # 简单查询 #查看分析器解析的结果 GET _analyze { "analyzer": "ik_smart", "text":"Python网络开发师" } GET _analyze { "analyzer": "ik_max_word", "text":"Python网络开发师" } #match查询 (分词查询) python 和分布式 #查询第0-2条的title和company_name字段(desc字段的stored属性不是true),并按comments排序 GET lagou2/_search { "stored_fields":["title","company_name","desc"], "query":{ "match":{ "title":"python分布式" } }, "from": 0, "size": 2, "sort": [ { "comments": { "order": "desc" } } ] } #查询comments在大于等于10、小于等于20、权重2.0的数据 GET lagou2/_search { "query":{ "range": { "comments": { "gte": 10, "lte": 20, "boost":2.0 } } } } GET lagou2/_search { "query":{ "range": { "add_time": { "gte": "2017-04-01", "lte": "now" } } } } #term查询(不会做处理、直接查,类似于keyword属性) GET lagou2/_search { "query":{ "term":{ "title":"python" } } } #terms 和用match查django分布工程 效果一样 GET lagou2/_search { "query":{ "terms":{ "title":["django" ,"分布" ,"工程" ] } } } #match_all GET lagou2/_search { "query":{ "match_all":{} } } #match_phrase #短语查询 #满足所有词 既有python也有系统,俩个词最小间距6位 GET lagou2/_search { "query":{ "match_phrase": { "title": { "query": "python系统", "slop":6 } } } } #multi_match 多字段匹配,title的权重高于desc的3倍 GET lagou2/_search { "query":{ "multi_match": { "query": "python系统", "fields":["title^3","desc"] } } } # sort查询 GET lagou2/_search { "query": { "match_all": {} }, "sort": [ { "comments": { "order": "asc" } } ] } # range范围查询 GET lagou2/_search { "query": { "range": { "comments": { "gte": 20, "lte": 60, "boost":2.0 } } } } GET lagou2/_search { "query": { "range": { "add_time": { "gte": "2017-06-07", "lte": "now" } } } } #wildcard 通配符查询 GET lagou2/_search { "query":{ "wildcard": { "title": { "value": "pyth*n", "boost": 2 } } } } # 组合查询 #bool 查询 #用 bool 包括 must should must_not filter来完成 #格式如下 #bool:{ # "filter":[], #不参与打分 # "must":[], #相当于 (salary=20 and title=Python) # "should":[], #相当于 (salary=20 or title=Python) # "must_not":[], #相当于not #} #建立测试数据 POST lagou/testjob/_bulk {"index":{"_id":1}} {"salary":10,"title":"Python"} {"index":{"_id":2}} {"salary":20,"title":"Scrapy"} {"index":{"_id":3}} {"salary":30,"title":"Django"} {"index":{"_id":4}} {"salary":30,"title":"Elasticsearch"} DELETE lagou/testjob #简单的过滤查询 #最简单的fileter查询 #select * from testjob where salary=20 GET lagou/testjob/_search { "query":{ "bool": { "must": { "match":{ "salary":20 } }, "filter":{ "match":{ "title":"Scrapy" } } } } } #select * from testjob #where (salary=20 or title=Python) and salary!=30 and salary!=10 GET lagou/testjob/_search { "query":{ "bool": { "should":[ {"term":{"salary":20}}, {"term":{"title":"python"}} ], "must_not": [ {"term": {"salary": "30"}}, {"term": {"salary": "10"}} ] } } } #where (salary=30 and title="django") or title="python" GET lagou/testjob/_search { "query":{ "bool": { "should":[ {"term":{"title":"python"}}, {"bool": { "must":[ {"term":{"salary":30}}, {"term":{"title":"django"}} ] }} ] } } } #测试数据 POST lagou/testjob2/_bulk {"index":{"_id":1}} {"tags":["search"]} {"index":{"_id":2}} {"tags":["search","python"]} {"index":{"_id":3}} {"other_filed":["some data"]} {"index":{"_id":4}} {"tags":null} {"index":{"_id":5}} {"tags":["search",null]} #处理null空值的方法 #select tags from testjob2 where tags is not null GET lagou/testjob2/_search { "query": { "bool": { "filter": { "exists": { "field": "tags" } } } } } #select tags from testjob2 where tags is null GET lagou/testjob2/_search { "query": { "bool": { "must_not": { "exists": { "field": "tags" } } } } }
gitee地址:https://gitee.com/zhangyafeii/ArticleSpider_LcvSearch