芝麻_糊

导航

es学习

1.官网直接安装

官网下载:https://www.elastic.co/cn/downloads/elasticsearch

mac安装es:

下载完成后,打开bin文件夹下的elasticsearch文件,注意jdk版本要正确。因为我电脑有两个jdk版本,默认是jdk7,版本过低,所以需要先修改环境变量,切换默认的jdk。

 

执行vim ~/.bash_profile

 

 

 

输入i进入编辑(insert)模式,修改完成后按esc退出编辑模式,再输入:wq保存并退出vim,最后执行source ~/.bash_profile使配置生效

 

 

安装成功,访问http://localhost:9200/

2.docker安装

mac安装docker:https://www.runoob.com/docker/macos-docker-install.html

3.安装kibana

下载地址:https://www.elastic.co/cn/downloads/kibana

安装:打开bin文件夹,打开文件kibana

安装遇到错误:

      Error: getaddrinfo ENOTFOUND localhost,是由于localhost没有绑定到127.0.0.1,在/etc/hosts文件中添加一行“127.0.0.1 localhost”即可解决

启动后,在浏览器上打开 http://localhost:5601/

 

如果想修改Kibana连接的Elasticsearch地址,或是Kibana自身的端口5601,可以在Kibana目录下的config下面的kibana.yml文件中进行修改; 

4.es分词器安装

安装指南:https://github.com/medcl/elasticsearch-analysis-ik

两种安装方式:

a.下载解压后安装,下载地址:https://github.com/medcl/elasticsearch-analysis-ik/releases

b.直接命令安装:./bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v7.9.0/elasticsearch-analysis-ik-7.9.0.zip

安装完重启es

5.term vectors

官网文档:https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-termvectors.html#docs-termvectors-api-term-info

PUT /my-index-000001
{ "mappings": {
    "properties": {
      "text": {
        "type": "text",
        "term_vector": "with_positions_offsets_payloads",
        "store" : true,
        "analyzer" : "fulltext_analyzer"
       },
       "fullname": {
        "type": "text",
        "term_vector": "with_positions_offsets_payloads",
        "analyzer" : "fulltext_analyzer"
      }
    }
  },
  "settings" : {
    "index" : {
      "number_of_shards" : 1,
      "number_of_replicas" : 0
    },
    "analysis": {
      "analyzer": {
        "fulltext_analyzer": {
          "type": "custom",
          "tokenizer": "whitespace",
          "filter": [
            "lowercase",
            "type_as_payload"
          ]
        }
      }
    }
  }
}

PUT /my-index-000001/_doc/1
{
  "fullname" : "John Doe",
  "text" : "test test test "
}

PUT /my-index-000001/_doc/2?refresh=wait_for
{
  "fullname" : "Jane Doe",
  "text" : "Another test ..."
}

PUT /my-index-000001/_doc/3?refresh=wait_for
{
  "fullname" : "huyanxia liangming",
  "text" : "test Another baby ..."
}

GET /my-index-000001/_termvectors/1
{
  "fields" : ["text"],
  "offsets" : true,
  "payloads" : true,
  "positions" : true,
  "term_statistics" : true,
  "field_statistics" : true
}

GET /my-index-000001/_termvectors
{
  "doc" : {
    "fullname" : "John Doe diannao",
    "text" : "test test test"
  },
  "filter": {
    "max_num_terms": 3,
    "min_term_freq": 1,
    "min_doc_freq": 1
  }
}

6.聚合计算,es版本7.9.1

PUT /user_profiles1
{
    "settings": {
        "index": {
            "number_of_shards": "32",
            "number_of_replicas": "1"
        }
    },
    "mappings": {
                "properties": {
                "type": {
                    "type": "keyword"
                },
                "user_id": {
                    "type": "keyword"
                },
                "item_id": {
                    "type": "keyword"
                },
                "boost": {
                    "type": "double"
                },
                "created": {
                    "type": "date",
                    "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"
                },
                "keywords": {
                    "type": "nested",
                    "properties": {
                        "word": {
                            "type": "keyword"
                        },
                        "weight": {
                            "type": "double"
                        }
                    }
                }
            }
        }
    
}

PUT /user_profiles1/_doc/1_1_1001
{
  "type": "1",
  "user_id": "1",
  "item_id": "1001",
  "factor": 1.2,
  "created" : "2020-09-07 14:54:37",
  "keywords": [
    {
      "word": "中国",
      "weight": 3.2
    },
    {
      "word": "美国",
      "weight": 1.4
    }
  ]
}

PUT /user_profiles1/_doc/1_1_1002
{
  "type": "1",
  "user_id": "1",
  "item_id": "1002",
  "factor": 1.2,
  "created" : "2020-09-07 14:54:37",
  "keywords": [
    {
      "word": "中国辅导费",
      "weight": 6.2
    },
    {
      "word": "美国当时的",
      "weight": 1.9
    }
  ]
}
POST /user_profiles1/_search
{
    "query": {
        "bool": {
            "must": [{
                    "terms": {
                        "type": [
                            "1"
                        ]
                    }
                },
                {
                    "term": {
                        "user_id": {
                            "value": "1"
                        }
                    }
                },
                {
                    "range": {
                        "created": {
                            "gte": "2020-09-07 14:54:37"
                        }
                    }
                }
              
            
            ]
        }
    },
    "size": 0,
    "aggs": {
        "agg_keywords": {
            "nested": {
                "path": "keywords"
            },
            "aggs": {
                "agg_word": {
                    "terms": {
                        "field": "keywords.word",
                        "order": {
                            "agg_score": "desc"
                        },
                        "size": 2 //决定返回大小 
                    },
                    "aggs": {
                        "agg_score": {
                            "sum": {
                                "field": "keywords.weight"
                            }
                        }
                    }
                }
            }
        }
    }
}

  7.从本地读取文件 

 

8.termVector es2.1

//第一种
TermVectorsResponse termVectorResponse = ElasticSearchUtils.getEsClient()
                .prepareTermVectors()
                .setIndex("knowledge_items")
                .setType("knowledge_items")
                .setId(itemId)
                .setSelectedFields("content")
                .setTermStatistics(true)
                .setFieldStatistics(false)
                .setOffsets(false)
                .setPayloads(false)
                .setPositions(false)
                .execute()
                .actionGet();
//第二种
TermVectorsRequest termVectorsRequest = new TermVectorsRequest();
 //设置参数            ElasticSearchUtils.getEsClient().termVectors(termVectorsRequest).actionGet();

 结果json化输出

 try {
            XContentBuilder builder = XContentFactory.jsonBuilder();
            builder.startObject();
            termVectorResponse.toXContent(builder, ToXContent.EMPTY_PARAMS);
            builder.endObject();
            System.out.println("json termVectorResponse:" + builder.string());
        } catch (IOException e) {
            e.printStackTrace();
        }

 结果遍历

 

// Walk every field of the term-vector response, collect per-term frequencies,
// and turn each kept term into a TF-IDF-weighted KeyWordEntity appended to keyWordEntities.
// The pasted snippet had a catch without its opening `try {`; it is restored here.
try {
    Fields fields = termVectorResponse.getFields();
    for (String field : fields) {
        Terms terms = fields.terms(field);
        // docCount is a field statistic.
        // NOTE(review): the request that produced termVectorResponse presumably has to
        // enable field statistics for this value to be meaningful — confirm.
        int docCount = terms.getDocCount();
        TermsEnum termsEnum = terms.iterator();
        int currentTotalTermFreq = 0;
        List<TermInfoEntity> termInfoEntities = new ArrayList<>();

        BytesRef term;
        // next() returns null once the enumeration is exhausted, so no further null check is needed.
        while ((term = termsEnum.next()) != null) {
            String termName = term.utf8ToString();
            // Drop terms that NumUtils.isNum accepts and terms that are a single character long.
            if (NumUtils.isNum(termName) || termName.length() == 1) {
                // Pass the term as an argument so the {} placeholder is actually substituted.
                LOG.info("termName filter:{}", termName);
                continue;
            }
            int docFreq = termsEnum.docFreq();
            int termFreq = termsEnum.postings(null, PostingsEnum.FREQS).freq();
            // Only kept terms contribute to the total used as the TF denominator.
            currentTotalTermFreq += termFreq;
            termInfoEntities.add(new TermInfoEntity(termName, termFreq, docFreq));
        }

        // Lambdas can only capture effectively final locals, hence the copies.
        int finalCurrentTotalTermFreq = currentTotalTermFreq;
        double finalItemBoost = itemBoost;
        // Compute the TF-IDF of every kept term.
        termInfoEntities.forEach(termInfoEntity -> {
            double tf = (double) termInfoEntity.getTermFreq() / (double) finalCurrentTotalTermFreq;
            // Divide as double: integer division would truncate the ratio before log10 is applied.
            double idf = Math.log10((double) docCount / termInfoEntity.getDocFreq()) + 1;
            double tfIDf = NumUtils.doubleValueScale(6, tf * idf);
            KeyWordEntity keyWordEntity = new KeyWordEntity(termInfoEntity.getTermName(), tfIDf * userActionTypeEnum.getBoost() * finalItemBoost);
            keyWordEntities.add(keyWordEntity);
        });
    }
} catch (IOException e) {
    LOG.error("es termVectorResponse 遍历失败:", e);
}

  9. es6 报错解决

原来代码:es2中不报错

setSource(JSON.toJSONStringWithDateFormat(sendMsgRecordEntity,JSON.DEFFAULT_DATE_FORMAT)) 

es6中报错如下:The number of object passed must be even but was [1]
setSource加上一个参数:XContentType.JSON

setSource(JSON.toJSONStringWithDateFormat(sendMsgRecordEntity,JSON.DEFFAULT_DATE_FORMAT), XContentType.JSON)

  

posted on 2020-09-04 15:01  芝麻_糊  阅读(405)  评论(0)  编辑  收藏  举报