elasticsearch 索引重建实战

一.介绍

  索引结构一旦创建好后,是无法直接修改索引字段类型的,也无法直接删除索引中某一个字段,也无法增加分片,要解决这些问题,只能重建索引。

  新增字段是不需要索引重建的。

  下面是索引重建的一个案例:

    当时创建一个索引后,运行了一段时间,发现索引字段应该为keyword类型,而不应该是keyword和text两种复合类型。还有创建日期CreateDate和CreateTime只需要一个。源索引crawl_basis_pn的数据量有700w+,索引结构如下所示:

{
  "crawl_basis_pn" : {
    "mappings" : {
      "properties" : {
        "Brand" : {
          "type" : "keyword"
        },
        "CategoryName" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "CreateDate" : {
          "type" : "date"
        },
        "CreateTime" : {
          "type" : "date"
        },
        "CreateTimeStamp" : {
          "type" : "long"
        },
        "DatasheetUrls" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "DateCode" : {
          "type" : "keyword"
        },
        "Describe" : {
          "type" : "text",
          "analyzer" : "ik_max_word"
        },
        "ECCN" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "Encapsulation" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "Id" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "ImageUrls" : {
          "type" : "keyword"
        },
        "PN" : {
          "type" : "keyword"
        },
        "ProductId" : {
          "type" : "keyword"
        },
        "ProductUrl" : {
          "type" : "keyword"
        },
        "SpecDesc" : {
          "type" : "text",
          "analyzer" : "ik_max_word"
        },
        "SubStatus" : {
          "type" : "short"
        },
        "SupplierFlag" : {
          "type" : "short"
        }
      }
    }
  }
}

 

二. 重建索引步骤   

  1)创建目标索引 crawl_basis_pn_source_v2,索引结构如下所示

PUT crawl_basis_pn_source_v2
{
  "settings": {
    "number_of_shards": 3,
    "number_of_replicas": 1
  },
  "mappings": {
    "properties": {
      "Brand": {
        "type": "keyword"
      },
      "CategoryName": {
        "type": "keyword"
      },
      "CreateTime": {
        "type": "date"
      },
      "CreateTimeStamp": {
        "type": "long"
      },
      "DatasheetUrls": {
        "type": "keyword",
        "ignore_above": 256
      },
      "DateCode": {
        "type": "keyword"
      },
      "Describe": {
        "type": "text",
        "analyzer": "ik_max_word"
      },
      "ECCN": {
        "type": "keyword"
      },
      "Encapsulation": {
        "type": "keyword"
      },
      "Id": {
        "type": "keyword"
      },
      "ImageUrls": {
        "type": "keyword"
      },
      "PN": {
        "type": "keyword"
      },
      "ProductId": {
        "type": "keyword"
      },
      "ProductUrl": {
        "type": "keyword"
      },
      "SpecDesc": {
        "type": "text",
        "analyzer": "ik_max_word"
      },
      "SubStatus": {
        "type": "short"
      },
      "SupplierFlag": {
        "type": "short"
      }
    }
  }
}

  2)将源crawl_basis_pn索引的数据导入到目标crawl_basis_pn_source_v2索引中,在kibana中操作如下命令

POST _reindex?wait_for_completion=false
{
  "source": {
    "index": "crawl_basis_pn"
  },
  "dest": {
    "index": "crawl_basis_pn_source_v2",
    "version_type": "external"
  },
  "script": {
    "lang": "painless",
    "source": """
    ctx._source.remove('CreateTime');
    ctx._source.CreateTime=ctx._source.remove('CreateDate');
    """
  }
}

  解释一下:

    version_type:external external表示使用外部版本号,即保留源索引中文档的版本号:目标索引中id不存在的文档会直接创建;id已存在的文档会先比较版本号,只更新目标索引中版本比源索引中版本旧的文档。这里可以不加,因为目标索引是没有数据的。
    ctx._source.remove('CreateTime'); 是指导入时排除源索引字段CreateTime,原因是:该字段在源索引中都没有值的,是无用字段。
    ctx._source.CreateTime=ctx._source.remove('CreateDate'); 是指将源CreateDate字段重命名为CreateTime,原因是:目标索引中用的字段名是CreateTime。

    导入后,目标crawl_basis_pn_source_v2文档内容如下:

     "_index" : "crawl_basis_pn_source_v2",
        "_type" : "_doc",
        "_id" : "5MwiiogBSc5GrbdbkL-Z",
        "_score" : 1.0,
        "_source" : {
          "Brand" : "TE Connectivity",
          "Encapsulation" : "",
          "SpecDesc" : """{"触点数": "64", "触点材料": "BERYLLIUMCOPPER", "目前评级": "", "设备插槽类型": "ICSOCKET", "制造商序列号": "345842", "使用的设备类型": "DIP64", "其他特性": "", "介电耐压": "", "绝缘电阻": "", "JESD-609代码": "e0", "最高工作温度": "", "最低工作温度": "", "触点的结构": "", "联系完成配合": "NOTSPECIFIED", "联系完成终止": "TINLEADOVERNICKEL", "触点样式": "", "外壳材料": "", "安装方式": "", "端接类型": "", "主体深度": "", "主体宽度": "", "主体长度": "", "是否符合REACH标准": "unknown", "ECCN代码": "EAR99", "HTS代码": "8536.69.40.40", "特征": "ICSocket", "包装说明": ""}""",
          "SupplierFlag" : 39,
          "CreateTime" : "2023-06-05T13:58:21.080937",
          "CategoryName" : "插座-->插槽和芯片载体",
          "ECCN" : "EAR99",
          "Id" : "f9b1961d-0365-11ee-94a4-000c29206695",
          "DatasheetUrls" : "https://xxxx.ihs.com/images/VipMasterIC/IC/AMPI/AMPIS67777/AMPIS67777-1.pdf?hkey=EF798316E3902B6ED9A73243A3159BB0",
          "PN" : "2-345842-3",
          "ProductUrl" : "https://www.xxxx.com/partIntelligence/2-345842-3/",
          "CreateTimeStamp" : 1685944701
        }

 

#获取reindex任务列表
GET  _tasks?detailed=true&actions=*reindex

#取消任务
POST _tasks/r1A2WoRbTwKZ516z6NEs5A:36619/_cancel
#根据任务id查看任务
GET /_tasks/Iq_VvSaTTGquwQV4gAjL7g:267533075

#结果
{
  "completed" : false,
  "task" : {
    "node" : "Iq_VvSaTTGquwQV4gAjL7g",
    "id" : 267533075,
    "type" : "transport",
    "action" : "indices:data/write/reindex",
    "status" : {
      "total" : 7788401,  #源总数
      "updated" : 0,
      "created" : 209000, #已导入数
      "deleted" : 0,
      "batches" : 210,
      "version_conflicts" : 0,
      "noops" : 0,
      "retries" : {
        "bulk" : 0,
        "search" : 0
      },
      "throttled_millis" : 0,
      "requests_per_second" : -1.0,
      "throttled_until_millis" : 0
    },
    "description" : "reindex from [crawl_basis_pn] to [crawl_basis_pn_source_v2][_doc]",
    "start_time_in_millis" : 1689127808069,
    "running_time_in_nanos" : 64843072244,
    "cancellable" : true,
    "headers" : { }
  }
}

  3) 将源crawl_basis_pn索引删除(先检查crawl_basis_pn_source_v2索引结构和数据是否正确),为目标crawl_basis_pn_source_v2索引创建crawl_basis_pn别名

#删除原有索引
DELETE crawl_basis_pn

#创建索引别名
POST /_aliases
{
  "actions": [
    {
      "add": {
        "index": "crawl_basis_pn_source_v2",
        "alias": "crawl_basis_pn"
      }
    }
  ]
}

  4)查看crawl_basis_pn索引数据

get crawl_basis_pn/_count  

#结果
{
  "count" : 7788401,
  "_shards" : {
    "total" : 3,
    "successful" : 3,
    "skipped" : 0,
    "failed" : 0
  }
}

   这样就完成了一个索引重建,程序客户端也不需要更改索引名称。

 

三.附加知识点

  在python中使用elasticsearch_dsl操作es时,如果操作的是索引别名,应该注释掉init()操作

index_name="crawl_basis_pn"
#创建document实例
search=Search(using=esclient(),  index=index_name)
#继承了es的Document
class BasisPNDocument(Document):
    PN=Keyword()
    Brand=Keyword()
    DateCode=Keyword()
    ProductId=Keyword()
    ProductUrl=Keyword()
    CreateTime=Date()
    CreateTimeStamp=Long()
    ImageUrls=Keyword()
    ....

#init 创建索引并填充映射,使用别名时要注释掉
#BasisPNDocument.init() 

 

 

  索引重建还有很多功能,可参考:https://www.dandelioncloud.cn/article/details/1569335929129365506

 

posted on 2023-07-11 15:27  花阴偷移  阅读(86)  评论(0)  编辑  收藏  举报

导航