用elasticsearch分析中国大学省份分布 - 创建表结构
1.去教育部官网下载excel数据:http://www.moe.gov.cn/srcsite/A03/moe_634/201706/W020170616379651135432.xls
2.把xls数据转换成json格式 https://blog.csdn.net/aomeishangpin/article/details/84404462
# encoding: utf-8
# File: just4json.py
#
# Converts the first sheet of 'aa.xls' into a JSON array saved as 'tojson.json'.
# The sheet row at index 2 supplies the keys; every row from index 4 up to the
# last row becomes one object that maps those keys to the row's cell values.
import codecs
import json
from collections import OrderedDict

import xlrd

workbook = xlrd.open_workbook('aa.xls')
sheet = workbook.sheet_by_index(0)
headers = sheet.row_values(2)

records = []
for row_index in range(4, sheet.nrows):
    cells = sheet.row_values(row_index)
    # OrderedDict keeps the keys in the same order as the sheet columns.
    record = OrderedDict()
    for position, cell in enumerate(cells):
        key = headers[position]
        # Echo every key/value pair while converting.
        print(key, cell)
        record[key] = cell
    records.append(record)

# ensure_ascii=False writes non-ASCII text as-is instead of \uXXXX escapes.
payload = json.dumps(records, ensure_ascii=False)
with codecs.open('tojson.json', "w", "utf-8") as out_file:
    out_file.write(payload)
3.得到的json数据
[{"rowid": 2631.0, "name": "新疆工业职业技术学院", "code": "4265051060", "charge": "新疆维吾尔自治区", "location": "乌鲁木齐市", "level": "专科", "remark": ""}]
4.把json格式的数据导入Elasticsearch
# python 3.6
# -*- coding:utf-8 -*-
"""Upload each record of a JSON array file to Elasticsearch as its own document.

Every element of the array stored in JSON_PATH is sent with
``curl -XPUT <ES_URL>/<ES_INDEX>/_doc/<position>``, where ``<position>`` is the
element's zero-based position in the array.
"""
__author__ = 'BH8ANK'

import json
import subprocess
import sys

ES_URL = "http://172.31.250.16:9200"  # Set the Elasticsearch IP:PORT here.
ES_INDEX = "daxue04"
JSON_PATH = r"/opt/englishjson.json"


def load_records(path):
    """Return the parsed content of the UTF-8 encoded JSON file at ``path``."""
    # The context manager closes the file even when parsing fails.
    with open(path, "r", encoding="UTF-8") as source:
        return json.load(source)


def build_curl_command(doc_id, es_url=ES_URL, index=ES_INDEX):
    """Return the curl argument list that PUTs one document read from stdin.

    Args:
        doc_id: Identifier used as the last path segment of the document URL.
        es_url: Base URL of the Elasticsearch server, without trailing slash.
        index: Name of the target index.

    Returns:
        A list of arguments suitable for ``subprocess.run``; the request body
        is expected on curl's standard input (``--data-binary @-``).
    """
    url = "{}/{}/_doc/{}".format(es_url, index, doc_id)
    return [
        "curl", "-XPUT", url,
        "-H", "Content-Type: application/json",
        "--data-binary", "@-",
    ]


def upload(records):
    """Send every item of ``records`` to Elasticsearch, using its position as id.

    A non-zero curl exit status is reported on stderr and the upload continues
    with the next record.
    """
    for doc_id, record in enumerate(records):
        body = json.dumps(record, ensure_ascii=False)
        command = build_curl_command(doc_id)
        # An argument list (no shell) plus a body on stdin means quotes or
        # other shell metacharacters inside a record cannot break or alter
        # the command.
        finished = subprocess.run(command, input=body.encode("utf-8"))
        print(" ".join(command), body)
        if finished.returncode != 0:
            print(
                "curl exited with status {} for document {}".format(
                    finished.returncode, doc_id),
                file=sys.stderr,
            )


if __name__ == "__main__":
    upload(load_records(JSON_PATH))
5.查询Elasticsearch中的数据(注意:第4步的脚本写入的索引是daxue04,而下面的示例查询的是daxue05;查询时索引名需与实际写入的索引保持一致)
GET daxue05/_doc/0
返回结果:
{
  "_index" : "daxue05",
  "_type" : "_doc",
  "_id" : "0",
  "_version" : 1,
  "_seq_no" : 0,
  "_primary_term" : 1,
  "found" : true,
  "_source" : {
    "rowid" : 1.0,
    "name" : "北京大学",
    "code" : "4111010001",
    "charge" : "教育部",
    "location" : "北京市",
    "level" : "本科",
    "remark" : ""
  }
}
6.参考:https://cloud.tencent.com/developer/article/1430850
7.
用一个例子来演示会更加清晰