进度三

今天完成了对行业和行业单位进行匹配 以及可视化,主要用到了

difflib 来进行对数据的操作
import pymysql
import difflib

def get_equal(str1, str2):
    data=difflib.SequenceMatcher(None, str1, str2).quick_ratio()

    return data


def mysql(b,code,id):
    db = pymysql.connect(host='localhost', user='root', passwd='199126', db='kettle', port=3306, charset='utf8')

    # 使用cursor()方法获取操作游标
    cur = db.cursor()

    sql = "update keji3_copy1 set hangye = '%s',hangyedaima= %s where id = %s" % (b, code, id)

    cur.execute(sql)  # 像sql语句传递参数
    # 提交
    db.commit()


def name (hynane,id):
    db1 = pymysql.connect(host='localhost', user='root', passwd='199126', db='kettle', port=3306, charset='utf8')
    cursor = db1.cursor()

    sql = "SELECT name,daima FROM data"
    cursor.execute(sql)

    datalist = []

    codelist = []
    alldata = cursor.fetchall()
    for s in alldata:
        datalist.append(s[0])
        codelist.append(s[1])
    #print(datalist)
    a=0
    b=''
    j=0
    code=''
    for i in datalist:
        j=j+1
        list=get_equal(i,hynane)
        if(list>a):
            a=list
            b=i
            code=codelist[j]
    print(b,code,id)

    return b,code,id


db = pymysql.connect(host='localhost', user='root', passwd='199126', db='kettle', port=3306, charset='utf8')
cursor = db.cursor()
sql1 = "SELECT jianjie,id FROM keji3_copy1 where name is not null and hangye is not null and industry is not null"

cursor.execute(sql1)
alldata = cursor.fetchall()
for s in alldata:
    hynane=s[0]
    id=s[1]
    print(hynane,id)
    data,data1,data2=name(hynane,id)
    mysql(data,data1,data2)
View Code

之前的关键词匹配

用到了jieba库

# encoding: utf-8
import jieba
import jieba.posseg as pseg #词性标注
import jieba.analyse as anls #关键词提取
import pymysql
def guanjian(s):
    y=''
    for x, w in anls.extract_tags(s, topK=6,withWeight=True):
        y=y+' '+x
    return y

def mysql():
    # 使用这个函数例如传入一个地址 网安大厦,当然传入的地址也不能太模糊了

    # 打开数据库连接
    db = pymysql.connect(host='localhost', user='root', passwd='199126', db='kettle', port=3306, charset='utf8')
    db1 = pymysql.connect(host='localhost', user='root', passwd='199126', db='kettle', port=3306, charset='utf8')
    # 使用 cursor() 方法创建一个游标对象 cursor
    cursor = db.cursor()
    cursor1 = db1.cursor()
    # SQL 查询语句
    sql = "SELECT * FROM keji3 where guanjianci is  null"

    try:  # 正确
        # 执行SQL语句
        cursor.execute(sql)
        # 获取所有记录列表
        # db.commit() 这是在增删改的情况下 下这条语句是提交到数据库
        results = cursor.fetchall()
        i = 0
        for row in results:
            jianjie = row[10].strip(' ')
            guanjianci=row[9]
            i+=1
            id=row[0]
            guan=guanjian(jianjie)


            try:
                sql1 = "UPDATE keji3 SET guanjianci ='%s' where id = %s" % (guan, id)

                cursor1.execute(sql1)
                # 提交到数据库执行
                db1.commit()
            except:
                print("2")

                # 打印结果
            print(guan, id)



    except:  # 发生错误
        print("1")

    # 关闭数据库连接
    db.close()
mysql()
View Code

规范化地域和行政代码用了百度地图接口

import requests
import json
import pymysql
'''
函数功能:根据传入的比较模糊的地址获取精确的结构化地址
函数实现流程:利用百度地图api的正地理编码可以获取该位置的经纬度
           接着使用经纬度采用逆地址编码获取结构化地址
'''


def get_formatted_address(address):
    # 根据百度地图api接口获取正地址编码也就是经纬度
    url1 = 'http://api.map.baidu.com/geocoding/v3/?address=' + address + '&output=json&ak=LXkcInigDnERPxfjbGZH2hKm7kr0uO5U&callback=showLocation'

    # 获取经纬度
    resp1 = requests.get(url1)
    resp1_str = resp1.text
    resp1_str = resp1_str.replace('showLocation&&showLocation', '')
    resp1_str = resp1_str[1:-1]
    resp1_json = json.loads(resp1_str)
    location = resp1_json.get('result').get('location')


    # 根据经纬度获取结构化地址
    lng = location.get('lng')
    lat = location.get('lat')
    url2 = 'http://api.map.baidu.com/reverse_geocoding/v3/?ak=LXkcInigDnERPxfjbGZH2hKm7kr0uO5U&output=json&coordtype=wgs84ll&location=' + str(
        lat) + ',' + str(lng) + ''
    resp2 = requests.get(url2)

    resp2_json = json.loads(resp2.text)
    # 提取结构化地址

    province = resp2_json.get('result').get('addressComponent').get('province')
    city=resp2_json.get('result').get('addressComponent').get('city')
    district = resp2_json.get('result').get('addressComponent').get('district')
    if(province==city):
        address=city+district
    if(province!=city):
        address=province+city+district


    gj = resp2_json.get('result').get('addressComponent').get('adcode')
    return address,gj


def mysql():
    # 使用这个函数例如传入一个地址 网安大厦,当然传入的地址也不能太模糊了

    # 打开数据库连接
    db = pymysql.connect(host='localhost', user='root', passwd='199126', db='kettle', port=3306, charset='utf8')
    db1 = pymysql.connect(host='localhost', user='root', passwd='199126', db='kettle', port=3306, charset='utf8')
    # 使用 cursor() 方法创建一个游标对象 cursor
    cursor = db.cursor()
    cursor1 = db1.cursor()
    # SQL 查询语句
    sql = "SELECT * FROM keji3_copy1 where id>1591  "

    try:  # 正确
        # 执行SQL语句
        cursor.execute(sql)
        # 获取所有记录列表
        # db.commit() 这是在增删改的情况下 下这条语句是提交到数据库
        results = cursor.fetchall()
        i = 0
        for row in results:
            danwei = row[13].split(' ')[0]
            city=row[5]
            danwei1=city+''+danwei
            id=row[0]

            address,gj = get_formatted_address(danwei1)
            try:
                sql1 = "UPDATE keji3_copy1 SET guifandiyu ='%s',xzbm=%s where id = %s"% (address, gj, id)


                cursor1.execute(sql1)
                    # 提交到数据库执行
                db1.commit()
            except:
                print("2")

            # 打印结果
            print(address,gj,id)

    except:  # 发生错误
        print("1")

    # 关闭数据库连接
    db.close()

mysql()



'''

address,gj = get_formatted_address('天津市大港油田滩海勘探开发公司')
print(address,gj)
'''
View Code

 

posted @ 2020-03-14 20:42  张利杰j  阅读(111)  评论(0编辑  收藏  举报