算法调参 weight_ratio, weight_seqratio

 

 

 

 

 

 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
from openpyxl import Workbook
import xlrd
import time
import Levenshtein as Le
 
# Only rows whose city column appears in this list are processed by main_().
target_city_list = ['北京市', '上海市', '深圳市', '广州市']
# Base name of the input workbook; also the prefix of the output file names.
source_name = 'JMTool任务_csv_py_wholeCSV-加百度170826165729'
# Separators inside the BDpoi_list cell: the first splits the cell into
# candidates, the second splits one candidate into name and address.
BDpoi_list_tag, BDpoi_list_tagb = '|-|', '|--|'
# Input workbook file name handed to xlrd.open_workbook().
FEXCEL = '%s%s' % (source_name, '.xlsx')

# Tunable weights: final score = weight_ratio * Le.ratio(...) +
# weight_seqratio * Le.seqratio(...).
weight_ratio, weight_seqratio = 0.7, 0.3
 
 
def main_():
    """Score the BDpoi_list candidates of every source row and export two workbooks.

    Reads the first sheet of ``FEXCEL``. Every row must hold exactly nine
    cells (dbid, area_code, ref_area_type_code, city, district, address,
    city_street, name_, BDpoi_list); any other width raises ``ValueError``
    from the tuple unpacking. Header rows (``dbid == 'dbid'``) and rows whose
    city is not in ``target_city_list`` are skipped.

    For each candidate found in ``BDpoi_list`` the combined score is
    ``weight_ratio * Le.ratio(name_, BD_name) +
    weight_seqratio * Le.seqratio([name_, address], [BD_name, BD_addr])``.

    Two files are written to the current directory:

    * ``<source_name>-Levenshtein<ts>.xlsx`` -- every result row, grouped by
      city / district / name, candidates of one source row in ascending
      score order.
    * ``<first file name>-Levenshtein-ordered<ts>.xlsx`` -- one row per
      city / district / name: the highest-scoring result row of that group.

    Changes compared with the previous version:

    * a source row whose ``BDpoi_list`` contains the separator but no
      non-empty candidate now yields a placeholder row instead of nothing
      (previously the group could stay empty and ``l[-1]`` raised
      ``IndexError``);
    * the second workbook picks the best-scoring row of the whole name group
      instead of blindly taking the last appended row, which could be a
      placeholder or the best row of only the last source row.
    """
    # Index of ratio_seqratio_res inside a result row (9 source cells,
    # then BD_name, BD_addr, score).
    score_col = 11

    def _score(result_row):
        # Placeholder rows carry '' in the score column; rank them below
        # every real score.
        value = result_row[score_col]
        return float('-inf') if value == '' else value

    def _new_sheet(header):
        # Create a workbook whose active sheet starts with the header row.
        book = Workbook()
        sheet_ = book.active
        sheet_.append(header)
        return book, sheet_

    table = xlrd.open_workbook(FEXCEL).sheets()[0]
    res_dic = {}
    for row_idx in range(table.nrows):
        (dbid, area_code, ref_area_type_code, city, district, address,
         city_street, name_, BDpoi_list) = table.row_values(row_idx)
        if dbid == 'dbid' or city not in target_city_list:
            continue

        group = res_dic.setdefault(city, {}).setdefault(district, {}).setdefault(name_, [])
        base = (dbid, area_code, ref_area_type_code, city, district, address,
                city_street, name_, BDpoi_list)
        # Same (shorter than the header) layout the original wrote for rows
        # without candidates.
        placeholder = base + ('', '', '')

        if BDpoi_list_tag not in BDpoi_list:
            group.append(placeholder)
            continue

        addr_ = '%s%s%s%s' % (city, district, address, city_street)
        chk_name_list = [name_, addr_]
        scored = []
        for candidate in BDpoi_list.split(BDpoi_list_tag):
            if not candidate:
                continue
            parts = candidate.split(BDpoi_list_tagb)
            BD_name, BD_addr = parts[0], ''
            if len(parts) == 2:
                # Strip city/district from the candidate address so both
                # sides are compared on the same footing.
                BD_addr = parts[1].replace(city, '').replace(district, '')
            ratio_res = Le.ratio(name_, BD_name)
            seqratio_res = Le.seqratio(chk_name_list, [BD_name, BD_addr])
            ratio_seqratio_res = weight_ratio * ratio_res + weight_seqratio * seqratio_res
            scored.append(base + (BD_name, BD_addr, ratio_seqratio_res, ratio_res, seqratio_res))

        if not scored:
            # Separator present but every candidate was empty: keep the row.
            group.append(placeholder)
            continue

        # list.sort is stable, so equal scores keep their input order,
        # exactly like the former score-keyed dict did.
        scored.sort(key=_score)
        group.extend(scored)

    # --- workbook 1: every result row -----------------------------------
    wb, worksheet = _new_sheet([
        'dbid', 'area_code', 'ref_area_type_code', 'city', 'district',
        'address', 'city_street', 'name_', 'BDpoi_list', 'BD_name', 'BD_addr',
        'ratio_seqratio_res', 'ratio_res', 'seqratio_res',
    ])
    for districts in res_dic.values():
        for names in districts.values():
            for rows in names.values():
                for result_row in rows:
                    worksheet.append(result_row)
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    file_name = '%s%s%s' % (source_name, '-Levenshtein', localtime_)
    wb.save('%s%s' % (file_name, '.xlsx'))

    # --- workbook 2: best result row per city / district / name ----------
    wb, worksheet = _new_sheet([
        'dbid', 'area_code', 'ref_area_type_code', 'city', 'district',
        'address', 'city_street', 'name_', 'BDpoi_list', 'max_BD_name',
        'max_BD_addr', 'max_ratio_seqratio_res', 'ratio_res', 'seqratio_res',
    ])
    for districts in res_dic.values():
        for names in districts.values():
            for rows in names.values():
                # Scanning in reverse makes ties resolve to the most recently
                # appended row, matching the old ``l[-1]`` choice whenever
                # that row really was the best one.
                worksheet.append(max(reversed(rows), key=_score))
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    # The second name deliberately extends the first one, as before.
    file_name = '%s%s%s' % (file_name, '-Levenshtein-ordered', localtime_)
    wb.save('%s%s' % (file_name, '.xlsx'))
 
# Run the job at module load; there is no __main__ guard, so importing this
# file triggers it as well.
main_()

  

 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from openpyxl import Workbook
import xlrd
import time
import Levenshtein as Le
 
# Only rows whose city column appears in this list are processed by main_().
target_city_list = ['深圳市']
# Separators inside the BDpoi_list cell: the first splits the cell into
# candidates, the second splits one candidate into name and address.
BDpoi_list_tag, BDpoi_list_tagb = '|-|', '|--|'

# Base name of the input workbook; also the prefix of the output file names.
source_name = 'JMTool任务_csv_py_wholeCSV_住宅小区-加百度170826152533'
# Input workbook file name handed to xlrd.open_workbook().
FEXCEL = '%s%s' % (source_name, '.xlsx')
# Tunable weights: final score = weight_ratio * Le.ratio(...) +
# weight_seqratio * Le.seqratio(...).
weight_ratio, weight_seqratio = 0.7, 0.3
 
 
def main_():
    """Score the BDpoi_list candidates of every source row and export two workbooks.

    Reads the first sheet of ``FEXCEL``. Every row must hold exactly ten
    cells (dbid, area_code, ref_area_type_code, city, district, address,
    city_street, name_, name_reduction, BDpoi_list); any other width raises
    ``ValueError`` from the tuple unpacking. Header rows
    (``dbid == 'dbid'``) and rows whose city is not in ``target_city_list``
    are skipped.

    Candidates are compared against ``name_reduction`` (not ``name_``):
    ``weight_ratio * Le.ratio(name_reduction, BD_name) +
    weight_seqratio * Le.seqratio([name_reduction, address], [BD_name, BD_addr])``.
    Results are still grouped by ``name_``.

    Two files are written to the current directory:

    * ``<source_name>-Levenshtein<ts>.xlsx`` -- every result row, grouped by
      city / district / name, candidates of one source row in ascending
      score order.
    * ``<first file name>-Levenshtein-ordered<ts>.xlsx`` -- one row per
      city / district / name: the highest-scoring result row of that group.

    Changes compared with the previous version:

    * a source row whose ``BDpoi_list`` contains the separator but no
      non-empty candidate now yields a placeholder row instead of nothing
      (previously the group could stay empty and ``l[-1]`` raised
      ``IndexError``);
    * the second workbook picks the best-scoring row of the whole name group
      instead of blindly taking the last appended row, which could be a
      placeholder or the best row of only the last source row.
    """
    # Index of ratio_seqratio_res inside a result row (10 source cells,
    # then BD_name, BD_addr, score).
    score_col = 12

    def _score(result_row):
        # Placeholder rows carry '' in the score column; rank them below
        # every real score.
        value = result_row[score_col]
        return float('-inf') if value == '' else value

    def _new_sheet(header):
        # Create a workbook whose active sheet starts with the header row.
        book = Workbook()
        sheet_ = book.active
        sheet_.append(header)
        return book, sheet_

    table = xlrd.open_workbook(FEXCEL).sheets()[0]
    res_dic = {}
    for row_idx in range(table.nrows):
        (dbid, area_code, ref_area_type_code, city, district, address,
         city_street, name_, name_reduction, BDpoi_list) = table.row_values(row_idx)
        if dbid == 'dbid' or city not in target_city_list:
            continue

        group = res_dic.setdefault(city, {}).setdefault(district, {}).setdefault(name_, [])
        base = (dbid, area_code, ref_area_type_code, city, district, address,
                city_street, name_, name_reduction, BDpoi_list)
        # Same (shorter than the header) layout the original wrote for rows
        # without candidates.
        placeholder = base + ('', '', '')

        if BDpoi_list_tag not in BDpoi_list:
            group.append(placeholder)
            continue

        addr_ = '%s%s%s%s' % (city, district, address, city_street)
        chk_name_list = [name_reduction, addr_]
        scored = []
        for candidate in BDpoi_list.split(BDpoi_list_tag):
            if not candidate:
                continue
            parts = candidate.split(BDpoi_list_tagb)
            BD_name, BD_addr = parts[0], ''
            if len(parts) == 2:
                # Strip city/district from the candidate address so both
                # sides are compared on the same footing.
                BD_addr = parts[1].replace(city, '').replace(district, '')
            ratio_res = Le.ratio(name_reduction, BD_name)
            seqratio_res = Le.seqratio(chk_name_list, [BD_name, BD_addr])
            ratio_seqratio_res = weight_ratio * ratio_res + weight_seqratio * seqratio_res
            scored.append(base + (BD_name, BD_addr, ratio_seqratio_res, ratio_res, seqratio_res))

        if not scored:
            # Separator present but every candidate was empty: keep the row.
            group.append(placeholder)
            continue

        # list.sort is stable, so equal scores keep their input order,
        # exactly like the former score-keyed dict did.
        scored.sort(key=_score)
        group.extend(scored)

    # --- workbook 1: every result row -----------------------------------
    wb, worksheet = _new_sheet([
        'dbid', 'area_code', 'ref_area_type_code', 'city', 'district',
        'address', 'city_street', 'name_', 'name_reduction', 'BDpoi_list',
        'BD_name', 'BD_addr', 'ratio_seqratio_res', 'ratio_res', 'seqratio_res',
    ])
    for districts in res_dic.values():
        for names in districts.values():
            for rows in names.values():
                for result_row in rows:
                    worksheet.append(result_row)
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    file_name = '%s%s%s' % (source_name, '-Levenshtein', localtime_)
    wb.save('%s%s' % (file_name, '.xlsx'))

    # --- workbook 2: best result row per city / district / name ----------
    wb, worksheet = _new_sheet([
        'dbid', 'area_code', 'ref_area_type_code', 'city', 'district',
        'address', 'city_street', 'name_', 'name_reduction', 'BDpoi_list',
        'max_BD_name', 'max_BD_addr', 'max_ratio_seqratio_res', 'ratio_res',
        'seqratio_res',
    ])
    for districts in res_dic.values():
        for names in districts.values():
            for rows in names.values():
                # Scanning in reverse makes ties resolve to the most recently
                # appended row, matching the old ``l[-1]`` choice whenever
                # that row really was the best one.
                worksheet.append(max(reversed(rows), key=_score))
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    # The second name deliberately extends the first one, as before.
    file_name = '%s%s%s' % (file_name, '-Levenshtein-ordered', localtime_)
    wb.save('%s%s' % (file_name, '.xlsx'))
 
# Run the job at module load; there is no __main__ guard, so importing this
# file triggers it as well.
main_()

  

 

posted @   papering  阅读(433)  评论(0编辑  收藏  举报
编辑推荐:
· AI与.NET技术实操系列:向量存储与相似性搜索在 .NET 中的实现
· 基于Microsoft.Extensions.AI核心库实现RAG应用
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· 开发者必知的日志记录最佳实践
· SQL Server 2025 AI相关能力初探
阅读排行:
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· winform 绘制太阳,地球,月球 运作规律
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 上周热点回顾(3.3-3.9)
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人
历史上的今天:
2016-08-26 :avalon及通用MVVM的设计原理分析
2016-08-26 php.exe
2016-08-26 the command line tools
点击右上角即可分享
微信分享提示