关键信息读写脚本
记录一个小的脚本
"""Extract key fields from transaction messages stored in an Excel sheet.

Function: extract the key info
Author: dyx
DateTime: 20200805
"""
import pandas as pd

current_file = r'./data/original.xlsx'
to_save = r'./data/abc.xlsx'


class AnalysisTool():
    """Parse the ``sms_message`` column of a workbook into separate columns."""

    # Header of the output sheet, in row order.  '金额' is used twice, once
    # for the transaction amount and once for the amount found in the remark.
    _COLUMNS = ['item1', 'item2', '编号', '用', '存', '金额', '单位',
                '信息', '结', '日期', '姓名', '组名', '金额', '备注']

    def get_info(self, analy_file, save_file):
        """Read ``analy_file``, parse every message and write ``save_file``.

        The input sheet must provide the columns ``sms_message`` and
        ``messageid``.  One output row is produced per message.

        Args:
            analy_file: Path of the Excel file to read.
            save_file: Path of the Excel file to write.

        Raises:
            KeyError: If a required column is missing from the input sheet.
            IndexError: If a message has fewer than three blank-line
                separated sections or fewer than six header lines.

        A failure while writing the result is reported on stdout and not
        re-raised.
        """
        df = pd.read_excel(analy_file)
        message = df['sms_message']
        messageid = df['messageid']

        parsed_rows = []
        raw_remarks = []
        for message_id, text in zip(messageid, message.tolist()):
            *fields, remark_info = self._parse_message(text)
            parsed_rows.append([message_id] + fields)
            raw_remarks.append(remark_info)

        dataset = []
        for row, remark in zip(parsed_rows, self.ner_deal_data(raw_remarks)):
            # ner_deal_data yields a one-element list when the remark has no
            # "]"; such remarks carry no name/group/amount to split out.
            if len(remark) < 3 or '备注' in remark[0] or 'Notes' in remark[0]:
                name = group = money = None
            else:
                name, group, money = remark
            # The last column holds the parsed remark list, not the raw text.
            dataset.append(tuple(row) + (name, group, money, remark))

        df = pd.DataFrame(data=dataset, columns=self._COLUMNS)
        try:
            df.to_excel(save_file)
        except Exception as exc:
            # Best effort: report the reason instead of hiding it, but let
            # KeyboardInterrupt / SystemExit propagate.
            print('exe is wrong:', exc)
        else:
            print('OK. analysis result has generate!')

    def _parse_message(self, text):
        """Split one message into its fields.

        Returns:
            A tuple ``(group, group_id, group_name, save_type, money_num,
            money_type, gust_info, save_money, put_date_info, remark_info)``.
            ``money_num``, ``money_type`` and ``put_date_info`` are ``None``
            when the message does not contain them.
        """
        sections = text.split('\n\n')
        if len(sections) == 4:
            # A fourth section is folded into the third one.
            sections = [sections[0], sections[1],
                        sections[2] + '\n' + sections[3]]

        header = sections[0].split('\n')
        # Pad so that an identity line with fewer than three tokens gives
        # None for the missing parts instead of raising IndexError.
        name_info = header[2].split(' ') + [None, None]
        save_get_info = header[3]
        gust_info = header[4].split(': ')[-1]
        remark_info = header[5]

        save_money = []
        for entry in sections[1].split('\n'):
            # The two tests are independent: a line matching both the
            # Chinese and the English pattern is recorded twice.
            if '0 万' not in entry and '存结' not in entry and '万' in entry:
                save_money.append(entry)
            if ('0 million(s)' not in entry
                    and 'currency balance' not in entry
                    and 'million' in entry):
                save_money.append(entry)

        # Reset per message so a missing date never inherits the value of
        # the previous message.  The last matching line wins.
        put_date_info = None
        for entry in sections[2].split('\n'):
            if '入数日期' in entry or 'Input Date' in entry:
                pieces = entry.split(': ')
                if len(pieces) > 1:
                    put_date_info = pieces[1]

        sgi = save_get_info.split(': ')[-1]
        save_type = sgi.split(' ')[0]
        # str.split('') raises ValueError, so guard the empty keyword.
        save_money_info = sgi.split(save_type)[-1] if save_type else sgi

        # Reset per message for the same reason as put_date_info.
        money_num = None
        money_type = None
        if '万' in save_get_info:
            smi = save_money_info.split('万')
            money_num = smi[0].strip() + '万'
            money_type = smi[-1].strip()
        elif 'million' in save_get_info:
            smi = save_money_info.split('million(s)')
            money_num = smi[0].strip() + 'million(s)'
            money_type = smi[-1].strip()

        return (name_info[0], name_info[1], name_info[2], save_type,
                money_num, money_type, gust_info, save_money,
                put_date_info, remark_info)

    def ner_deal_data(self, data):
        """Pull name, group and amount out of each remark line.

        Only the text before the first "]" is inspected.

        Args:
            data: Iterable of remark strings.

        Returns:
            One list per input line: ``[line]`` unchanged when the line has
            no "]", otherwise ``[name, group, money]`` where

            * ``name`` is the stripped text after the last digit ("" when
              there is no digit),
            * ``group`` is the last space-separated token containing "組"
              or "组" ("" when there is none),
            * ``money`` is the first run of digits/dots directly followed by
              "万" or "萬", including that character; when no such run
              exists it is the trailing run of digits/dots, possibly "".
        """
        all_list = []
        for line in data:
            split_line = line.split("]")
            if len(split_line) < 2:
                all_list.append([line])
                continue
            head = split_line[0]

            # Walk backwards to the last digit; the name is what follows it.
            name = ""
            for pos in range(len(head) - 1, -1, -1):
                if head[pos].isdigit():
                    name = head[pos + 1:]
                    break

            group = ""
            for token in head.split(" "):
                if "組" in token or "组" in token:
                    group = token

            money = ""
            for char in head:
                if char.isdigit() or char == ".":
                    money += char
                elif char in ("万", "萬") and money:
                    money += char
                    break
                else:
                    # Any other character ends the current numeric run.
                    money = ""

            all_list.append([name.strip(), group, money])
        return all_list


if __name__ == '__main__':
    at = AnalysisTool()
    # execute
    at.get_info(current_file, to_save)
时刻记着自己要成为什么样的人!