关键信息读写脚本

记录一个小的脚本

"""
    Function: extract the key info
    Author: dyx
    DateTime: 20200805
"""
import pandas as pd

# Path of the Excel workbook passed as the input file to
# AnalysisTool.get_info when this file is run as a script.
current_file = r'./data/original.xlsx'
# Path of the Excel workbook passed as the output file to
# AnalysisTool.get_info when this file is run as a script.
to_save = r'./data/abc.xlsx'


class AnalysisTool():
    """Extract key fields from text messages stored in an Excel workbook.

    ``get_info`` reads the ``sms_message`` and ``messageid`` columns of the
    input workbook, splits every message into its fields and writes the
    result to another workbook.  ``ner_deal_data`` parses the remark line of
    each message into ``[name, group, money]``.
    """

    # Amount-unit marker used by Chinese-language messages.
    # NOTE(review): in the source this literal was an empty string, which made
    # ``'' in s`` always true and ``str.split('')`` raise ValueError.  The
    # value below is inferred from the neighbouring '0 万' literal and from
    # the parallel English 'million' branch -- confirm against real data.
    CN_UNIT = '万'
    # Amount-unit marker used by English-language messages.
    EN_UNIT = 'million(s)'

    # Substrings that mark the "group" token inside a remark.
    # NOTE(review): both literals were empty strings in the source, so every
    # token matches and the last space-separated token is taken as the group.
    # Kept as-is to preserve behaviour; fill in the real keywords if known.
    GROUP_KEYWORDS = ("", "")

    # Single characters that terminate an amount inside a remark.
    # NOTE(review): both literals were empty strings in the source, so no
    # character ever matches.  Kept as-is; fill in the real unit characters
    # if known.
    MONEY_UNIT_CHARS = ("", "")

    def get_info(self, analy_file, save_file):
        """Parse every message in ``analy_file`` and write the result.

        Args:
            analy_file: Path of the Excel workbook to read.  It must contain
                the columns ``sms_message`` and ``messageid``.
            save_file: Path of the Excel workbook to write.

        Raises:
            IndexError: If a message does not have the expected layout (see
                ``_parse_message``).

        A failure while writing ``save_file`` is reported on stdout and not
        re-raised.
        """
        df = pd.read_excel(analy_file)
        texts = df['sms_message'].tolist()
        message_ids = df['messageid']

        parsed = [self._parse_message(text) for text in texts]
        # The raw remark line is the last element of every parsed tuple.
        remark_rows = self.ner_deal_data([fields[-1] for fields in parsed])

        data_set = []
        for message_id, fields, remark in zip(message_ids, parsed, remark_rows):
            data_set.append(
                (message_id, *fields[:-1], *self._remark_columns(remark), remark)
            )

        # NOTE(review): the blank and repeated headers are reproduced exactly
        # as they appear in the source; they look like lost text -- confirm.
        df = pd.DataFrame(data=data_set, columns=['item1','item2','编号','','','金额','单位',
                                                  '信息','', '日期','姓名','组名','金额','备注'])
        try:
            df.to_excel(save_file)
        except Exception as exc:
            # Best-effort write: report the reason instead of hiding it, and
            # let KeyboardInterrupt/SystemExit propagate.
            print('exe is wrong:', exc)
        else:
            print('OK. analysis result has generate!')

    def _parse_message(self, text):
        """Split one raw message into its output fields.

        The message is cut into sections on blank lines; a fourth section,
        if present, is merged into the third.  Lines 3-6 of the first
        section hold the name info, the save/get info, the guest info and
        the remark.

        Returns:
            A tuple ``(group, group_id, group_name, save_type, money_num,
            money_type, guest, save_money, put_date, remark_line)``.
            ``money_num``/``money_type`` are ``None`` when no unit marker is
            found and ``put_date`` is ``None`` when no input-date line is
            found.

        Raises:
            IndexError: If the first section has fewer than six lines, there
                are fewer than three sections, or an input-date line has no
                ``': '`` separator.
        """
        sections = text.split('\n\n')
        if len(sections) == 4:
            sections = [sections[0], sections[1], sections[2] + '\n' + sections[3]]

        header = sections[0].split('\n')
        name_info = header[2].split(' ')
        # Pad short name lines so a missing id/name becomes an empty cell
        # rather than an IndexError.
        name_info += [None] * (3 - len(name_info))
        save_get_info = header[3]
        guest = header[4].split(': ')[-1]
        remark_line = header[5]

        # Keep the non-zero amount lines, skipping balance lines.
        save_money = []
        for line in sections[1].split('\n'):
            if '0 万' not in line and '存结' not in line and self.CN_UNIT in line:
                save_money.append(line)

            if '0 million(s)' not in line and 'currency balance' not in line and 'million' in line:
                save_money.append(line)

        # Reset per message so a value never leaks from the previous row.
        put_date = None
        for line in sections[2].split('\n'):
            if '入数日期' in line or 'Input Date' in line:
                put_date = line.split(': ')[1]

        sgi = save_get_info.split(': ')[-1]
        save_type = sgi.split(' ')[0]
        # An empty save_type would make str.split raise ValueError.
        save_money_info = sgi.split(save_type)[-1] if save_type else sgi

        money_num = None
        money_type = None
        if self.CN_UNIT in save_get_info:
            parts = save_money_info.split(self.CN_UNIT)
            money_num = parts[0].strip() + self.CN_UNIT
            money_type = parts[-1].strip()
        elif 'million' in save_get_info:
            parts = save_money_info.split(self.EN_UNIT)
            money_num = parts[0].strip() + self.EN_UNIT
            money_type = parts[-1].strip()

        return (name_info[0], name_info[1], name_info[2], save_type, money_num,
                money_type, guest, save_money, put_date, remark_line)

    def _remark_columns(self, remark):
        """Map one ``ner_deal_data`` row to ``(name, group, money)`` cells.

        Rows whose first element still contains the remark label, and rows
        that were not parsed (fewer than three elements), give three ``None``
        values.
        """
        label = remark[0]
        if '备注' in label or 'Notes' in label or len(remark) < 3:
            return None, None, None
        return remark[0], remark[1], remark[2]

    def ner_deal_data(self, data):
        """Parse remark lines into ``[name, group, money]`` rows.

        Args:
            data: Iterable of remark strings.

        Returns:
            A list with one entry per input line.  A line without ``]`` is
            returned unparsed as ``[line]``.  Otherwise only the text before
            the first ``]`` is examined:

            * ``name``  -- the text after its last digit, stripped (empty
              when it contains no digit);
            * ``group`` -- the last space-separated token containing one of
              ``GROUP_KEYWORDS`` (empty when none matches);
            * ``money`` -- the first run of digits/dots that is directly
              followed by one of ``MONEY_UNIT_CHARS`` (unit included), or,
              failing that, the run of digits/dots at the very end.
        """
        all_list = []

        for line in data:
            split_line = line.split("]")
            if len(split_line) < 2:
                all_list.append([line])
                continue

            head = split_line[0]

            # Name: everything after the last digit.
            name = ""
            for idx in range(len(head) - 1, -1, -1):
                if head[idx].isdigit():
                    name = head[idx + 1:]
                    break

            # Group: the last matching token wins.
            group = ""
            for token in head.split(" "):
                if any(keyword in token for keyword in self.GROUP_KEYWORDS):
                    group = token

            # Money: any other character restarts the running amount.
            money = ""
            for char in head:
                if char.isdigit() or char == ".":
                    money += char
                elif money and char in self.MONEY_UNIT_CHARS:
                    money += char
                    break
                else:
                    money = ""

            all_list.append([name.strip(), group, money])

        return all_list


if __name__ == '__main__':
    # Script entry point: analyse the default input workbook and write the
    # extracted fields to the default output workbook.
    AnalysisTool().get_info(current_file, to_save)

 

posted @ 2020-08-05 15:11  今夜无风  阅读(151)  评论(0)  编辑  收藏  举报