# 网址: https://ai.baidu.com/easydl/app/deploy/tee/public
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @mail    : lshan523@163.com
# @Time    : 2022/9/7 11:53
# @Author  : Sea
# @File    : 文本抽取.py
# @history:
# ****************************
"""Generate random log-like records and entity-span annotations for them.

A plain record looks like::

    time:2021-03-26 10:20:12 operator:Sea milestone:SAD bookingNo:222231321212

The annotated variant labels each field and records, for every value, the
inclusive ``[start,end]`` character span it occupies in the text.
"""
import random
import time
from datetime import datetime, timedelta

import pandas as pd

# Closed sets the random generators draw from.
_MILESTONES = ("BKD", "DEP", "RCF", "DDL", "DL1", "RCS", "OTH", "CCD", "EXP")
_OPERATORS = ("Sea", "Jeff", "Zero", "Dana", "stiff", "jack", "Ryan", "Tom",
              "Jerry", "happy", "Mini", "Syan", "Joan")

# Maximum distance (seconds, either direction) of a generated time from now.
_TIME_JITTER_SECONDS = 100000000


def gen_no():
    """Return a 19-digit number: the current local timestamp plus 5 random digits."""
    tail = str(random.randint(1, 9)) + str(random.randint(1000, 9999))
    return time.strftime('%Y%m%d%H%M%S', time.localtime(time.time())) + tail


def gen_time():
    """Return a random local time near now, formatted ``YYYY-mm-dd HH:MM:SS``."""
    offset = random.randint(-_TIME_JITTER_SECONDS, _TIME_JITTER_SECONDS)
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time() + offset))


def gen_miles():
    """Return a randomly chosen milestone code."""
    return random.choice(_MILESTONES)


def gen_operator():
    """Return a randomly chosen operator name."""
    return random.choice(_OPERATORS)


def prepare_data():
    """Return one random record as a single ``key:value``-style line (no newline)."""
    return (f"time:{gen_time()} operator:{gen_operator()}"
            f" milestone:{gen_miles()} bookingNo:{gen_no()}")


def write_data(path="xxx.txt", count=10):
    """Append ``count`` random records to ``path``, one per line.

    Args:
        path: Text file to append to (created if missing).
        count: Number of records to write.
    """
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(path, mode='a+', encoding="utf-8") as file:
        file.writelines(prepare_data() + "\n" for _ in range(count))


def build_marked_row(fields, padding=" " * 3):
    """Build one labelled text line plus the span annotation of every value.

    Args:
        fields: Iterable of ``(label, value)`` string pairs, in output order.
        padding: Text appended after each value.

    Returns:
        ``(text, annotations)`` where ``text`` is the concatenation of
        ``label + ":" + value + padding`` for every field and ``annotations``
        holds one ``"[start,end],label"`` string per field; ``start``/``end``
        are the inclusive character offsets of the value inside ``text``.
    """
    pieces = []
    annotations = []
    cursor = 0  # length of the text assembled so far
    for label, value in fields:
        prefix = label + ":"
        # Derive the offset from the running length instead of str.find():
        # find() returns the FIRST occurrence, which is wrong whenever the
        # value also occurs inside an earlier field or label.
        start = cursor + len(prefix)
        end = start + len(value) - 1
        annotations.append(f"[{start},{end}],{label}")
        segment = prefix + value + padding
        pieces.append(segment)
        cursor += len(segment)
    return "".join(pieces), annotations


def build_marked_dataset(rows=1000):
    """Return a column-oriented dict of ``rows`` random annotated records.

    The keys are the spreadsheet headers: the text column followed by four
    entity-annotation columns (time, operator, milestone, booking number).
    """
    data_set = {
        '文本内容': [],
        '实体标注1': [],
        '实体标注2': [],
        '实体标注3': [],
        '实体标注4': [],
    }
    mark_columns = ('实体标注1', '实体标注2', '实体标注3', '实体标注4')
    for _ in range(rows):
        fields = [
            ("时间", gen_time()),
            ("操作者", gen_operator()),
            ("里程碑", gen_miles()),
            ("单号", gen_no()),
        ]
        text, annotations = build_marked_row(fields)
        data_set['文本内容'].append(text)
        for column, annotation in zip(mark_columns, annotations):
            data_set[column].append(annotation)
    return data_set


def write_date_to_excel_marked(rows=1000, path=None):
    """Write random annotated records to an ``.xlsx`` workbook.

    Args:
        rows: Number of records to generate.
        path: Output file; defaults to ``demo-YYYYMMDD.xlsx`` named after
            yesterday's date.
    """
    if path is None:
        yesterday = datetime.now().date() - timedelta(days=1)
        path = 'demo-%d%02d%02d.xlsx' % (yesterday.year, yesterday.month, yesterday.day)

    # ExcelWriter.save() was removed in pandas 2.0; the context manager
    # saves and closes the workbook on exit.
    with pd.ExcelWriter(path=path, mode="w", engine='xlsxwriter') as writer:
        pd.DataFrame(build_marked_dataset(rows)).to_excel(
            writer, sheet_name='Sheet1', index=False, header=True, startrow=0)

        workbook = writer.book
        worksheet1 = writer.sheets['Sheet1']
        fmt = workbook.add_format({"font_name": u"宋体"})
        # Column widths and font.
        # NOTE(review): the second range also covers column B, so B presumably
        # ends up 70 wide — confirm whether 'A:A' was intended.
        worksheet1.set_column('B:F', 20, fmt)
        worksheet1.set_column('A:B', 70, fmt)


if __name__ == '__main__':
    write_date_to_excel_marked()