b4和tncl_extract_UNCL_new
# -*- coding:utf-8 -*-
"""Extract code-list entries from a UNCL text file and derive two CSV files.

Adapted for the newer layout of the UNCL file.

Notes:
    1) After the extension of the 17A file has been changed, the file must be
       converted to UTF-8 without BOM, otherwise it is not processed
       correctly.
    2) The input is therefore opened with ``encoding='utf-8'``.
"""
import re


year = '17A'  # user-defined release tag
ss = './data/'  # root directory for the input and output files
filename = ss + 'UNCL%s.txt' % year  # input file name


def tncl_note():
    """Scan ``filename`` and append the extracted entries to ``tncl_ori<year>.txt``.

    The input is read line by line and four kinds of lines are recognised:

    * an id line (four digits, two blanks, an upper-case letter, ending in
      ``]``) selects the id used for all following entries;
    * an entry line (``<tag>  <name>``) starts a new output record
      ``<id>,<tag>,<name>,"`` and opens a quoted text field;
    * a text line (indented by 14 blanks) is appended to the open text field;
      consecutive text lines are concatenated with no separator;
    * a ``Note:`` line (indented by 11 blanks) closes the current text field
      and opens a second quoted field for the note text.

    Every record starts by writing ``"`` plus a newline, which closes the
    quoted field of the previous record.  As a consequence the first line
    written by a run is a lone ``"``.  A final ``"`` is written at the end to
    close the last record.

    The output file is opened in append mode ('a' appends, 'w' would
    overwrite), so running the function again adds to an existing file.

    Raises:
        OSError: if the input file cannot be opened or the output file cannot
            be written.
    """
    # Id and entry lines start with five blanks, or with a one-character
    # marker ("X" or any non-word character) followed by four blanks.
    id_re = re.compile(r"^(?:\s{5}|X\s{4}|\W\s{4})(\d\d\d\d)\s\s[A-Z].+\]$")
    # Group 1 is the tag, group 2 the name of an entry.
    entry_re = re.compile(r"^(?:\s{5}|X\s{4}|\W\s{4})(\w+)\s+(\w.+)\n")
    # Description text and note text share the same 14-blank indentation.
    text_re = re.compile(r"^\s{14}([^ ].+)\n")
    note_re = re.compile(r"^\s{11}Note:\s\n")

    current_id = ''      # id taken from the most recent id line
    entry_open = False   # an entry record has been started for current_id
    text_seen = False    # at least one text line was written for that entry

    with open(filename, encoding='utf-8') as src, \
            open(ss + 'tncl_ori%s.txt' % year, 'a') as out:
        for line in src:
            id_match = id_re.match(line)
            if id_match:
                current_id = id_match.group(1)
                # Text lines that follow an id line belong to the id itself,
                # not to an entry, so they must be ignored until the next
                # entry line is seen.
                entry_open = False
                text_seen = False
                continue

            entry_match = entry_re.match(line)
            # An entry is accepted in any state once an id is known.  This
            # must stay unrestricted: an entry without any text line is
            # directly followed by the next entry.
            if entry_match and current_id != '':
                tag, name = entry_match.groups()
                out.write('"\n' + current_id + ',' + tag + ',' + name + ',"')
                entry_open = True
                text_seen = False
                continue

            text_match = text_re.match(line)
            if text_match and entry_open:
                out.write(text_match.group(1))
                text_seen = True
                continue

            # A note only counts after some description text was written.
            if note_re.match(line) and text_seen:
                out.write('","')

        out.write('"')


def join():
    """Derive ``tred<year>.csv`` and ``b4_<year>.csv`` from ``tncl_ori<year>.txt``.

    The first line of the input is skipped (``tncl_note`` starts a fresh file
    with a lone ``"``).  For every other line, whose first two
    comma-separated fields are taken as id and tag:

    * ``tred<year>.csv`` receives ``<id>_<tag>,<original line>``;
    * ``b4_<year>.csv`` receives ``<id>_<tag>,<tag>,<year>``.

    Both output files are opened in append mode.

    Raises:
        OSError: if a file cannot be opened or written.
        IndexError: if a processed line contains no comma.
    """
    with open(ss + 'tncl_ori%s.txt' % year) as src:
        records = src.readlines()

    with open(ss + 'tred%s.csv' % year, 'a') as tred_out, \
            open(ss + 'b4_%s.csv' % year, 'a') as b4_out:
        for record in records[1:]:
            fields = record.split(',')
            key = "%s_%s" % (fields[0], fields[1])
            tred_out.write("%s,%s\n" % (key, record.strip('\n')))
            b4_out.write("%s,%s,%s\n" % (key, fields[1], year))


if __name__ == '__main__':
    tncl_note()
    join()