# python文件操作,读取,修改,合并 (Python file operations: reading, modifying, merging)
# -*- coding:utf-8 -*-
"""Turn UN/EDIFACT message specifications into flat, comma-separated rows.

Extraction starts from directory release 11C onwards (note carried over from
the original header of this script).

Workflow, everything located below the data directory ``ss``:

1. ``EDMDI1.<release>`` (the message index) is renamed to ``EDMDI1.txt`` and
   scanned for six-letter message tags.
2. Every ``<TAG>_D.<release>`` file is renamed to ``<TAG>.txt`` and the rows
   of its segment table are written to ``new/<TAG>_w.txt``.
3. All per-message files are appended to ``new/<release>.txt`` with the
   release as the first column.
"""
import os
import re

import numpy as np  # noqa: F401 -- unused here; kept because the original script imported it

year = '17A'
ss = "./data/edmd/"

# Index line of EDMDI1: a three-character marker column, then the message tag.
_TAG_RE = re.compile(r"^(?:\s{3}|X\s{2}|\W\s{2})([A-Z]{6})\s.+\n")

# Plain segment line, e.g. "00010   UNH Message header     M   1     ".
# Groups: position number, segment tag, name (unstripped), status, repeat.
_SEGMENT_RE = re.compile(
    r"^(\d{5})\s{3}([A-Z]{3})(.+)([CM])\s{3}(\d*)\s{1,}\|{0,}\n")

# Group header, e.g. "00050   ---- Segment group 1  ------ C   9--------+".
# Groups: position number, group number, status, repeat.
_GROUP_RE = re.compile(
    r"^(\d{5}).+Segment\sgroup\s(\d*).+([CM])\s{3}(\d*).+\n")

# Segment line that closes one or more groups ("+", "+|", "++|", ...),
# e.g. "00280   RNG Range details     C   1---------------+|".
# Groups: position number, segment tag, status, repeat.
_CLOSING_SEGMENT_RE = re.compile(
    r"^(\d{5})\s{3}([A-Z]{3}).+([CM])\s{3}(\d*)\-+\+{1,10}\|{0,20}\n")

# --- patterns that only drive the nesting bookkeeping ---
# Segment line ending in exactly one "+": closes the innermost group.
_CLOSE_ONE_RE = re.compile(
    r"^\d{5}\s{3}[A-Z]{3}.+[CM]\s{3}\d*\-+\+\|{0,10}\n")
# Group header ending in one "+" followed by 0..6 "|"; the number of "|"
# is the nesting depth at which the group opens.
_OPEN_GROUP_RE = re.compile(
    r"^\d{5}.+Segment\sgroup\s(\d*).+[CM]\s{3}\d*\-+\+(\|{0,6})\n")
# Line ending in 2..7 "+": that many groups close at once.
_CLOSE_MANY_RE = re.compile(
    r"^\d{5}.+[CM]\s{3}\d*\-+(\+{2,7})\|{0,20}\n")

# Slots 0..6 can be filled by _OPEN_GROUP_RE; the remaining slots always stay
# 0, which guarantees that ``levels.index(0)`` finds an entry.
_LEVEL_SLOTS = 10


def _innermost_level(levels):
    """Return the index of the innermost open group, or -1 if none is open."""
    return levels.index(0) - 1


def _parent_group(levels):
    """Return the number of the innermost open group, or 0 at top level."""
    pos = _innermost_level(levels)
    return levels[pos] if pos >= 0 else 0


def _update_levels(levels, line):
    """Apply the group openings/closings announced by *line* to *levels*."""
    if _CLOSE_ONE_RE.match(line):
        pos = _innermost_level(levels)
        if pos >= 0:
            levels[pos] = 0

    opened = _OPEN_GROUP_RE.match(line)
    if opened:
        depth = len(opened.group(2))
        # NOTE(review): as in the original, a header arriving at a depth that
        # is still occupied clears the slot instead of replacing it. This
        # looks like it can only happen for malformed input -- confirm.
        levels[depth] = opened.group(1) if levels[depth] == 0 else 0

    closed = _CLOSE_MANY_RE.match(line)
    if closed:
        pos = _innermost_level(levels)
        count = len(closed.group(1))
        # Never reach below slot 0 when more groups close than are open.
        for slot in range(max(pos - count + 1, 0), pos + 1):
            levels[slot] = 0


def parse_structure(lines):
    """Convert the lines of one message specification into row strings.

    *lines* is an iterable of newline-terminated strings. Lines matching
    none of the patterns are ignored. Row layouts:

    * segment:           ``number,SG<parent>,tag,name,status,repeat``
    * group header:      ``number,SG<parent>,group,status,repeat``
    * closing segment:   ``number,SG<parent>,tag,status,repeat``

    ``<parent>`` is the number of the innermost group open when the line is
    reached (0 at top level). It is read from the live level list each time,
    so it cannot lag behind a group that has just been opened or closed.
    """
    levels = [0] * _LEVEL_SLOTS
    rows = []
    for line in lines:
        parent = "SG%s" % _parent_group(levels)

        group = _GROUP_RE.match(line)
        if group:
            number, group_no, status, repeat = group.groups()
            rows.append(",".join((number, parent, group_no, status, repeat)))

        closing = _CLOSING_SEGMENT_RE.match(line)
        if closing:
            number, tag, status, repeat = closing.groups()
            rows.append(",".join((number, parent, tag, status, repeat)))

        _update_levels(levels, line)

        segment = _SEGMENT_RE.match(line)
        if segment:
            number, tag, name, status, repeat = segment.groups()
            parent = "SG%s" % _parent_group(levels)
            rows.append(",".join((number, parent, tag, name, status, repeat)))
    return rows


def _rename_if_present(source_path, target_path):
    """Rename *source_path* to *target_path*; report whether the target exists.

    A failing rename is tolerated because the file has usually been renamed
    by an earlier run already.
    """
    try:
        os.rename(source_path, target_path)
    except OSError:
        pass
    return os.path.exists(target_path)


def read_message_tags(index_path):
    """Return the six-letter message tags listed in the EDMDI1 index file."""
    with open(index_path) as index_file:
        matches = (_TAG_RE.match(line) for line in index_file)
        return [found.group(1) for found in matches if found]


def convert_message(source_path, target_path):
    """Write the rows of the specification *source_path* to *target_path*.

    The target is overwritten. Each row is preceded by a newline (so the file
    starts with an empty line and has no trailing newline), which is the
    layout earlier runs of this script produced.
    """
    with open(source_path) as source:
        rows = parse_structure(source)
    with open(target_path, 'w') as target:
        target.writelines("\n" + row for row in rows)


def merge_messages(tags, new_dir, release):
    """Append the per-message rows of *tags* to ``<new_dir>/<release>.txt``.

    Every row is prefixed with the release and terminated with a newline.
    Empty lines are skipped, so the last row of one message is never fused
    with the first row of the next one. The merged file is opened in append
    mode, i.e. running the script twice adds the rows twice.
    """
    merged_path = os.path.join(new_dir, '%s.txt' % release)
    with open(merged_path, 'a') as merged:
        for position, tag in enumerate(tags):
            with open(os.path.join(new_dir, '%s_w.txt' % tag)) as rows:
                for row in rows:
                    row = row.rstrip("\n")
                    if row:
                        merged.write(release + ',' + row + "\n")
            print("--%i--is ok" % position)


def main(data_dir=ss, release=year):
    """Run the whole extraction for *release* below *data_dir*."""
    new_dir = os.path.join(data_dir, 'new')
    os.makedirs(new_dir, exist_ok=True)

    index_path = os.path.join(data_dir, 'EDMDI1.txt')
    _rename_if_present(os.path.join(data_dir, 'EDMDI1.' + release), index_path)
    list_tag = read_message_tags(index_path)
    print(list_tag)

    converted = []
    for tag in list_tag:
        source_path = os.path.join(data_dir, '%s.txt' % tag)
        original_path = os.path.join(data_dir, '%s_D.%s' % (tag, release))
        if not _rename_if_present(original_path, source_path):
            # A single missing specification must not stop the other ones.
            print("skip %s: no specification file found" % tag)
            continue
        convert_message(source_path, os.path.join(new_dir, '%s_w.txt' % tag))
        converted.append(tag)

    merge_messages(converted, new_dir, release)


if __name__ == '__main__':
    main()

# Special cases: none recorded so far.