python网页爬虫 spiders_97A-04B
1 import urllib 2 import urllib.request 3 import bs4 4 from bs4 import BeautifulSoup as bs 5 import re 6 import os 7 8 # year = '97A' 9 # ss="./data/%s/"%year 10 ''' 11 适应网页爬取95B-96B 12 13 ''' 14 15 16 ''' 17 解决网页请求失败 18 resp = None 19 while (resp == None): 20 try: 21 resp = urllib.request.urlopen("http://baidu.com 22 23 ") 24 except: 25 pass 26 27 ''' 28 def b0_trmd(year,ss): 29 if not os.path.exists(ss): 30 os.makedirs(ss) 31 # os.makedirs(ss) 32 p1=r"^([A-Z]{6})" 33 34 url = "http://www.stylusstudio.com/edifact/D%s/messages.htm"%year 35 resp=None 36 while(resp==None): 37 try: 38 resp = urllib.request.urlopen(url) 39 except: 40 pass 41 data = resp.read().decode('cp852') 42 soup = bs(data, 'html.parser') 43 segment11= soup.find_all('table')# ResultSet 44 segment1=segment11[0].find_all('td')[1:]#表示第几个table,此时表示进去html网页中的第7个table,[1:],<class 'list'> 45 # segment2= soup.find_all('table') 46 # print(type(segment1))# 47 f2=open(ss+'./trmd1%s.txt'%year,'a',encoding='utf-8') 48 f3=open(ss+'./b0%s.txt'%year,'a',encoding='utf-8') 49 f4=open(ss+'./trmd%s.txt'%year,'a',encoding='utf-8') 50 pattern1=re.compile(p1) 51 tag_list=[] 52 for item in segment1: 53 # print(item.string)#如果一个标签里面没有标签了,那么 .string 就会返回标签里面的内容。如果标签里面只有唯一的一个标签了,那么 .string 也会返回最里面的内容。 54 str1=item.get_text() 55 # if str1.strip()=="":用于判断字符串是否含空格 56 # break 57 if item.string==None: 58 # print("hhusssssssssssssssssssss") 59 break 60 matcher1=re.findall(pattern1,str1) 61 if matcher1: 62 63 f3.write(matcher1[0]+','+year+'\n') 64 tag_list.append(matcher1[0]) 65 f4.write(matcher1[0]+',') 66 else: 67 f4.write(str1+'\n') 68 69 70 # print(type(str1)) 71 # test1(str1) 72 # print(str1)#以文本方式呈现 73 74 # print(item.get_text())#获取具体标签内部内容 75 # print([text for text in item.stripped_strings] )#以列表方式呈现 76 77 # str2=str([text for text in item.stripped_strings]) 78 # #print(type(str1[0][0])) 79 f2.writelines(str1+'\n') 80 f2.close() 81 return tag_list 82 def test1(code_tag,year,ss): 83 84 url = "http://www.stylusstudio.com/edifact/D%s/%s.htm"%(year,code_tag) 85 resp=None 86 while(resp==None): 87 try: 88 resp = urllib.request.urlopen(url) 89 except: 90 pass 91 data = resp.read().decode('UTF-8') 92 soup = bs(data, 'html.parser') 93 segment11= soup.find_all('table') 94 segment1=segment11[6].find_all('tr')#表示第几个table,此时表示进去html网页中的第7个table 95 96 97 f2=open(ss+'./text1%s%s.txt'%(year,code_tag),'a',encoding='cp852') 98 for item in segment1: 99 100 # #print(item) 101 ''' 102 <tr class="FrameTreeFont"><td><span class="FrameDrawFont">│ 103 <span class="FrameHideFont">─</span>│<span class="FrameHideFont">─</span>├─</span> 104 <a class="FrameItemFont" href="DAM_.htm" target="classFrame" title="Damage">DAM</a> 105 Damage</td><td align="right"><span class="FrameDetailFont"> ×1 106 </span></td><td><span class="FrameDetailFont">(M)</span></td></tr> 107 ''' 108 str12=item.get_text() 109 # #print(str12)#以文本方式呈现 110 # #print(type(str12)) 111 ''' 112 │─│─├─DAM Damage ×1 (M) 113 ''' 114 # #print(item.td.span.get_text())#获取具体标签内部内容 115 # #print([text for text in item.stripped_strings] )#以列表方式呈现 116 ''' 117 ['│', '─', '│', '─', '├─', 'DAM', 'Damage', '×1', '(M)'] 118 ''' 119 ''' 120 soup.get_text("|")#u'\nI linked to |example.com|\n'进一步,通过strip去除掉文本每个位的头尾空白。 121 122 soup.get_text("|", strip=True)#u'I linked to|example.com' 123 ''' 124 str1=str([text for text in item.stripped_strings]) 125 # #print(type(str1[0][0])) 126 f2.writelines(str12+'\n') 127 128 f2.close() 129 def test2(code_tag,year,ss): 130 # p1=r"^(?:├─|└─)(.+)\n" 131 p1=r"^\W{2}(\w.+)\n"# 132 # p1=r"^\W{2}(Segment\sGroup\s\w.+)\n"#segement为第一层 133 # p2=r"^(?:│─├─|│─└─)(.+)\n" 134 p2=r"^\W{4}(\w.+)\n" 135 # p3=r"^(?:│───├─|│───└─|│─│─├─|│─│─└─)(.+)\n" 136 p3=r"^\W{6}(\w.+)\n" 137 # p4=r"^(?:)(.+)\n" 138 139 p4=r"^\W{8}(\w.+)\n" 140 p5=r"^\W{10}(\w.+)\n" 141 p6=r"^\W{12}(\w.+)\n" 142 p7=r"^\W{14}(\w.+)\n" 143 p8=r"^\W{16}(\w.+)\n" 144 145 p9=r"Segment\sGroup\s(?:([0-9]|[0-9][0-9]))" 146 # p10="Segment Group " 147 148 149 150 pattern1=re.compile(p1) 151 pattern2=re.compile(p2) 152 pattern3=re.compile(p3) 153 pattern4=re.compile(p4) 154 155 pattern5=re.compile(p5) 156 pattern6=re.compile(p6) 157 pattern7=re.compile(p7) 158 pattern8=re.compile(p8) 159 pattern9=re.compile(p9) 160 # pattern10=re.compile(p10) 161 162 f1=open(ss+'./text1%s%s.txt'%(year,code_tag),'r',encoding='cp852') 163 f2=open(ss+'./text2%s%s.txt'%(year,code_tag),'a',encoding='utf-8') 164 # c=int() 165 # d=int() 166 listp=[0,0,0,0,0,0,0,0]#用于记录父节点 167 for line in f1.readlines(): 168 169 matcher1=re.findall(pattern1,line) 170 matcher2=re.findall(pattern2,line) 171 matcher3=re.findall(pattern3,line) 172 matcher4=re.findall(pattern4,line) 173 174 matcher5=re.findall(pattern5,line) 175 matcher6=re.findall(pattern6,line) 176 matcher7=re.findall(pattern7,line) 177 matcher8=re.findall(pattern8,line) 178 matcher9=re.findall(pattern9,line) 179 # #print(type(matcher1)) 180 181 if matcher1: 182 183 a='SG'+str(listp[0])+' '+matcher1[0]+'\n' 184 f2.write(a) 185 if matcher9: 186 listp[1]=matcher9[0] 187 if matcher2: 188 189 b='SG'+str(listp[1])+' '+matcher2[0]+'\n' 190 f2.write(b) 191 if matcher9: 192 listp[2]=matcher9[0] 193 if matcher3: 194 195 c='SG'+str(listp[2])+' '+matcher3[0]+'\n' 196 f2.write(c) 197 #print(c) 198 if matcher9: 199 listp[3]=matcher9[0] 200 if matcher4: 201 d='SG'+str(listp[3])+' '+matcher4[0]+'\n' 202 f2.write(d) 203 #print(d) 204 if matcher9: 205 listp[4]=matcher9[0] 206 if matcher5: 207 e='SG'+str(listp[4])+' '+matcher5[0]+'\n' 208 f2.write(e) 209 #print(d) 210 if matcher9: 211 listp[5]=matcher9[0] 212 if matcher6: 213 f='SG'+str(listp[5])+' '+matcher6[0]+'\n' 214 f2.write(f) 215 #print(d) 216 if matcher9: 217 listp[6]=matcher9[0] 218 if matcher7: 219 g='SG'+str(listp[6])+' '+matcher7[0]+'\n' 220 f2.write(g) 221 #print(d) 222 if matcher9: 223 listp[7]=matcher9[0] 224 if matcher8: 225 h='SG'+str(listp[7])+' '+matcher8[0]+'\n' 226 f2.write(h) 227 #print(d) 228 if matcher9: 229 listp[8]=matcher9[0] 230 f2.close() 231 f1.close() 232 f3=open(ss+'./text3%s%s.txt'%(year,code_tag),'w',encoding='utf-8') 233 f4=open(ss+'./text2%s%s.txt'%(year,code_tag),'r',encoding='utf-8') 234 for line1 in f4.readlines(): 235 #print(line1) 236 # f3.write(line1.replace(" "," ")) 237 f3.write(line1.replace("Segment Group ","SG")) 238 f4.close() 239 f3.close() 240 def test3(code_tag,year,ss): 241 f5=open(ss+'./text4%s%s.txt'%(year,code_tag),'a',encoding='utf-8') 242 f6=open(ss+'./text3%s%s.txt'%(year,code_tag),'r',encoding='utf-8') 243 p10=r"(^\w{3})\s(\w{3}).+×([0-9]|[0-9]{2}|[0-9]{3}|[0-9]{4}|[0-9]{5})\s\((\w)\)$" 244 pattern10=re.compile(p10) 245 i=0 246 for line2 in f6.readlines(): 247 i=i+1 248 matcher10=re.findall(pattern10,line2) 249 # print(matcher10) 250 # print(type(matcher10)) 251 if matcher10: 252 f5.write(str(matcher10[0])+'\n') 253 254 f5.close() 255 f6.close() 256 # print(i) 257 return i 258 def test4(code_tag,year,ss): 259 url = "http://www.stylusstudio.com/edifact/D%s/%s.htm"%(year,code_tag) 260 resp=None 261 while(resp==None): 262 try: 263 resp = urllib.request.urlopen(url) 264 except: 265 pass 266 data = resp.read().decode('UTF-8') 267 soup = bs(data, 'html.parser') 268 segment11= soup.find_all('p') 269 # segment1=segment11[1].find_all('p')#表示第几个table,此时表示进去html网页中的第7个table 270 # #print(segment1) 271 f2=open(ss+'./text5%s%s.txt'%(year,code_tag),'a',encoding='utf-8') 272 for item in segment11: 273 str12=item.get_text() 274 #print(str12)#以文本方式呈现 275 #print(type(str12)) 276 ''' 277 │─│─├─DAM Damage ×1 (M) 278 ''' 279 # #print(item.td.span.get_text())#获取具体标签内部内容 280 #print([text for text in item.stripped_strings] )#以列表方式呈现 281 ''' 282 ['│', '─', '│', '─', '├─', 'DAM', 'Damage', '×1', '(M)'] 283 ''' 284 ''' 285 soup.get_text("|")#u'\nI linked to |example.com|\n'进一步,通过strip去除掉文本每个位的头尾空白。 286 287 soup.get_text("|", strip=True)#u'I linked to|example.com' 288 ''' 289 str1=str([text for text in item.stripped_strings]) 290 #print(type(str1[0][0])) 291 f2.writelines(str12+'\n') 292 293 f2.close() 294 295 # f2=open('./text1.txt','a',encoding='cp852') 296 # for item in segment1: 297 def test5(code_tag,num,year,ss): 298 f7=open(ss+'./text6%s%s.txt'%(year,code_tag),'a',encoding='utf-8') 299 f8=open(ss+'./text5%s%s.txt'%(year,code_tag),'r',encoding='utf-8') 300 p1=r"(^A\sservice\ssegment.+\n)" 301 # p2=r"((?:A\s\w|^Date|^This|^Document|^In\s|^Requirements\s|^Dimensions|^The|^If\s|^Through|^Instructions|^For|^An).+\n)" 302 p2=r"(^(?!Information.+\:|Note|It\sis\srecommended\sthat\swhere|ID\sshould\sbe\sspecified|All\sother\ssegments|A\sgroup\sof\ssegments\sthat\scontains\sa\sline\sitem\sand\sits\srelated\sinformation.+should\sbe\sconsigned.).+\n)" 303 pattern1=re.compile(p1) 304 pattern2=re.compile(p2) 305 # pattern3=re.compile(p3) 306 # pattern4=re.compile(p4) 307 flag=0 308 i=num 309 for line3 in f8.readlines(): 310 matcher1=re.findall(pattern1,line3) 311 matcher2=re.findall(pattern2,line3) 312 # matcher3=re.findall(pattern3,line3) 313 # matcher4=re.findall(pattern4,line3) 314 315 # #print(matcher10) 316 if matcher1 and flag==0: 317 f7.write(matcher1[0]) 318 flag=1 319 i=i-1 320 if i==0: 321 break 322 continue 323 if (matcher2 and (flag==1 or flag==2)): 324 f7.write(matcher2[0]) 325 flag=2 326 i=i-1 327 continue 328 f7.close() 329 f8.close() 330 331 def join(code_tag,year,ss): 332 333 334 f1 =open(ss+'text6%s%s.txt'%(year,code_tag),'r',encoding='utf-8') 335 f2= open(ss+'text4%s%s.txt'%(year,code_tag),'r',encoding='utf-8') 336 337 338 list_note=[] 339 for line1 in f1: 340 list_note.append(line1) 341 f1.close() 342 p11=r"^\W{2}(\w{3}).+\n" 343 p12=r"^\W{2}\w{3}\W{2}\s\W(\w{3}).+\n" 344 p13=r"^\W{2}\w{3}\W{2}\s\W\w{3}\W{2}\s\W([0-9]|[0-9]{2}|[0-9]{3}|[0-9]{4}|[0-9]{5})\W.+\n" 345 p14=r"\W{2}\w{3}\W{2}\s\W\w{3}\W{2}\s\W.+(C|M)" 346 # print(list_note) 347 f2_w= open(ss+'b1%s%s.txt'%(year,code_tag),'a',encoding='utf-8') 348 f3_w= open(ss+'b1%s.csv'%year,'a',encoding='utf-8') 349 # for i in range(len(list_note)): 350 i=0 351 pattern11=re.compile(p11) 352 pattern12=re.compile(p12) 353 pattern13=re.compile(p13) 354 pattern14=re.compile(p14) 355 # f2_r = open(ss+'/new/%s_w.txt'%list_tag[i]) 356 pos=[ 357 358 '0010','0020','0030','0040','0050','0060','0070','0080','0090','0100','0110','0120','0130','0140','0150','0160','0170','0180','0190','0200', 359 '0210','0220','0230','0240','0250','0260','0270','0280','0290','0300','0310','0320','0330','0340','0350','0360','0370','0380','0390','0400', 360 '0410','0420','0430','0440','0450','0460','0470','0480','0490','0500','0510','0520','0530','0540','0550','0560','0570','0580','0590','0600', 361 '0610','0620','0630','0640','0650','0660','0670','0680','0690','0700','0710','0720','0730','0740','0750','0760','0770','0780','0790','0800', 362 '0810','0820','0830','0840','0850','0860','0870','0880','0890','0900','0910','0920','0930','0940','0950','0960','0970','0980','0990','1000', 363 '1010','1020','1030','1040','1050','1060','1070','1080','1090','1100','1110','1120','1130','1140','1150','1160','1170','1180','1190','1200', 364 '1210','1220','1230','1240','1250','1260','1270','1280','1290','1300','1310','1320','1330','1340','1350','1360','1370','1380','1390','1400', 365 '1410','1420','1430','1440','1450','1460','1470','1480','1490','1500','1510','1520','1530','1540','1550','1560','1570','1580','1590','1600', 366 '1610','1620','1630','1640','1650','1660','1670','1680','1690','1700','1710','1720','1730','1740','1750','1760','1770','1780','1790','1800', 367 '1810','1820','1830','1840','1850','1860','1870','1880','1890','1900','1910','1920','1930','1940','1950','1960','1970','1980','1990','2000', 368 '2010','2020','2030','2040','2050','2060','2070','2080','2090','2100','2110','2120','2130','2140','2150','2160','2170','2180','2190','2200', 369 '2210','2220','2230','2240','2250','2260','2270','2280','2290','2300','2310','2320','2330','2340','2350','2360','2370','2380','2390','2400', 370 '2410','2420','2430','2440','2450','2460','2470','2480','2490','2500','2510','2520','2530','2540','2550','2560','2570','2580','2590','2600', 371 '2610','2620','2630','2640','2650','2660','2670','2680','2690','2700','2710','2720','2730','2740','2750','2760','2770','2780','2790','2800', 372 '2810','2820','2830','2840','2850','2860','2870','2880','2890','2900','2910','2920','2930','2940','2950','2960','2970','2980','2990','3000', 373 '3010','3020','3030','3040','3050','3060','3070','3080','3090','3100','3110','3120','3130','3140','3150','3160','3170','3180','3190','3200', 374 '3210','3220','3230','3240','3250','3260','3270','3280','3290','3300','3310','3320','3330','3340','3350','3360','3370','3380','3390','3400', 375 '3410','3420','3430','3440','3450','3460','3470','3480','3490','3500','3510','3520','3530','3540','3550','3560','3570','3580','3590','3600', 376 '3610','3620','3630','3640','3650','3660','3670','3680','3690','3700','3710','3720','3730','3740','3750','3760','3770','3780','3790','3800', 377 '3810','3820','3830','3840','3850' 378 379 ] 380 for line2 in f2: 381 matcher11=re.findall(pattern11,line2) 382 matcher12=re.findall(pattern12,line2) 383 matcher13=re.findall(pattern13,line2) 384 matcher14=re.findall(pattern14,line2) 385 # print(matcher11[0]) 386 # print(matcher12[0]) 387 # print(matcher13[0]) 388 # print(matcher14[0]) 389 # print(matcher11[0]) 390 # a=list(line2) 391 # print(a) 392 # b=str(a) 393 # print(b) 394 # print(line2.split(',')) 395 try: 396 str11="%s,%s,%s,%s,%s,%s,%s,\"%s\"\n"%(pos[i],code_tag,matcher12[0],matcher11[0],year,matcher14[0],matcher13[0],list_note[i].strip('\n')) 397 398 i=i+1 399 # print(i) 400 # print(str11) 401 f2_w.write(str11) 402 f3_w.write(str11) 403 except: 404 print("---error---") 405 break 406 407 f2_w.close() 408 f2.close() 409 410 def test():#用户爬取网页,保存到本地 411 filename='./codeco.txt' 412 url = "http://www.stylusstudio.com/edifact/D95B/CODECO.htm" 413 resp = urllib.request.urlopen(url) 414 data = resp.read().decode('UTF-8') 415 # f1=open(filename,'w') 416 # f1.write(data) 417 # #print(type(data)) 418 # #print(data) 419 f2=open('./text.txt','a') 420 soup = bs(data, 'html.parser') 421 422 # sw=soup.find_all('table',border=0,width="100%") 423 # #print(sw[0]) 424 segment1= soup.find_all('h4') 425 426 segment2= soup.find_all('p') 427 # #print(type(segment)) 428 #print(segment1) 429 #print(segment2) 430 nowplaying_list = [] 431 for item in segment1: 432 #print(item) 433 # #print(item.name) 434 # #print(item.attrs) 435 # #print(type(item)) 436 #print(item.get_text()) 437 #print([text for text in item.stripped_strings] ) 438 f2.writelines(str([text for text in item.stripped_strings])+'\n') 439 # nowplaying_dict = {} 440 # nowplaying_dict['id'] = item['a'] 441 # for tag_img_item in item.find_all('img'): 442 # nowplaying_dict['name'] = tag_img_item['alt'] 443 # nowplaying_list.append(nowplaying_dict) 444 # result= segment[0].find_all('h4') 445 # #print(result) 446 447 for item in segment2: 448 449 #print(item) 450 #print(item.get_text()) 451 f2.writelines(str([text for text in item.stripped_strings] )+'\n') 452 f2.close() 453 # data={} 454 # data['word']='Jecvay Notes' 455 456 # url_values=urllib.parse.urlencode(data) 457 # url="http://www.baidu.com/s?" 458 # full_url=url+url_values 459 460 # data=urllib.request.urlopen(full_url).read() 461 # data=data.decode('UTF-8') 462 # #print(data) 463 if __name__=='__main__': 464 # '97A','97B','98A','98B','99A','99B' 465 year1=['00A','00B','01A','01B','01C','02A','02B','03A','03B','04A','04B'] 466 for j in range(len(year1)): 467 468 year=year1[j] 469 ss="./data/%s/"%year 470 tag=b0_trmd(year,ss) 471 print(tag) 472 for i in range(len(tag)): 473 test1(tag[i],year,ss) 474 test2(tag[i],year,ss) 475 num=test3(tag[i],year,ss) 476 test4(tag[i],year,ss) 477 test5(tag[i],num,year,ss) 478 join(tag[i],year,ss) 479 print("------%s-----ok"%i) 480 # str1='APERAK' 481 # join(str1)