python网页爬虫 spiders_97A-04B

  1 import urllib
  2 import urllib.request
  3 import bs4
  4 from bs4 import BeautifulSoup as bs
  5 import re
  6 import os 
  7 
  8 # year = '97A'
  9 # ss="./data/%s/"%year
 10 '''
 11 适应网页爬取95B-96B
 12 
 13 '''
 14 
 15 
 16 '''
 17 解决网页请求失败
 18 resp = None
 19 while (resp == None):
 20     try:
 21         resp = urllib.request.urlopen("http://baidu.com 
 22 
 23 ")
 24     except:
 25         pass
 26 
 27 '''
 28 def b0_trmd(year,ss):
 29     if not os.path.exists(ss):
 30         os.makedirs(ss)
 31     # os.makedirs(ss)
 32     p1=r"^([A-Z]{6})"
 33 
 34     url = "http://www.stylusstudio.com/edifact/D%s/messages.htm"%year
 35     resp=None
 36     while(resp==None):
 37         try:
 38             resp = urllib.request.urlopen(url)
 39         except:
 40             pass
 41     data = resp.read().decode('cp852')
 42     soup = bs(data, 'html.parser')    
 43     segment11= soup.find_all('table')# ResultSet
 44     segment1=segment11[0].find_all('td')[1:]#表示第几个table，此时表示进去html网页中的第7个table,[1:],<class 'list'>
 45     # segment2= soup.find_all('table')
 46     # print(type(segment1))#
 47     f2=open(ss+'./trmd1%s.txt'%year,'a',encoding='utf-8')
 48     f3=open(ss+'./b0%s.txt'%year,'a',encoding='utf-8')
 49     f4=open(ss+'./trmd%s.txt'%year,'a',encoding='utf-8')
 50     pattern1=re.compile(p1)
 51     tag_list=[]
 52     for item in segment1:
 53             # print(item.string)#如果一个标签里面没有标签了，那么 .string 就会返回标签里面的内容。如果标签里面只有唯一的一个标签了，那么 .string 也会返回最里面的内容。
 54             str1=item.get_text()
 55             # if str1.strip()=="":用于判断字符串是否含空格
 56             #     break
 57             if item.string==None:
 58                 # print("hhusssssssssssssssssssss")
 59                 break
 60             matcher1=re.findall(pattern1,str1)
 61             if matcher1:
 62                 
 63                 f3.write(matcher1[0]+','+year+'\n')
 64                 tag_list.append(matcher1[0])
 65                 f4.write(matcher1[0]+',')
 66             else:
 67                 f4.write(str1+'\n')
 68 
 69 
 70             # print(type(str1))
 71             # test1(str1)
 72             # print(str1)#以文本方式呈现
 73 
 74             # print(item.get_text())#获取具体标签内部内容
 75             # print([text for text in item.stripped_strings] )#以列表方式呈现
 76 
 77             # str2=str([text for text in item.stripped_strings])
 78             # #print(type(str1[0][0]))
 79             f2.writelines(str1+'\n')
 80     f2.close()
 81     return tag_list
 82 def test1(code_tag,year,ss):
 83 
 84     url = "http://www.stylusstudio.com/edifact/D%s/%s.htm"%(year,code_tag)
 85     resp=None
 86     while(resp==None):
 87         try:
 88             resp = urllib.request.urlopen(url)
 89         except:
 90             pass
 91     data = resp.read().decode('UTF-8')
 92     soup = bs(data, 'html.parser')    
 93     segment11= soup.find_all('table')
 94     segment1=segment11[6].find_all('tr')#表示第几个table，此时表示进去html网页中的第7个table
 95 
 96  
 97     f2=open(ss+'./text1%s%s.txt'%(year,code_tag),'a',encoding='cp852')
 98     for item in segment1:
 99 
100             # #print(item)
101             '''
102             <tr class="FrameTreeFont"><td><span class="FrameDrawFont">│
103             <span class="FrameHideFont">─</span>│<span class="FrameHideFont">─</span>├─</span>
104             <a class="FrameItemFont" href="DAM_.htm" target="classFrame" title="Damage">DAM</a> 
105             Damage</td><td align="right"><span class="FrameDetailFont"> ×1 
106             </span></td><td><span class="FrameDetailFont">(M)</span></td></tr>
107             '''
108             str12=item.get_text()
109             # #print(str12)#以文本方式呈现
110             # #print(type(str12))
111             '''
112             │─│─├─DAM Damage ×1 (M)
113             '''
114             # #print(item.td.span.get_text())#获取具体标签内部内容
115             # #print([text for text in item.stripped_strings] )#以列表方式呈现
116             '''
117             ['│', '─', '│', '─', '├─', 'DAM', 'Damage', '×1', '(M)']
118             '''
119             '''
120             soup.get_text("|")#u'\nI linked to |example.com|\n'进一步，通过strip去除掉文本每个位的头尾空白。
121 
122             soup.get_text("|", strip=True)#u'I linked to|example.com'
123             '''
124             str1=str([text for text in item.stripped_strings])
125             # #print(type(str1[0][0]))
126             f2.writelines(str12+'\n')
127 
128     f2.close()
129 def test2(code_tag,year,ss):
130     # p1=r"^(?:├─|└─)(.+)\n"
131     p1=r"^\W{2}(\w.+)\n"#
132     # p1=r"^\W{2}(Segment\sGroup\s\w.+)\n"#segement为第一层
133     # p2=r"^(?:│─├─|│─└─)(.+)\n"
134     p2=r"^\W{4}(\w.+)\n"
135     # p3=r"^(?:│───├─|│───└─|│─│─├─|│─│─└─)(.+)\n"
136     p3=r"^\W{6}(\w.+)\n"
137     # p4=r"^(?:)(.+)\n"
138 
139     p4=r"^\W{8}(\w.+)\n"
140     p5=r"^\W{10}(\w.+)\n"
141     p6=r"^\W{12}(\w.+)\n"
142     p7=r"^\W{14}(\w.+)\n"
143     p8=r"^\W{16}(\w.+)\n"
144 
145     p9=r"Segment\sGroup\s(?:([0-9]|[0-9][0-9]))"
146     # p10="Segment Group "
147 
148 
149 
150     pattern1=re.compile(p1)
151     pattern2=re.compile(p2)
152     pattern3=re.compile(p3)
153     pattern4=re.compile(p4)
154 
155     pattern5=re.compile(p5)
156     pattern6=re.compile(p6)
157     pattern7=re.compile(p7)
158     pattern8=re.compile(p8)
159     pattern9=re.compile(p9) 
160     # pattern10=re.compile(p10)
161 
162     f1=open(ss+'./text1%s%s.txt'%(year,code_tag),'r',encoding='cp852')
163     f2=open(ss+'./text2%s%s.txt'%(year,code_tag),'a',encoding='utf-8')    
164     # c=int()
165     # d=int()
166     listp=[0,0,0,0,0,0,0,0]#用于记录父节点
167     for line in f1.readlines():
168 
169         matcher1=re.findall(pattern1,line)
170         matcher2=re.findall(pattern2,line)
171         matcher3=re.findall(pattern3,line)
172         matcher4=re.findall(pattern4,line)
173 
174         matcher5=re.findall(pattern5,line)
175         matcher6=re.findall(pattern6,line)
176         matcher7=re.findall(pattern7,line)
177         matcher8=re.findall(pattern8,line)
178         matcher9=re.findall(pattern9,line)
179         # #print(type(matcher1))
180 
181         if matcher1:
182 
183             a='SG'+str(listp[0])+' '+matcher1[0]+'\n'
184             f2.write(a)
185             if matcher9:
186                 listp[1]=matcher9[0]
187         if matcher2:
188 
189             b='SG'+str(listp[1])+' '+matcher2[0]+'\n'
190             f2.write(b)
191             if matcher9:
192                 listp[2]=matcher9[0]
193         if matcher3:
194 
195             c='SG'+str(listp[2])+' '+matcher3[0]+'\n'
196             f2.write(c)
197             #print(c)
198             if matcher9:
199                 listp[3]=matcher9[0]
200         if matcher4:
201             d='SG'+str(listp[3])+' '+matcher4[0]+'\n'
202             f2.write(d)
203             #print(d)
204             if matcher9:
205                 listp[4]=matcher9[0]
206         if matcher5:
207             e='SG'+str(listp[4])+' '+matcher5[0]+'\n'
208             f2.write(e)
209             #print(d)
210             if matcher9:
211                 listp[5]=matcher9[0]
212         if matcher6:
213             f='SG'+str(listp[5])+' '+matcher6[0]+'\n'
214             f2.write(f)
215             #print(d)
216             if matcher9:
217                 listp[6]=matcher9[0]
218         if matcher7:
219             g='SG'+str(listp[6])+' '+matcher7[0]+'\n'
220             f2.write(g)
221             #print(d)
222             if matcher9:
223                 listp[7]=matcher9[0]
224         if matcher8:
225             h='SG'+str(listp[7])+' '+matcher8[0]+'\n'
226             f2.write(h)
227             #print(d)
228             if matcher9:
229                 listp[8]=matcher9[0]
230     f2.close()
231     f1.close()
232     f3=open(ss+'./text3%s%s.txt'%(year,code_tag),'w',encoding='utf-8')
233     f4=open(ss+'./text2%s%s.txt'%(year,code_tag),'r',encoding='utf-8')
234     for line1 in f4.readlines():
235         #print(line1)
236         # f3.write(line1.replace(" "," "))
237         f3.write(line1.replace("Segment Group ","SG"))
238     f4.close()
239     f3.close()
240 def test3(code_tag,year,ss):
241     f5=open(ss+'./text4%s%s.txt'%(year,code_tag),'a',encoding='utf-8')
242     f6=open(ss+'./text3%s%s.txt'%(year,code_tag),'r',encoding='utf-8')
243     p10=r"(^\w{3})\s(\w{3}).+×([0-9]|[0-9]{2}|[0-9]{3}|[0-9]{4}|[0-9]{5})\s\((\w)\)$"
244     pattern10=re.compile(p10)
245     i=0
246     for line2 in f6.readlines():
247         i=i+1
248         matcher10=re.findall(pattern10,line2)
249         # print(matcher10)
250         # print(type(matcher10))
251         if matcher10:
252             f5.write(str(matcher10[0])+'\n')
253 
254     f5.close()
255     f6.close()
256     # print(i)
257     return i
258 def test4(code_tag,year,ss):
259     url = "http://www.stylusstudio.com/edifact/D%s/%s.htm"%(year,code_tag)
260     resp=None
261     while(resp==None):
262         try:
263             resp = urllib.request.urlopen(url)
264         except:
265             pass
266     data = resp.read().decode('UTF-8')
267     soup = bs(data, 'html.parser')    
268     segment11= soup.find_all('p')
269     # segment1=segment11[1].find_all('p')#表示第几个table，此时表示进去html网页中的第7个table
270     # #print(segment1)    
271     f2=open(ss+'./text5%s%s.txt'%(year,code_tag),'a',encoding='utf-8')
272     for item in segment11:
273         str12=item.get_text()
274         #print(str12)#以文本方式呈现
275         #print(type(str12))
276         '''
277         │─│─├─DAM Damage ×1 (M)
278         '''
279         # #print(item.td.span.get_text())#获取具体标签内部内容
280         #print([text for text in item.stripped_strings] )#以列表方式呈现
281         '''
282         ['│', '─', '│', '─', '├─', 'DAM', 'Damage', '×1', '(M)']
283         '''
284         '''
285         soup.get_text("|")#u'\nI linked to |example.com|\n'进一步，通过strip去除掉文本每个位的头尾空白。
286 
287         soup.get_text("|", strip=True)#u'I linked to|example.com'
288         '''
289         str1=str([text for text in item.stripped_strings])
290         #print(type(str1[0][0]))
291         f2.writelines(str12+'\n')
292 
293     f2.close()
294  
295     # f2=open('./text1.txt','a',encoding='cp852')
296     # for item in segment1:    
297 def test5(code_tag,num,year,ss):
298     f7=open(ss+'./text6%s%s.txt'%(year,code_tag),'a',encoding='utf-8')
299     f8=open(ss+'./text5%s%s.txt'%(year,code_tag),'r',encoding='utf-8')
300     p1=r"(^A\sservice\ssegment.+\n)"
301     # p2=r"((?:A\s\w|^Date|^This|^Document|^In\s|^Requirements\s|^Dimensions|^The|^If\s|^Through|^Instructions|^For|^An).+\n)"
302     p2=r"(^(?!Information.+\:|Note|It\sis\srecommended\sthat\swhere|ID\sshould\sbe\sspecified|All\sother\ssegments|A\sgroup\sof\ssegments\sthat\scontains\sa\sline\sitem\sand\sits\srelated\sinformation.+should\sbe\sconsigned.).+\n)"
303     pattern1=re.compile(p1)
304     pattern2=re.compile(p2)
305     # pattern3=re.compile(p3)
306     # pattern4=re.compile(p4)
307     flag=0
308     i=num
309     for line3 in f8.readlines():
310         matcher1=re.findall(pattern1,line3)
311         matcher2=re.findall(pattern2,line3)
312         # matcher3=re.findall(pattern3,line3)
313         # matcher4=re.findall(pattern4,line3)
314 
315         # #print(matcher10)
316         if matcher1 and flag==0:
317             f7.write(matcher1[0])
318             flag=1
319             i=i-1
320             if i==0:
321                 break
322             continue
323         if (matcher2 and (flag==1 or flag==2)):
324             f7.write(matcher2[0])
325             flag=2
326             i=i-1
327             continue
328     f7.close()
329     f8.close()
330 
331 def join(code_tag,year,ss):
332 
333 
334     f1 =open(ss+'text6%s%s.txt'%(year,code_tag),'r',encoding='utf-8') 
335     f2= open(ss+'text4%s%s.txt'%(year,code_tag),'r',encoding='utf-8')
336 
337 
338     list_note=[]
339     for line1 in f1:
340         list_note.append(line1)
341     f1.close()
342     p11=r"^\W{2}(\w{3}).+\n"
343     p12=r"^\W{2}\w{3}\W{2}\s\W(\w{3}).+\n"
344     p13=r"^\W{2}\w{3}\W{2}\s\W\w{3}\W{2}\s\W([0-9]|[0-9]{2}|[0-9]{3}|[0-9]{4}|[0-9]{5})\W.+\n"
345     p14=r"\W{2}\w{3}\W{2}\s\W\w{3}\W{2}\s\W.+(C|M)"
346     # print(list_note)
347     f2_w= open(ss+'b1%s%s.txt'%(year,code_tag),'a',encoding='utf-8')
348     f3_w= open(ss+'b1%s.csv'%year,'a',encoding='utf-8')  
349     # for i in range(len(list_note)):
350     i=0
351     pattern11=re.compile(p11)
352     pattern12=re.compile(p12)    
353     pattern13=re.compile(p13)
354     pattern14=re.compile(p14)        
355     # f2_r = open(ss+'/new/%s_w.txt'%list_tag[i])
356     pos=[
357     
358     '0010','0020','0030','0040','0050','0060','0070','0080','0090','0100','0110','0120','0130','0140','0150','0160','0170','0180','0190','0200',
359     '0210','0220','0230','0240','0250','0260','0270','0280','0290','0300','0310','0320','0330','0340','0350','0360','0370','0380','0390','0400',
360     '0410','0420','0430','0440','0450','0460','0470','0480','0490','0500','0510','0520','0530','0540','0550','0560','0570','0580','0590','0600',
361     '0610','0620','0630','0640','0650','0660','0670','0680','0690','0700','0710','0720','0730','0740','0750','0760','0770','0780','0790','0800',
362     '0810','0820','0830','0840','0850','0860','0870','0880','0890','0900','0910','0920','0930','0940','0950','0960','0970','0980','0990','1000',
363     '1010','1020','1030','1040','1050','1060','1070','1080','1090','1100','1110','1120','1130','1140','1150','1160','1170','1180','1190','1200',
364     '1210','1220','1230','1240','1250','1260','1270','1280','1290','1300','1310','1320','1330','1340','1350','1360','1370','1380','1390','1400',
365     '1410','1420','1430','1440','1450','1460','1470','1480','1490','1500','1510','1520','1530','1540','1550','1560','1570','1580','1590','1600',
366     '1610','1620','1630','1640','1650','1660','1670','1680','1690','1700','1710','1720','1730','1740','1750','1760','1770','1780','1790','1800',
367     '1810','1820','1830','1840','1850','1860','1870','1880','1890','1900','1910','1920','1930','1940','1950','1960','1970','1980','1990','2000',
368     '2010','2020','2030','2040','2050','2060','2070','2080','2090','2100','2110','2120','2130','2140','2150','2160','2170','2180','2190','2200',
369     '2210','2220','2230','2240','2250','2260','2270','2280','2290','2300','2310','2320','2330','2340','2350','2360','2370','2380','2390','2400',
370     '2410','2420','2430','2440','2450','2460','2470','2480','2490','2500','2510','2520','2530','2540','2550','2560','2570','2580','2590','2600',
371     '2610','2620','2630','2640','2650','2660','2670','2680','2690','2700','2710','2720','2730','2740','2750','2760','2770','2780','2790','2800',
372     '2810','2820','2830','2840','2850','2860','2870','2880','2890','2900','2910','2920','2930','2940','2950','2960','2970','2980','2990','3000',
373     '3010','3020','3030','3040','3050','3060','3070','3080','3090','3100','3110','3120','3130','3140','3150','3160','3170','3180','3190','3200',
374     '3210','3220','3230','3240','3250','3260','3270','3280','3290','3300','3310','3320','3330','3340','3350','3360','3370','3380','3390','3400',
375     '3410','3420','3430','3440','3450','3460','3470','3480','3490','3500','3510','3520','3530','3540','3550','3560','3570','3580','3590','3600',
376     '3610','3620','3630','3640','3650','3660','3670','3680','3690','3700','3710','3720','3730','3740','3750','3760','3770','3780','3790','3800',
377     '3810','3820','3830','3840','3850'
378 
379     ]
380     for line2 in f2:
381         matcher11=re.findall(pattern11,line2)
382         matcher12=re.findall(pattern12,line2)
383         matcher13=re.findall(pattern13,line2)
384         matcher14=re.findall(pattern14,line2)
385         # print(matcher11[0])
386         # print(matcher12[0])
387         # print(matcher13[0])
388         # print(matcher14[0])
389         # print(matcher11[0])
390         # a=list(line2)
391         # print(a)
392         # b=str(a)
393         # print(b)
394         # print(line2.split(','))
395         try:
396             str11="%s,%s,%s,%s,%s,%s,%s,\"%s\"\n"%(pos[i],code_tag,matcher12[0],matcher11[0],year,matcher14[0],matcher13[0],list_note[i].strip('\n'))
397         
398             i=i+1
399             # print(i)
400             # print(str11)
401             f2_w.write(str11)
402             f3_w.write(str11)
403         except:
404             print("---error---")
405             break
406 
407     f2_w.close() 
408     f2.close()
409 
410 def test():#用户爬取网页，保存到本地
411     filename='./codeco.txt'
412     url = "http://www.stylusstudio.com/edifact/D95B/CODECO.htm"
413     resp = urllib.request.urlopen(url)
414     data = resp.read().decode('UTF-8')
415     # f1=open(filename,'w')
416     # f1.write(data)
417     # #print(type(data))
418     # #print(data)
419     f2=open('./text.txt','a')
420     soup = bs(data, 'html.parser')    
421 
422     # sw=soup.find_all('table',border=0,width="100%")
423     # #print(sw[0])
424     segment1= soup.find_all('h4')
425 
426     segment2= soup.find_all('p')
427     # #print(type(segment))
428     #print(segment1)
429     #print(segment2)
430     nowplaying_list = [] 
431     for item in segment1:
432             #print(item)
433             # #print(item.name)
434             # #print(item.attrs)
435             # #print(type(item))
436             #print(item.get_text())
437             #print([text for text in item.stripped_strings] )
438             f2.writelines(str([text for text in item.stripped_strings])+'\n')    
439             # nowplaying_dict = {}        
440             # nowplaying_dict['id'] = item['a']       
441             # for tag_img_item in item.find_all('img'):            
442             #     nowplaying_dict['name'] = tag_img_item['alt']            
443             #     nowplaying_list.append(nowplaying_dict)
444     # result= segment[0].find_all('h4')
445     # #print(result)
446 
447     for item in segment2:
448 
449             #print(item)
450             #print(item.get_text())
451             f2.writelines(str([text for text in item.stripped_strings] )+'\n')  
452     f2.close()
453     # data={}
454     # data['word']='Jecvay Notes'
455      
456     # url_values=urllib.parse.urlencode(data)
457     # url="http://www.baidu.com/s?"
458     # full_url=url+url_values
459      
460     # data=urllib.request.urlopen(full_url).read()
461     # data=data.decode('UTF-8')
462     # #print(data)
463 if __name__=='__main__':
464     # '97A','97B','98A','98B','99A','99B'
465     year1=['00A','00B','01A','01B','01C','02A','02B','03A','03B','04A','04B']
466     for j in range(len(year1)):
467 
468         year=year1[j]
469         ss="./data/%s/"%year
470         tag=b0_trmd(year,ss)
471         print(tag)
472         for i in range(len(tag)):
473             test1(tag[i],year,ss)
474             test2(tag[i],year,ss)
475             num=test3(tag[i],year,ss)
476             test4(tag[i],year,ss)
477             test5(tag[i],num,year,ss)
478             join(tag[i],year,ss)
479             print("------%s-----ok"%i)
480     # str1='APERAK'
481     # join(str1)
posted on 2017-08-25 13:45 懵懂的菜鸟阅读(498) 评论(0) 编辑收藏举报
刷新页面返回顶部
懵懂的菜鸟

导航

公告

python网页爬虫 spiders_97A-04B