Python 爬取网页内容

 1 #encoding:UTF-8
 2 import urllib
 3 import urllib.request
 4 import bs4
 5 from bs4 import BeautifulSoup as bs
 6 def test1():
 7     url = "http://www.stylusstudio.com/edifact/D95B/CODECO.htm"
 8     resp = urllib.request.urlopen(url)
 9     data = resp.read().decode('UTF-8')
10     soup = bs(data, 'html.parser')    
11     segment11= soup.find_all('table')
12     segment1=segment11[7].find_all('tr')#表示第几个table，此时表示进去html网页中的第7个table
13 
14  
15     f2=open('./text1.txt','a',encoding='cp852')
16     for item in segment1:
17 
18             print(item)
19             '''
20             <tr class="FrameTreeFont"><td><span class="FrameDrawFont">│
21             <span class="FrameHideFont">─</span>│<span class="FrameHideFont">─</span>├─</span>
22             <a class="FrameItemFont" href="DAM_.htm" target="classFrame" title="Damage">DAM</a> 
23             Damage</td><td align="right"><span class="FrameDetailFont"> ×1 
24             </span></td><td><span class="FrameDetailFont">(M)</span></td></tr>
25             '''
26 
27             print(item.get_text())#以文本方式呈现
28             '''
29             │─│─├─DAM Damage ×1 (M)
30             '''
31             # print(item.td.span.get_text())#获取具体标签内部内容
32             print([text for text in item.stripped_strings] )#以列表方式呈现
33             '''
34             ['│', '─', '│', '─', '├─', 'DAM', 'Damage', '×1', '(M)']
35             '''
36             '''
37             soup.get_text("|")#u'\nI linked to |example.com|\n'进一步，通过strip去除掉文本每个位的头尾空白。
38 
39             soup.get_text("|", strip=True)#u'I linked to|example.com'
40             '''
41             f2.writelines(str([text for text in item.stripped_strings])+'\n')
42     f2.close()     
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    test1()

posted on 2017-08-23 11:35 懵懂的菜鸟 阅读(575) 评论(0) 编辑 收藏 举报

刷新页面返回顶部

懵懂的菜鸟

导航

公告

python 爬取网页内容