python练习册 每天一个小程序 第0008题

 1 # -*-coding:utf-8-*-
 2 __author__ = 'Deen'
 3 '''
 4 题目描述:
 5      一个HTML文件,找出里面的正文。
 6 
 7 思路:
 8     利用BeautifulSoup或者正则表达式
 9 
10 '''
11 '''
12 import requests
13 from bs4 import BeautifulSoup
14 
15 
16 def get_body(url):
17     response = requests.get(url)
18     soup = BeautifulSoup(response)
19     print soup.body.text
20 '''    
21 from bs4 import BeautifulSoup
22 
def find_the_content(path):
    """Return the text content of the HTML file at ``path`` as GBK bytes.

    The file is parsed with BeautifulSoup's ``lxml`` parser, the markup is
    dropped via ``get_text()``, and leading/trailing newline characters are
    stripped from the result.

    Args:
        path: Location of the HTML file to read.

    Returns:
        The extracted text encoded as GBK. Characters that GBK cannot
        represent are dropped (``'ignore'`` error handler).
    """
    # Read raw bytes so BeautifulSoup can detect the document's own encoding
    # instead of depending on the platform's default text encoding.
    with open(path, 'rb') as html_file:
        soup = BeautifulSoup(html_file, 'lxml')

    # Only newlines are trimmed; other surrounding whitespace is kept.
    text = soup.get_text().strip('\n')
    return text.encode('gbk', 'ignore')


if __name__ == '__main__':
    import sys

    # find_the_content() returns GBK-encoded bytes, so write them to the
    # byte-level stream: ``sys.stdout.buffer`` on Python 3, ``sys.stdout``
    # itself on Python 2 (which is what the old ``print`` statement did).
    encoded = find_the_content('Show-Me-the-Code_show-me-the-code_1.html')
    byte_stream = getattr(sys.stdout, 'buffer', sys.stdout)
    byte_stream.write(encoded + b'\n')

 

posted on 2017-07-14 16:30  _Deen  阅读(288)  评论(0)  编辑  收藏  举报

导航