Python练习册 每天一个小程序 第0008题
# -*- coding: utf-8 -*-
"""Exercise 0008: given an HTML file, find its body text.

Approach: parse the document with BeautifulSoup (regular expressions are
the alternative mentioned in the exercise notes) and collect its text.
"""
__author__ = 'Deen'

import sys

from bs4 import BeautifulSoup

# Earlier URL-based draft; it was disabled in the original file and is kept
# here only for reference:
#
# import requests
# from bs4 import BeautifulSoup
#
#
# def get_body(url):
#     response = requests.get(url)
#     soup = BeautifulSoup(response)
#     print soup.body.text


def find_the_content(path, encoding='gbk'):
    """Return the text content of the HTML file at *path*.

    The file is parsed with BeautifulSoup using the ``lxml`` parser, the
    text of the whole document is collected with ``get_text()``, and
    leading/trailing newline characters are stripped.

    Args:
        path: Path of the HTML file to read.
        encoding: Codec used to encode the result. Characters the codec
            cannot represent are dropped (``errors='ignore'``). Defaults to
            ``'gbk'``, the value that was previously hard-coded. Pass
            ``None`` to get the text back unencoded.

    Returns:
        The extracted text encoded with *encoding* (a byte string), or the
        unencoded text when *encoding* is ``None``.
    """
    # Binary mode hands BeautifulSoup the raw bytes so it can work out the
    # document's encoding itself; text mode would decode with the platform
    # default first and can fail or garble pages saved in another encoding.
    with open(path, 'rb') as html_file:
        soup = BeautifulSoup(html_file, 'lxml')

    content = soup.get_text().strip('\n')
    if encoding is None:
        return content
    return content.encode(encoding, 'ignore')


if __name__ == '__main__':
    encoded = find_the_content('Show-Me-the-Code_show-me-the-code_1.html')
    # Emit the encoded bytes as-is (what the Python 2 ``print`` statement
    # did). On Python 3 the byte-oriented stream is ``sys.stdout.buffer``;
    # Python 2's ``sys.stdout`` accepts byte strings directly.
    byte_stream = getattr(sys.stdout, 'buffer', sys.stdout)
    byte_stream.write(encoded + b'\n')