requests+正则爬取豆瓣图书

 1 #requests+正则爬取豆瓣图书
 2 
 3 import requests
 4 import re
 5 
 6 def get_html(url):
 7     headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36 LBBROWSER'}
 8     response = requests.get(url,headers=headers)
 9     html = response.text
10     return html
11 
12 
def get_books(url):
    """Fetch the page at ``url`` and print every book entry found in it.

    The HTML returned by ``get_html`` is scanned with a regular expression.
    For each match one line is printed containing the four captured fields:
    the ``href`` link, the ``title`` attribute, the text of the ``author``
    span and the text of the ``year`` span.

    Args:
        url: Address of the page to scrape.

    Returns:
        None. Results are written to standard output.
    """
    html = get_html(url)
    # re.S lets ``.`` match newlines, so one match may extend across line
    # breaks in the markup.
    pattern = re.compile(r'<li.*?cover.*?href="(.*?)".*?title="(.*?)".*?more-meta.*?author">(.*?)</span>.*?year">(.*?)</span>.*?</li>', re.S)
    for link, book, author, year in pattern.findall(html):
        # Remove all whitespace (including newlines) from the title. The
        # pattern is a raw string: '\s' in a plain literal is an invalid
        # escape sequence and triggers a warning on modern Python.
        book = re.sub(r'\s', '', book)
        # strip() only trims leading/trailing whitespace of the other fields.
        print(link, book, author.strip(), year.strip())
23 
24 
if __name__ == '__main__':
    # Script entry point: scrape the Douban Books front page and print the
    # entries that get_books finds.
    get_books('https://book.douban.com/')

 

posted @ 2017-05-14 10:12  道高一尺  阅读(434)  评论(0)  编辑  收藏  举报