[爬虫]统计豆瓣读书中每个标签下的前两百本书
先说下结果吧,结果就是我的 IP 被豆瓣封了……
import csv
import time

import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent sent with every request; built once instead of on
# every page fetch.
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'
                  ' AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'
}


def storeorder(book, *, delay=1.0, timeout=10):
    """Save titles and links listed under one Douban book tag to a CSV file.

    Fetches ten listing pages for the tag (``start`` = 0, 20, ..., 180) and,
    for every ``<a>`` inside ``<ul class="subject-list">`` that carries a
    ``title`` attribute, prints its ``href`` and writes a ``(title, href)``
    row to ``<book>.csv`` in the current working directory.  That amounts to
    up to 200 books, assuming the site serves 20 per page.

    Args:
        book: Tag name; used both in the request URL and as the CSV file stem.
        delay: Seconds to sleep between consecutive page requests.  Zero or a
            negative value disables the pause.
        timeout: Timeout in seconds handed to ``requests.get``.

    Raises:
        AttributeError: A fetched page contains no
            ``<ul class="subject-list">`` element.  The script treats this as
            a sign that the site has blocked the client.
    """
    # The context manager closes (and flushes) the file even when a page fails
    # part-way through.  Explicit UTF-8 keeps Chinese titles writable no matter
    # what the platform's default locale encoding is.
    with open(book + '.csv', 'w', newline='', encoding='utf-8') as ibook:
        writer = csv.writer(ibook)
        for urlnum in range(0, 200, 20):
            # Pause before every page except the first one.
            if urlnum and delay > 0:
                time.sleep(delay)

            url = 'https://book.douban.com/tag/{book}?start={urlnum}&type=T'.format(
                book=book, urlnum=urlnum)
            # A timeout keeps one stalled connection from hanging the run.
            html = requests.get(url, headers=_HEADERS, timeout=timeout)
            # Name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed.
            obj = BeautifulSoup(html.text, 'html.parser')

            subject_list = obj.find('ul', {'class': 'subject-list'})
            if subject_list is None:
                # Check for the missing list directly rather than catching any
                # AttributeError, so unrelated errors are not misreported.
                print("哈哈哈,你的ip被封了!!!")
                raise AttributeError(
                    'no <ul class="subject-list"> found in response for ' + url)

            for item in subject_list.find_all('a'):
                if 'title' in item.attrs:
                    print(item.attrs['href'])
                    writer.writerow((item.attrs['title'], item.attrs['href']))


booklist = ['科学', '互联网', '算法', '通信', '程序', '神经网络', '经济学', '金融', '经济', '历史', '小说']
for v in booklist:
    storeorder(v)