Python 学习笔记 1:下载豆瓣图书信息并保存至 CSV
待办:还需添加 IP 代理池(尚未实现)
# Scrape book listings from Douban's tag pages and save them to a CSV file.
import csv  # save results as .csv
import random
import re  # normalise scraped text with regular expressions
import time  # random pauses between requests (anti-scraping courtesy)
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup

aurl = 'https://book.douban.com/tag/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}

TAG_LINK_SELECTOR = '#content > div > div.article > div:nth-child(2) > div > table > tbody > tr > td > a'
BOOK_ITEM_SELECTOR = '#subject_list > ul > li'
CSV_PATH = r'fileName.csv'
FIELDNAMES = ['书名', '作者', '上市时间', '价格', '书籍评分', '评分人数']
PAGE_OFFSETS = range(0, 41, 20)  # three result pages per tag, 20 books each
REQUEST_TIMEOUT = 10  # seconds; without it a stalled connection blocks forever
MAX_DELAY = 3  # upper bound (seconds) of the random pause between requests

DECIMAL_RE = re.compile(r'\d+\.\d*')
INTEGER_RE = re.compile(r'\d+')


def fetch_soup(url, params=None):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    response = requests.get(
        url, headers=headers, params=params, timeout=REQUEST_TIMEOUT
    )
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'html.parser')


def parse_book(item):
    """Extract one CSV row (a dict keyed by FIELDNAMES) from a book ``<li>``.

    Every field is looked up inside *item* itself, so an entry that lacks a
    rating cannot shift the values of the entries after it.

    Raises:
        ValueError: if the title, publication line, price or rating count
            cannot be extracted.
    """
    title_tag = item.select_one('div.info > h2 > a')
    pub_tag = item.select_one('div.info > div.pub')
    count_tag = item.select_one('div.info > div.star.clearfix > span.pl')
    if title_tag is None or pub_tag is None or count_tag is None:
        raise ValueError('incomplete book entry')

    # Drop every whitespace character, including line breaks inside the title.
    title = ''.join(title_tag.get_text().split())

    # The publication line is '/'-separated: the first field is used as the
    # author, the last two as publication date and price.
    fields = pub_tag.get_text().split('/')
    if len(fields) < 2:
        raise ValueError('%s: unexpected publication line' % title)

    # Keep only the numeric part of the price so the column is uniform.
    # A decimal number is preferred; a bare integer is the fallback.
    price_text = fields[-1].strip()
    price_match = DECIMAL_RE.search(price_text) or INTEGER_RE.search(price_text)
    if price_match is None:
        raise ValueError('%s: no price in %r' % (title, price_text))

    count_match = INTEGER_RE.search(count_tag.get_text().strip())
    if count_match is None:
        raise ValueError('%s: no rating count' % title)

    # The score element may be absent; record an empty score in that case.
    score_tag = item.select_one('div.info > div.star.clearfix > span.rating_nums')

    return {
        '书名': title,
        '作者': fields[0].strip(),
        '上市时间': fields[-2].strip(),
        '价格': price_match.group(),
        '书籍评分': score_tag.get_text() if score_tag is not None else '',
        '评分人数': count_match.group(),
    }


def parse_page(soup):
    """Return the rows for every parsable book on one result page."""
    books = []
    for item in soup.select(BOOK_ITEM_SELECTOR):
        try:
            book = parse_book(item)
        except ValueError as e:
            # A malformed entry is reported and skipped, not fatal.
            print('%s:' % type(e).__name__, e)
            continue
        books.append(book)
        print(book['书名'])
    return books


def main():
    """Scrape three pages of every tag on the index page into CSV_PATH."""
    tag_links = fetch_soup(aurl).select(TAG_LINK_SELECTOR)

    # Open the file once so each page's rows are added after the previous
    # ones; reopening in 'w' mode per page would discard the earlier pages.
    # newline='' is what the csv module expects, and utf-8-sig can encode
    # every title while still letting spreadsheet tools detect the encoding.
    with open(CSV_PATH, 'w', newline='', encoding='utf-8-sig',
              errors='ignore') as csvfile:
        writer = csv.DictWriter(csvfile, FIELDNAMES)
        writer.writeheader()

        for link in tag_links:
            tag = link.get_text().strip()
            url = aurl + quote(tag, safe='')
            for start in PAGE_OFFSETS:
                try:
                    page = fetch_soup(url, {'start': start, 'type': 'T'})
                except requests.RequestException as e:
                    print('%s:' % type(e).__name__, e)
                    continue
                # Download one page, save one page.
                writer.writerows(parse_page(page))
                csvfile.flush()
                # Pause a random amount of time before the next request.
                time.sleep(random.random() * MAX_DELAY)


if __name__ == '__main__':
    main()