python爬取豆瓣书籍排行

最近想通过爬取豆瓣数据来练习下爬虫,这次做一个爬取豆瓣书籍的信息。

需求:通过爬取豆瓣图书小说这一标签的数据,将数据存入csv或者数据库里面。

思路:先从网页上爬取数据,然后存到csv,然后读取csv的数据写到数据库中。(别问我为什么不直接写数据库,还要在csv中转一次。o(╯□╰)o。。。因为这个项目是逐渐练手的,是先写完csv,然后准备统计数据画图,所以想到还是存mysql好一点,就这样了。。。)

直接上代码吧。

画图表的方法还没完善,先上传上来,后面完善了再更

——————————————————————————————————————————————————

更新画图

 

# -*- coding: utf-8 -*-
'''
Created on 2018年8月17日

@author: zww
'''
import requests
import re
import random
import time
from lxml import etree
import pandas as pd
import matplotlib.pyplot as plt
import pymysql
from pymysql import charset
import csv
import codecs
# Module-level accumulators filled page-by-page by scrapy_contents():
# title_list: book titles, pub_list: publish info,
# rating_list: scores (评分), pl_list: review counts (评论人数)
title_list, pub_list, rating_list, pl_list = [], [], [], []


def scrapy_contents(currentPage):
  """Scrape one listing page of the Douban book tag "小说" and append
  every book's title, publish info, score and review count to the
  module-level lists (title_list, pub_list, rating_list, pl_list).

  currentPage -- 1-based page counter; the request offset is
  (currentPage + 1) * 20. NOTE(review): this skips the site's first
  two pages (start=0 and start=20) -- confirm that is intended.
  """
  headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
  cookies = {
    'cookies': '这里弄成自己的cookies'}

  url = ''.join(['https://book.douban.com/tag/小说?start=',
                 str((currentPage + 1) * 20), '&type=T'])
  res = requests.get(url, headers=headers, cookies=cookies)
  res.encoding = "utf-8"
  if res.status_code == 200:
    print('\n第{}页的数据爬取成功'.format(currentPage))
  else:
    print('\n o(╯□╰)o第{}页的数据爬取失败'.format(currentPage))
  print(url)
  x = etree.HTML(res.text)
  # Douban lists 20 books per page (occasionally fewer, see below)
  for i in range(1, 21):
    title = x.xpath(
      '//*[@id="subject_list"]/ul/li[{}]/div[2]/h2/a/@title'.format(i))
    pub = x.xpath(
      '//*[@id="subject_list"]/ul/li[{}]/div[2]/div[1]/text()'.format(i))
    rating = x.xpath(
      '//*[@id="subject_list"]/ul/li[{}]/div[2]/div[2]/span[2]/text()'.format(i))
    pl = x.xpath(
      '//*[@id="subject_list"]/ul/li[{}]/div[2]/div[2]/span[3]/text()'.format(i))
    # Some pages carry fewer than 20 entries; an empty xpath result
    # raises IndexError below -- log and move on to the next slot.
    try:
      title_list.append(str(title[0]).strip())
      pub_list.append(str(pub[0]).strip())
      rating_list.append(str(rating[0]))
      # raw text looks like "(13376人评价)" -- keep only the digits
      num = re.findall(r"\d+", str(pl[0]))
      pl_list.append(num[0])
    except IndexError as e:
      print('第%d条记录获取数据失败' % i)
      print(e)
      continue


def draw_chart(name_list, num_list, title=u'评分比例'):
  """Render a bar chart of score counts, save it, then display it.

  name_list -- tick labels (one per bar)
  num_list  -- bar heights
  title     -- chart title
  """
  positions = range(len(num_list))
  plt.bar(positions, num_list, tick_label=name_list,
          facecolor='#ff9999', edgecolor='white')
  plt.title(title)
  plt.savefig(u'图书评分的柱状图')  # write the image to disk
  plt.show()


def draw_pie(name_list, num_list, title=u'评分比例'):
  """Render a pie chart of score proportions, save it, then display it.

  name_list -- slice labels
  num_list  -- slice weights
  title     -- chart title
  """
  plt.title(title)
  # equal aspect ratio keeps the pie circular rather than elliptical
  plt.axes(aspect='equal')

  # pie() returns (patches, label texts outside, percent texts inside)
  wedges, outer_labels, inner_labels = plt.pie(
    num_list, labels=name_list, autopct='%1.1f%%',
    pctdistance=0.8, textprops={'fontsize': 6, 'color': 'k'}, radius=1)

  plt.savefig(u'图书评分的饼图')  # write the image to disk
  plt.show()


def save_file(filename):
  """Dump the scraped module-level lists to *filename* as CSV.

  Columns are written in a fixed order: 书名, 出版信息, 评分, 评论人数.
  """
  columns = ['书名', '出版信息', '评分', '评论人数']
  infos = {'书名': title_list, '出版信息': pub_list,
           '评分': rating_list, '评论人数': pl_list}
  frame = pd.DataFrame(infos, columns=columns)
  frame.to_csv(filename, index=False)


def insert(cur, sql, args):
  """Run parameterized statement *sql* with *args* on cursor *cur*."""
  cur.execute(sql, args)


def get_conn(host, port, user, passwd, db):
  """Open and return a pymysql connection using the utf8 charset."""
  return pymysql.connect(host=host, port=port, user=user,
                         passwd=passwd, db=db, charset='utf8')


def query(cur, sql):
  """Execute *sql* on cursor *cur* and return all fetched rows."""
  cur.execute(sql)
  return cur.fetchall()


def read_csv_to_mysql(filename):
  """Load the scraped CSV *filename* into the MySQL `novel` table.

  Skips the CSV header row, then inserts each row into
  novel(BookName, Pub, Score, CommentNum). Commits once for the whole
  batch (the original committed after every row) and always releases
  the cursor/connection, even if an insert fails.
  """
  conn = get_conn('localhost', 3306, 'root', '123456', 'test_scrapy')
  cur = conn.cursor()
  sql = '''insert into novel(BookName,Pub,Score,CommentNum) values(%s,%s,%s,%s)'''
  try:
    with codecs.open(filename=filename, mode='r', encoding='utf-8') as f:
      reader = csv.reader(f)
      next(reader)  # discard the header row written by save_file()
      for item in reader:
        insert(cur, sql=sql, args=tuple(item))
    conn.commit()  # single commit for the whole file
  finally:
    cur.close()
    conn.close()


def Drawing(name_list, num_list):
  """Configure matplotlib for Chinese labels and draw the bar chart."""
  plt.rcParams['figure.figsize'] = (12, 8)  # 1200x800 canvas
  # SimHei + sans-serif family so Chinese glyphs render instead of boxes
  plt.rcParams['font.sans-serif'] = ['SimHei']
  plt.rcParams['font.family'] = 'sans-serif'
  draw_chart(name_list, num_list)
  # draw_pie(name_list, num_list)


def main(scrapyPage):
  """Scrape *scrapyPage* listing pages, save everything to one
  timestamped CSV, then load that CSV into MySQL.

  scrapyPage -- number of pages to fetch (1-based loop).
  """
  for i in range(1, scrapyPage + 1):
    scrapy_contents(i)
    # random pause between requests so the crawler is less likely
    # to get its IP banned
    time.sleep(round(random.uniform(1, 2), 2))
  # Build the filename ONCE after scraping: the original recomputed
  # the timestamp and rewrote a fresh CSV on every loop iteration,
  # leaving dozens of partial files behind and relying on the loop
  # variable leaking out of the for-block.
  now = time.strftime('%Y-%m-%d %H-%M-%S', time.localtime())
  filename = now + "豆瓣图书评分信息.csv"
  save_file(filename)
  print('scrapy done!')
  # push the CSV contents into MySQL
  read_csv_to_mysql(filename)


if __name__ == '__main__':
  main(48)

  # Count how many books got each distinct score, then plot the result.
  conn = get_conn('localhost', 3306, 'root', '123456', 'test_scrapy')
  cur = conn.cursor(pymysql.cursors.DictCursor)  # rows as dicts
  cur_list = conn.cursor()                       # rows as tuples
  sql = '''SELECT DISTINCT(Score) from novel ORDER BY Score desc'''
  Scores = query(cur_list, sql)
  Scores_num = {}
  for row in Scores:
    # Parameterized query -- the original interpolated the whole row
    # tuple with "%" string formatting, which only worked by accident
    # for one-element tuples and is injection-prone.
    cur.execute('SELECT count(*) as c from novel where Score =%s', (row[0],))
    num = cur.fetchall()
    Scores_num[str(row[0])] = num[0]['c']
  cur.close()
  cur_list.close()
  conn.close()
  # Build the plotting lists once, after all counts are collected
  # (the original rebuilt both lists on every loop iteration).
  name_list = list(Scores_num.keys())
  num_list = list(Scores_num.values())
  Drawing(name_list, num_list)

posted @ 2018-08-20 15:24  zww1  阅读(1483)  评论(0编辑  收藏  举报