Crawling the 盗墓笔记 novel with coroutines + requests + MySQL

#!/usr/bin/env python
# -*- coding: utf-8 -*-

__author__ = 'Fade Zhao'
# gevent's monkey patching must run before the libraries it patches
# (socket, ssl, ...) are imported, so it comes first.
from gevent import monkey
monkey.patch_all()
from gevent.pool import Pool

import requests
from lxml import etree
from fake_useragent import UserAgent
import MySQLdb

class Grave(object):
    def __init__(self):
        self.conn = MySQLdb.connect(host='localhost', user='root', password='zhaoyinghan',
                                    db='GraveNote', charset='utf8')
        self.cursor = self.conn.cursor()

        self.session = requests.Session()
        self.url = 'http://seputu.com/'
        # One random User-Agent, reused for every request in the session
        self.agent = UserAgent().random
        self.headers = {
            'User-Agent': self.agent
        }

    def get_bags(self):
        '''Fetch the section blocks (one per book) from the index page.'''
        response = self.session.get(self.url, headers=self.headers)
        html = etree.HTML(response.content)
        article_list = html.xpath('.//div[@class="mulu"]')
        # The first block is skipped; only the remaining sections hold chapters
        for item in article_list[1:]:
            self.get_article_urls(item)

    def get_article_urls(self, bag):
        '''Collect the chapter URLs of one section and crawl them concurrently.'''
        urls = bag.xpath('.//li/a/@href')
        print(urls)
        pool = Pool(20)
        # map_cb runs get_detail over every url in the pool, then hands the
        # complete result list to self.save in a single callback
        pool.map_cb(self.get_detail, urls, callback=self.save)


    def get_detail(self, url):
        '''Fetch one chapter page and return (article_dir, title, content).'''
        try:
            response = self.session.get(url, headers=self.headers)
            html = etree.HTML(response.content)
            article_dir = html.xpath('.//div[@class="mark"]/a/text()')[0]
            title = html.xpath('.//div[@class="bg"]/h1/text()')[0]
            # A few books use a different page structure (see pitfall 1 below)
            if '藏海花2' in title or '盗墓笔记9' in title or '巫山妖棺' in title:
                content = html.xpath('.//div[@class="content-body"]/div[@class="content"]/p/text()')
            else:
                content = html.xpath('.//div[@class="content-body"]/p/text()')
            content_str = '\n'.join(content)
            return (article_dir, title, content_str)
        except Exception as e:
            print('Error while fetching', url, ':', e)

    def save(self, args):
        # get_detail returns None on failure; drop those before the bulk insert
        rows = [row for row in args if row]
        print(rows)
        sql_str = '''INSERT INTO Grave(article_dir, title, content)
                     VALUES (%s, %s, %s)
                     ON DUPLICATE KEY UPDATE content = VALUES(content)'''
        try:
            self.cursor.executemany(sql_str, rows)
            self.conn.commit()
            print('Insert finished')
        except Exception as e:
            print('Insert error:', e)
            self.conn.rollback()

if __name__ == '__main__':
    spider = Grave()
    spider.get_bags()
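
A note on the INSERT statement: ON DUPLICATE KEY UPDATE only fires when the Grave table has a PRIMARY or UNIQUE key that the new row collides with. The post never shows the table's DDL, so the following is only a guessed sketch, assuming the unique index sits on title; the column types are placeholders, not the author's actual schema.

import MySQLdb

# Hypothetical schema -- the real DDL is not shown in the post.
# ON DUPLICATE KEY UPDATE needs a UNIQUE index; here it is assumed on title.
DDL = '''CREATE TABLE IF NOT EXISTS Grave (
             id INT AUTO_INCREMENT PRIMARY KEY,
             article_dir VARCHAR(128),
             title VARCHAR(255),
             content MEDIUMTEXT,
             UNIQUE KEY uk_title (title)
         ) DEFAULT CHARSET=utf8'''

conn = MySQLdb.connect(host='localhost', user='root', password='zhaoyinghan',
                       db='GraveNote', charset='utf8')
conn.cursor().execute(DDL)
conn.commit()
conn.close()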

Pitfalls encountered:

  1. While crawling the 藏海花 chapters of 盗墓笔记, some pages yielded no data. It turned out that part of the 藏海花 chapter pages use a different HTML structure from the rest, so the original XPath matched nothing; after special-casing those titles (the if branch in get_detail), the missing chapters were re-crawled and filled in.

  2. In XPath, even after you have selected an element, any further .xpath() call on that element must start with [ . ], meaning "the current node"; otherwise a path beginning with // is evaluated from the document root and matches nodes far outside the element you thought you were scoped to (a short demonstration follows this list).

  3. When a problem shows up, start from the program's overall flow and work downward, then break the problem into smaller pieces. Treat each issue as independent rather than assuming it has the same cause as the last one, and only change the code after narrowing things down step by step.
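
A minimal illustration of pitfall 2 (the markup below is made up for the demo, not the real seputu.com HTML):

from lxml import etree

html = etree.HTML('''
<div class="mulu"><ul><li><a href="/a1">A1</a></li></ul></div>
<div class="mulu"><ul><li><a href="/b1">B1</a></li></ul></div>
''')
first = html.xpath('//div[@class="mulu"]')[0]

# Without the leading dot the search restarts from the document root,
# so links from BOTH divs come back:
print(first.xpath('//li/a/@href'))   # ['/a1', '/b1']

# With the dot the search stays inside the selected node:
print(first.xpath('.//li/a/@href'))  # ['/a1']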

 

posted @ 2017-12-01 00:26  LeeeetMe