Python: a simple reusable self-built crawler framework (async with gevent)

A typical crawler breaks down into the following steps:

1. Open the target page

2. Parse the page

3. Process/store the data and enqueue newly discovered pages as tasks

To run these asynchronously, a scheduler is also needed.

For a simple crawler (no complicated CAPTCHAs, and the site can be fetched with requests/urllib after adjusting the cookie and headers), an opener plus a parser is enough. Data handling and new-task generation can live directly in the parser class, and gevent can make the whole thing asynchronous, as sketched below.
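A minimal sketch of that open / parse / store loop (purely illustrative; parse_page is a hypothetical helper, not one of the actual classes defined below):

# Illustrative sketch of the open -> parse -> store/enqueue loop.
# parse_page() is a hypothetical callback, not part of the framework below.
import requests

def crawl(start_url, parse_page):
    todo = [start_url]        # task queue of pages still to fetch
    results = []              # parsed records
    while todo:
        url = todo.pop()
        html_text = requests.get(url).text         # 1. open the page
        records, new_urls = parse_page(html_text)   # 2. parse it
        results.extend(records)                     # 3. process/store the data
        todo.extend(new_urls)                       #    and enqueue new task pages
    return results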

 

Project path: ur'D:\python_py\my_scrapy/scrapy_tools'

# add an __init__.py under scrapy_tools so it can be used as a package

itemparse.py

Define the XPath selectors to mirror the structure of the data you want to extract.

# -*- coding: utf-8 -*-
"""
Created on Fri Jul 07 17:24:34 2017

@author: willowj
"""
import sys
# keep references to the std streams: reload(sys) can reset them in some environments
stdout, stdin, stderr = sys.stdout, sys.stdin, sys.stderr
reload(sys)
sys.stdout, sys.stdin, sys.stderr = stdout, stdin, stderr
sys.setdefaultencoding('utf8')


import gevent
import pandas as pd 
import numpy as np
from lxml import html
import time
import codecs
import json


def list_0e(list_):
    if isinstance(list_, list):
        if not list_:
            return None
        else:
            if len(list_)>1:
                print 'warning : list>1,list[1]:', list_[1] #,len(list_)
            return list_[0]
    else:
        return list_


class ItemParse(object):
    """Generic item parser: subclass it and override the class attributes below."""
    name = 'ItemParse'

    base_url = 'https://www.zhihu.com/topic/19551147/top-answers'
    pageN_x = '//div[@class="zm-invite-pager"]//span[last()-1]/a/text()'
    new_urls_x = None

    # XPath of the node that holds one record; each field below is searched inside it
    items_node_x = '//div[@class="feed-main"]'
    # note: fields are looked up relative to one record node, so each XPath starts with '.'
    item_xs = dict(
        question_name = '''.//a[@class='question_link']/text()''', 
        #question_href = '''.//a[@class='question_link']/@href''', 
        author = './/div[@data-action="/answer/content"]/@data-author-name',
        author_href = '''.//a[@class='author-link']/@href''',  
        ups_x = './/div[@class="zm-item-vote-info"]/@data-votecount',
        answers_text = ".//textarea/text()",
        commentN = './/a[@name="addcomment"]/text()[last()]',
        entry_url = './/div[@data-action="/answer/content"]/@data-entry-url',

        #re:
        #z = re.compile('\.')
        )    
    
    # paging URL pattern
    def getnextpages(self):
        # custom paging rule; returns an empty list when there is only one page
        if self.pageN > 1:
            urls = [self.base_url + '?page=%s' % n
                        for n in range(self.pageN, 1, -1)
                    ]
            return urls
        return []


    def __init__(self, html_):
        #self.item_atrr_xpath()
        self.results = []
        self.new_urls = []
        self.pageN = self.update_page_n(html_)
        self.nextpages = self.getnextpages()
        self.parase(html_)


    def parase(self, html_):
        # prefer XPath; compiled re patterns are also supported; fields not found become None
        etree = html.document_fromstring(html_)
        items_nodes = etree.xpath(self.items_node_x)
        #results = []
        for ee in items_nodes:
            ee_str = None
            ite = {}
            for item,itemx in self.item_xs.items():
                # compiled re pattern, or xpath string
                if hasattr(itemx, 'findall'):
                    if ee_str is None:
                        ee_str = html.tostring(ee)
                    ite[item] = itemx.findall(ee_str)
                # xpath
                elif isinstance(itemx, basestring):
                    if itemx.startswith('./'):
                        ite[item] = ee.xpath(itemx)
                    else:
                        print item
                        raise ValueError('xpath does not start with ./')
                else:
                    print item
                    raise TypeError('not a re pattern object or xpath str')
                
                if len(ite[item]) == 0:
                    ite[item] = None
                elif len(ite[item]) == 1:
                    ite[item] = ite[item][0]
                else:
                    ite[item] = '\n'.join([str(__i) for __i in ite[item]])
                
            self.results.append(ite)
        
        #new_url
        if self.new_urls_x:
            self.new_urls.extend(etree.xpath(self.new_urls_x)) 

    # get the total number of pages
    def update_page_n(self, html_):
        if self.pageN_x:
            etree = html.document_fromstring(html_)
            pages = etree.xpath(self.pageN_x)
            pages = list_0e(pages)
            if isinstance(pages, basestring):
                pages = pages.strip()
            if pages and pages.isdigit():
                return int(pages)
        return 1

    # plain synchronous fetch of all the remaining pages
    def get_nextpages(self, opener, sleep_sec=None):
        for url in self.nextpages:
            if sleep_sec:
                time.sleep(sleep_sec)
            #if not hasattr(opener, 'get')    
            _re = opener.get(url)
            print _re.status_code,  _re.url
            self.parase(_re.text)
            print time.time()
    # for now the async control and storage methods live in this class
    # gevent coroutine worker
    def __gevent_get_nextpages(self, opener):
        print id(opener)
        while self.nextpages:
            #start_time = time.time()
            url = self.nextpages.pop()
            print gevent.getcurrent()
            zhihu_re = opener.get(url)
            #gevent.sleep(5)
            print zhihu_re.status_code,  url
            self.parase(zhihu_re.text) 
            print time.time()
    # run the crawl with gevent coroutines
    def get_nextpages_by_gevent(self, opener_class, g_n=4):
        '''
        param:  opener_class : class used to create one page opener per coroutine
                g_n: number of coroutines, default 4
        '''
        # patch blocking IO so that requests cooperates with gevent
        from gevent import monkey; monkey.patch_all()
          
        start_time = time.time()
        gs = [gevent.spawn(self.__gevent_get_nextpages, opener_class())
                for i in range(g_n)
                ]
        gevent.joinall(gs)    

        print time.time() - start_time 
        self.save_to_excel()

    def save_to_excel(self, path=None):  
        if path:  
            save_name = path
        else:     
            save_name = u''+ self.name \
                           + time.strftime('%Y%m%d_%H_%M', time.localtime()) \
                           + '.xlsx'
        print save_name
        result_pd = pd.DataFrame(self.results)
        print 'pd ok'
        result_pd.to_excel(u'' + save_name, encoding='gb18030')        
        print 'saved to ' + save_name


    def save_to_json(self, path=None):
        if path:  
            save_name = path
        else:     
            save_name = u''+ self.name \
                           + time.strftime('%Y%m%d_%H_%M', time.localtime()) \
                           + '.json'
        print save_name
        with codecs.open(save_name, 'w', encoding='gb18030') as f:
            f.write(json.dumps(self.results))
            
        print 'saved to '+ save_name

To use it, subclass the class and override the class attributes and the getnextpages paging method, as in the skeleton below.
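A minimal hypothetical subclass (the site, URL and XPath expressions here are placeholders) only needs to redefine the XPath attributes and the paging rule:

# Hypothetical subclass: only the XPath attributes and the paging rule change.
class MySiteParse(ItemParse):
    name = 'MySiteParse'
    base_url = 'http://example.com/list'                # placeholder URL
    pageN_x = '//div[@class="pager"]/a[last()]/text()'  # XPath for the total page count
    items_node_x = '//div[@class="item"]'               # one node per record
    item_xs = dict(
        title = './/h2/a/text()',   # fields are searched inside each record node,
        link = './/h2/a/@href',     # so every XPath starts with '.'
        )

    def getnextpages(self):
        if self.pageN > 1:
            return [self.base_url + '?page=%s' % n
                    for n in range(self.pageN, 1, -1)]
        return []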


 

web_opener.py

Using requests.Session to keep the connection alive roughly doubles the fetch speed.
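A rough way to check this claim, reusing the httpbin endpoint from the self-test at the bottom of this module:

# Rough timing comparison: a fresh connection per request vs. one
# keep-alive connection reused through a Session.
import time
import requests

url = 'http://httpbin.org/get'

t0 = time.time()
for _ in range(10):
    requests.get(url)              # new TCP connection every time
print 'plain requests:   ', time.time() - t0

s = requests.Session()             # connection is kept alive and reused
t0 = time.time()
for _ in range(10):
    s.get(url)
print 'requests.Session: ', time.time() - t0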

For the gevent async path, one session is created per coroutine, so each coroutine fetches pages over its own connection and they do not interfere with one another. (That method currently lives in itemparse.py.)

# -*- coding: utf-8 -*-
"""
2017年8月17日星期四
下午 17:22
@author: willowj
"""
import sys
reload(sys)  # setdefaultencoding is removed by site.py at startup; reload restores it
sys.setdefaultencoding('utf8')

import requests
#from requests.cookies import (
#    cookiejar_from_dict, extract_cookies_to_jar, RequestsCookieJar, merge_cookies)

class SessionFopener(object):
    """Page opener wrapping requests.Session.
    param: headers     defaults to the class attribute; pass your own on instantiation
           cookie_dic  disabled (empty cookie jar) by default
           proxies     none by default
    """
    headers = {
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding':'gzip, deflate, sdch',
        'Accept-Language':'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control':'max-age=0',
        'Connection':'keep-alive',
        #'Cookie':'q'
        #'Host':'www.zhihu.com',
        'Upgrade-Insecure-Requests':'1',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
        }

    def __init__(self, headers=None, cookie_dic=None, proxies=None):
    
        self.req_s = requests.Session()
        self.req_s.adapters.DEFAULT_RETRIES = 3
        self.req_s.keep_alive = True 

        if headers:
            self.req_s.headers = headers 
        else:     
            self.req_s.headers = self.headers

        if not cookie_dic:
            cookie_dic = {}
        self.req_s.cookies = requests.cookies.cookiejar_from_dict(cookie_dic)

        if proxies:
            self.req_s.proxies = proxies

    def close(self):
        self.req_s.close()

    def get(self, *arg, **karg):
        return self.req_s.get(*arg, **karg)

    def post(self, *arg, **karg):
        return self.req_s.post(*arg, **karg)

    def set_cookiejar(self, cookie_dic={}):
        self.req_s.cookies = requests.cookies.cookiejar_from_dict(cookie_dic)

    def add_cookiejar(self, cookie_dic):
        self.req_s.cookies = requests.cookies.merge_cookies(self.req_s.cookies, cookie_dic)


    def set_headers(self, headers={}):
        self.req_s.headers = headers

    def add_headers(self, headers_dic):
        for k, v in headers_dic.items():
            self.req_s.headers[k] = v


    def set_proxies(self, proxies):
        self.req_s.proxies = proxies


    @classmethod    
    def cookiejar_from_dict(cls, cookie_dic):
        return requests.cookies.cookiejar_from_dict(cookie_dic)

    def __enter__(self):
        print 'enter'
        return self

    def __exit__(self, *used):
        self.req_s.close()
        del self.req_s
        print 'exit'


if __name__ == '__main__':
    with  SessionFopener() as req_o:
        res_p = req_o.get('http://httpbin.org/get')
        
    print res_p.json()  

 

Example: crawling shop reviews from Dianping (大众点评):

Only the parsing nodes and the paging URL format need to be overridden in a subclass.

External links are not handled for now.

# -*- coding: utf-8 -*-
"""
Created

2017年8月17日星期四
下午 19:33

@author: Administrator
"""
import sys
stdout, stdin, stderr = sys.stdout, sys.stdin, sys.stderr
reload(sys)
sys.stdout, sys.stdin, sys.stderr = stdout, stdin, stderr
sys.setdefaultencoding('utf8')  

sys.path.append(ur'D:\python_py\my_scrapy')
from scrapy_tools.web_opener import SessionFopener
from scrapy_tools.itemparse import ItemParse

class DzdpItemParse(ItemParse):
    """Reviews of Guangzhou Restaurant 广州酒家 (Wenchang branch) on Dianping."""
    name = u'DzdpItemParse广州酒家'

    base_url = 'https://www.dianping.com/shop/516983/review_more'
    pageN_x = ".//a[@class='PageLink'][last()]/text()"
    new_urls_x = None

    # XPath of the node that holds one record; each field below is searched inside it
    items_node_x = './/div[@class="comment-list"]/ul/li'
    # note: fields are looked up relative to one record node, so each XPath starts with '.'
    item_xs = dict(
        user_id = '''.//*[@class="J_card"]/@user-id''',
        #question_href = '''.//a[@class='question_link']/@href''' ,

        comm_per = """.//span[@class='comm-per']/text()""",
        total_mark = """.//*[@class="user-info"]/span[1]/@class""",
        taste = """.//*[@class="comment-rst"]/span[1]/text()""",
        environment = """.//*[@class="comment-rst"]/span[2]/text()""",
        service = """.//*[@class="comment-rst"]/span[3]/text()""",

        comments_agree = '''.//span[@class="heart-num"]/text()''',
        comment_text = """.//*[@class="J_brief-cont"]/text()""",
        comment_date = '''.//*[@class="time"]/text()''',
        recommend_food = \
        u'''.//*[@class="comment-recommend" \
        and (contains(text(),"推荐") \
        or contains(text(),"喜欢"))]\
        [1]/a/text()'''
        # Chinese text inside the XPath has to be a unicode string (and quoted)
        #re:
        #z = re.compile('\.')
        )    

    
    def getnextpages(self):
        # custom paging rule; returns an empty list when there is only one page
        if self.pageN > 1:
            urls = [self.base_url + '?pageno=%s' % n
                    for n in range(self.pageN, 1, -1)
                    ]
            return urls
        return []


open_s = SessionFopener()   # create an opener (one requests session)
respon_ = open_s.get(DzdpItemParse.base_url)  # fetch the start page
gzjj_item = DzdpItemParse(respon_.text)  # build the parser from the start page's html

# synchronous crawl: the plain method
gzjj_item.get_nextpages(open_s, sleep_sec=None)

# asynchronous alternative:
#gzjj_item.get_nextpages_by_gevent(SessionFopener)

 

Result: fetching a single page originally took 0.5279 s; with four coroutines, 613 pages were crawled in 77.71 s, about 0.13 s per page, roughly a 4x speedup.
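The per-page figure and the speedup follow directly from those totals:

# Quick sanity check on the reported throughput numbers.
total_seconds = 77.71
pages = 613
per_page = total_seconds / pages     # ~0.127 s per page
print per_page
print 0.5279 / per_page              # ~4.2x faster than fetching one page at a time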

200 https://www.dianping.com/shop/516983/review_more?pageno=600
1503074965.07
<Greenlet at 0x9c44620: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
200 https://www.dianping.com/shop/516983/review_more?pageno=602
1503074965.1
<Greenlet at 0x9c445d0: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
200 https://www.dianping.com/shop/516983/review_more?pageno=601
1503074965.14
<Greenlet at 0x9c44670: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
200 https://www.dianping.com/shop/516983/review_more?pageno=604
1503074965.54
<Greenlet at 0x9c44440: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
200 https://www.dianping.com/shop/516983/review_more?pageno=607
1503074965.59
<Greenlet at 0x9c44670: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
200 https://www.dianping.com/shop/516983/review_more?pageno=605
1503074965.64
<Greenlet at 0x9c44620: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
200 https://www.dianping.com/shop/516983/review_more?pageno=606
1503074965.67
<Greenlet at 0x9c445d0: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
200 https://www.dianping.com/shop/516983/review_more?pageno=611
1503074966.1
<Greenlet at 0x9c445d0: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
200 https://www.dianping.com/shop/516983/review_more?pageno=609
1503074966.15
<Greenlet at 0x9c44670: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
200 https://www.dianping.com/shop/516983/review_more?pageno=610
1503074966.18
<Greenlet at 0x9c44620: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
200 https://www.dianping.com/shop/516983/review_more?pageno=608
1503074966.22
<Greenlet at 0x9c44440: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
200 https://www.dianping.com/shop/516983/review_more?pageno=612
1503074966.7
200 https://www.dianping.com/shop/516983/review_more?pageno=614
1503074966.74
200 https://www.dianping.com/shop/516983/review_more?pageno=615
1503074967.05
200 https://www.dianping.com/shop/516983/review_more?pageno=613
1503074967.09
77.7100000381
DzdpItemParse广州酒家20170819_00_49.xlsx
pd ok
saved to DzdpItemParse广州酒家20170819_00_49.xlsx

 

For distributed multi-process crawling, or for loading the results into a database, a separate scheduler and a data-persistence module still have to be written; a rough sketch of the latter follows.
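Purely as an illustration of what the data-persistence piece might look like (not part of the framework above), the parsed records could be dumped into SQLite instead of Excel/JSON:

# Illustrative only: store the parsed records as JSON rows in an SQLite table.
import json
import sqlite3

def save_to_sqlite(results, db_path='crawl_results.db'):
    conn = sqlite3.connect(db_path)
    conn.execute('CREATE TABLE IF NOT EXISTS items (data TEXT)')
    conn.executemany('INSERT INTO items (data) VALUES (?)',
                     [(json.dumps(r),) for r in results])
    conn.commit()
    conn.close()

# e.g. save_to_sqlite(gzjj_item.results)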

posted @ 2017-08-18 19:03  willowj