pyspider示例代码五:实现自动翻页功能

实现自动翻页功能

示例代码一

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2016-05-19 00:21:31
# Project: v2ex

from pyspider.libs.base_handler import *
#import re


class Handler(BaseHandler):
    """Crawl v2ex.com topics, following board pagination links automatically.

    Crawl flow: home page -> tab pages -> node ("/go/") pages -> topic pages.
    Every request is issued with ``validate_cert=False``.
    """

    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl with the v2ex home page (scheduled via ``@every``)."""
        self.crawl('https://www.v2ex.com/', callback=self.index_page, validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue every tab link (``/?tab=...``) found on the home page."""
        for each in response.doc('a[href^="https://www.v2ex.com/?tab="]').items():
            self.crawl(each.attr.href, callback=self.tab_page, validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def tab_page(self, response):
        """Queue every node link (``/go/...``) found on a tab page."""
        for each in response.doc('a[href^="https://www.v2ex.com/go/"]').items():
            self.crawl(each.attr.href, callback=self.board_page, validate_cert=False)

    @config(priority=2)
    def board_page(self, response):
        """Queue every topic on a board page, then the board's other pages."""
        for each in response.doc('a[href^="https://www.v2ex.com/t/"]').items():
            url = each.attr.href
            # Topic links can carry a "#reply..." anchor; cut the URL at the
            # first '#' (presumably so one topic is not queued under
            # several different URLs).
            if url.find('#reply') > 0:
                url = url[0:url.find('#')]
            self.crawl(url, callback=self.detail_page, validate_cert=False)
        # Automatic pagination: each pager link is fed back into this same
        # callback, so the whole board is walked page by page.
        for each in response.doc('a.page_normal').items():
            self.crawl(each.attr.href, callback=self.board_page, validate_cert=False)

    @config(priority=2)
    def detail_page(self, response):
        """Extract the title, body and replies of a single topic.

        Returns a dict with the keys ``url``, ``title``, ``content`` (topic
        HTML with double quotes backslash-escaped) and ``reply_content``
        (a list of ``(member link text, reply text)`` tuples).
        """
        title = response.doc('h1').text()
        # PyQuery's .html() returns None when nothing matches (e.g. a topic
        # that has a title but no body); fall back to an empty string instead
        # of raising AttributeError on None.replace().
        content_html = response.doc('div.topic_content').html()
        content = (content_html or '').replace('"', '\\"')
        # NOTE(review): pyspider's response.doc makes links absolute, so the
        # relative prefix "/member/" may never match and this list may always
        # be empty -- confirm against a live page before relying on it.
        reply_content = [
            (member.text(), reply.text())
            for member, reply in zip(
                response.doc('a[href^="/member/"]').items(),
                response.doc('div.reply_content').items(),
            )
        ]
        # The original author left a placeholder here for inserting the
        # title/content into a database.
        return {
            "url": response.url,
            "title": title,
            "content": content,
            "reply_content": reply_content,
        }

示例代码二

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2015-01-04 10:42:01
# Project: tutorial_douban_movie

import re
from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    """
    Sample script accompanying "pyspider crawler tutorial (1): HTML and CSS selectors"
    http://blog.binux.me/2015/01/pyspider-tutorial-level-1-html-and-css-selector/
    """

    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl with the douban movie tag index."""
        seed_url = 'http://movie.douban.com/tag/'
        self.crawl(seed_url, callback=self.index_page)

    @config(age=24 * 60 * 60)
    def index_page(self, response):
        """Queue every absolute link whose URL contains 'tag'."""
        for anchor in response.doc('a[href^="http"]').items():
            href = anchor.attr.href
            if 'tag' not in href:
                continue
            self.crawl(href, callback=self.list_page)

    @config(age=10*24*60*60, priority=2)
    def list_page(self, response):
        """Queue each movie on a tag listing, then the listing's pager links."""
        movie_selector = 'HTML>BODY>DIV#wrapper>DIV#content>DIV.grid-16-8.clearfix>DIV.article>DIV>TABLE TR.item>TD>DIV.pl2>A'
        pager_selector = 'HTML>BODY>DIV#wrapper>DIV#content>DIV.grid-16-8.clearfix>DIV.article>DIV.paginator>A'

        for movie_link in response.doc(movie_selector).items():
            self.crawl(movie_link.attr.href, priority=9, callback=self.detail_page)

        # Pagination: pager links are handled by this same callback.
        for pager_link in response.doc(pager_selector).items():
            self.crawl(pager_link.attr.href, callback=self.list_page)

    @config(priority=3)
    def detail_page(self, response):
        """Return the URL, title, rating and director names of a movie page."""
        page_url = response.url
        title = response.doc('HTML>BODY>DIV#wrapper>DIV#content>H1>SPAN').text()
        rating = response.doc('#interest_sectl > div.rating_wrap.clearbox > div.rating_self.clearfix > strong').text()

        directors = []
        for node in response.doc('a[rel="v:directedBy"]').items():
            directors.append(node.text())

        result = {}
        result["url"] = page_url
        result["title"] = title
        result["rating"] = rating
        result["导演"] = directors
        return result

 

posted @ 2016-12-07 13:17  microman  阅读(2891)  评论(0)  编辑  收藏  举报