pyspider Example Code 4: Crawling Search Engines

Two pyspider scripts that crawl search result pages: the first walks GitLab's issue search for a list of credential-related keywords (passwords, keys, tokens and the like), the second walks GitHub's code search for a different keyword list. Both follow pagination, open the hits, and return the matched content as a result record.

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2016-03-23 08:25:01
# Project: __git_lab_fix

from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        # Keywords (credentials, keys, and related terms) to search for in GitLab issues.
        keywords = ['bigsec', 'password', 'email', 'tongdun', 'vpn', 'address', 'pop3',
                    'smtp', 'imap', 'zhengxin', 'jdbc', 'mysql', 'credit', 'access_token',
                    'client_secret', 'privatekey', 'secret_key', 'xiecheng', 'ctrip', 'tongcheng']
        for keyword in keywords:
            url = 'https://gitlab.com/search?group_id=&scope=issues&search=' + keyword
            self.crawl(url, callback=self.index_page)

    @config(age=10)
    def index_page(self, response):
        # Follow the pagination link, if there is one.
        next_url = response.doc('.next > a').attr.href
        if next_url:
            self.crawl(next_url, callback=self.index_page)
        # Queue every external link found in the search results.
        for each in response.doc('h4 > a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page)
            
    @config(etag=True)
    def detail_page(self, response):
        # Return the first description block on the page as the result record.
        for each in response.doc('.detail-page-description').items():
            return {
                "app": "githack",
                "origin": "gitlab.net",
                "code": each.text(),
            }
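
detail_page returns a plain dict, which pyspider stores in its result database. As an optional extension (not part of the original script; results.jsonl is just an assumed output path), a minimal sketch of an on_result override that also mirrors each record to a local JSON-lines file could be dropped into the Handler class:

    # Hypothetical extra method for the Handler class above: mirror each returned
    # record into a local file while keeping pyspider's default result handling.
    def on_result(self, result):
        import json  # kept local so the method can be pasted in as-is
        if result:
            with open('results.jsonl', 'a') as f:  # assumed output path
                f.write(json.dumps(result) + '\n')
        super(Handler, self).on_result(result)  # push to pyspider's result queue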
The second script applies the same pattern to GitHub's code search, cycling through a different keyword list.

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2016-04-27 14:30:30
# Project: __git_zhibo

from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        # Keywords to search for in GitHub code search.
        keywords = ['douyu', 'panda', 'zhanqi', 'longzhu', 'huya', 'yy', 'momo', 'tv']
        for keyword in keywords:
            url = 'https://github.com/search?p=1&q=' + keyword + '&type=Code&utf8'
            self.crawl(url, callback=self.index_page)

    @config(age=1)
    def index_page(self, response):
        # Follow the pagination link, if there is one.
        next_url = response.doc('.next_page').attr.href
        if next_url:
            self.crawl(next_url, callback=self.index_page)
        # Only follow every second title link in the result list.
        flag = 0
        for each in response.doc('.title > a').items():
            flag += 1
            if flag % 2 == 0:
                self.crawl(each.attr.href, callback=self.into_page)
                
    @config(age=1, etag=True)
    def into_page(self, response):
        # Return the text of the first table on the page as the result record.
        for each in response.doc('table').items():
            return {
                "app": "githack",
                "origin": "github.net",
                "code": each.text(),
            }
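
Both scripts leave crawl_config empty. If GitHub or GitLab throttles or blocks the default client, one optional tweak (a sketch, not part of the original code; the User-Agent string and proxy value are placeholders) is to set shared request headers or a proxy there, since crawl_config options are applied to every self.crawl() call in the handler:

from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    crawl_config = {
        # Shared settings merged into every self.crawl() issued by this handler.
        'headers': {
            'User-Agent': 'Mozilla/5.0 (compatible; pyspider-demo)',  # placeholder UA
        },
        # 'proxy': '127.0.0.1:8080',  # optional HTTP proxy (placeholder address)
    }

To try either script, paste it into a new project in the pyspider web UI and run it from there; the returned dicts show up under the project's results.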

 
