pyspider Example Code 4: Crawling Search Engines
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2016-03-23 08:25:01
# Project: __git_lab_fix
# Crawl GitLab issue search results for a list of sensitive keywords and
# extract the description of every matching issue.

from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        # Sensitive keywords to search for (credentials, tokens, vendor names, ...)
        keywords = ['bigsec', 'password', 'email', 'tongdun', 'vpn', 'address',
                    'pop3', 'smtp', 'imap', 'zhengxin', 'jdbc', 'mysql', 'credit',
                    'access_token', 'client_secret', 'privatekey', 'secret_key',
                    'xiecheng', 'ctrip', 'tongcheng']
        for u in keywords:
            url = 'https://gitlab.com/search?group_id=&scope=issues&search=' + u
            self.crawl(url, callback=self.index_page)

    @config(age=10)
    def index_page(self, response):
        # Follow the "next page" link, then queue every external result link on this page.
        self.crawl(response.doc('.next > a').attr.href, callback=self.index_page)
        for each in response.doc('h4 > a[href^="http"]').items():
            # print each.text()
            self.crawl(each.attr.href, callback=self.detail_page)

    @config(etag=True)
    def detail_page(self, response):
        # Return the first issue description found on the detail page.
        for each in response.doc('.detail-page-description').items():
            return {
                "app": "githack",
                "origin": "gitlab.net",
                "code": each.text(),
            }
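The keywords above are plain ASCII, so bare string concatenation is enough; if the search terms ever include spaces or non-ASCII text, they should be URL-encoded first. A minimal sketch of a hypothetical build_search_url helper (not part of the original script), using only the standard library:

# Hypothetical helper, not in the original script: URL-encode a keyword
# before appending it to the GitLab search URL.
from urllib import quote_plus  # Python 3: from urllib.parse import quote_plus


def build_search_url(keyword):
    # quote_plus escapes spaces and special characters so the query stays valid,
    # e.g. build_search_url('secret key') ends in 'search=secret+key'
    return 'https://gitlab.com/search?group_id=&scope=issues&search=' + quote_plus(keyword)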
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2016-04-27 14:30:30
# Project: __git_zhibo
# Crawl GitHub code search results for live-streaming platform keywords and
# extract the file contents of matching results.

from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        # Live-streaming platform keywords to search for on GitHub code search.
        keywords = ['douyu', 'panda', 'zhanqi', 'longzhu', 'huya', 'yy', 'momo', 'tv']
        for qu in keywords:
            url = 'https://github.com/search?p=1&q=' + qu + '&type=Code&utf8'
            self.crawl(url, callback=self.index_page)

    @config(age=1)
    def index_page(self, response):
        # Follow the "next page" link, then queue every second result title.
        self.crawl(response.doc('.next_page').attr.href, callback=self.index_page)
        flag = 0
        for each in response.doc('.title > a').items():
            flag += 1
            if flag % 2 == 0:
                self.crawl(each.attr.href, callback=self.into_page)

    @config(age=1, etag=True)
    def into_page(self, response):
        # Return the rendered file content (GitHub renders it as a table of lines).
        for each in response.doc('table').items():
            return {
                "app": "githack",
                "origin": "github.net",
                "code": each.text(),
            }
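Both scripts only return a dict from their detail callback, which pyspider writes to its built-in resultdb by default. To keep a local copy as well, the handler can override on_result; the sketch below appends each record to a JSON-lines file (the file name is an assumption, not part of the original scripts):

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Sketch only: also persist results to a local JSON-lines file.
import json

from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    # ... on_start / index_page / detail callbacks as in the scripts above ...

    def on_result(self, result):
        # pyspider calls on_result with whatever a callback returned;
        # it is None for callbacks that returned nothing.
        if not result:
            return
        with open('githack_results.jsonl', 'a') as f:  # assumed file name
            f.write(json.dumps(result) + '\n')
        # keep the default behavior (write to resultdb) as well
        super(Handler, self).on_result(result)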