scrapy
1、Basic commands:
Global commands (usable anywhere, scrapy + command):

scrapy startproject myproject         # create a project; "myproject" is the project name, not a spider name
cd myproject
scrapy genspider baidu www.baidu.com  # create the spider file baidu.py with spider name "baidu"
                                      # the domain argument is optional; if set, only URLs under that domain are crawled
                                      # a project can hold several spiders; list them with `scrapy list`
scrapy settings                       # run inside a project directory to get that project's settings
scrapy runspider baidu.py             # takes the path of a spider file (not a project); runs a standalone spider, no project needed
scrapy shell http://www.baidu.com     # interactive debugging, e.g. checking whether selector rules are right:
                                      #   response.status / response.text / response.body
                                      #   response.xpath(...)
scrapy view http://www.baidu.com      # (or view(response) inside the shell) download the page and open it in a browser;
                                      # useful for telling which data is loaded via AJAX
Project-only commands (must be run inside a project):

scrapy crawl baidu --nolog            # run the spider named "baidu"; requires a project,
                                      # and make sure ROBOTSTXT_OBEY = False in settings.py
scrapy check                          # check the project for errors
scrapy list                           # list the spiders contained in the project
scrapy parse http://www.baidu.com/ --callback parse   # verify that the given callback parses the page correctly
scrapy bench                          # benchmark: see roughly how many pages per minute can be crawled

Official reference: https://docs.scrapy.org/en/latest/topics/commands.html
2、custom_settings
custom_settings is a dict of settings that override the project-level settings when this spider runs. It must be defined as a class attribute (above parse, not inside a method), because settings are loaded before the spider class is instantiated; the values in custom_settings then take precedence over those in settings.py.
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
from scrapy.http.cookies import CookieJar
from ..items import *

class ZhihuSpider(scrapy.Spider):
    name = "zhihu"
    allowed_domains = ["xxx", ]
    start_urls = ["xxxx", ]
    custom_settings = {
        "DEFAULT_REQUEST_HEADERS": {
            "User-Agent": "...",
            "Accept-Language": "...",
        }
    }
    cookie_dict = {}
3、start_requests: two ways to write it
def start_requests(self):
    for url in self.start_urls:
        yield Request(url=url, callback=self.parse)
def start_requests(self):
    req_list = []
    for url in self.start_urls:
        req_list.append(Request(url=url, callback=self.parse2))
    return req_list
    # Scrapy turns the return value (any iterable, here a list) into an iterator internally.
4、Cookie handling
4.1、POST requests
from scrapy.http import Request

req = Request(
    url='http://dig.chouti.com/login',
    method='POST',
    body='phone=8613121758648&password=woshiniba&oneMonth=1',
    headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
    cookies={},        # or cookies=self.cookie_dict, or meta={'cookiejar': True}
    callback=self.parse_check_login,
)
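As a side note, instead of hand-encoding the body you can let FormRequest build the urlencoded body from a dict. A minimal sketch, reusing the placeholder login URL and fields from above:

from scrapy.http import FormRequest

req = FormRequest(
    url='http://dig.chouti.com/login',
    formdata={
        'phone': '8613121758648',
        'password': 'woshiniba',
        'oneMonth': '1',
    },
    # with formdata given, FormRequest defaults to POST and sets the
    # application/x-www-form-urlencoded Content-Type header itself
    callback=self.parse_check_login,
)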
4.2、Handling cookies manually
def parse(self, response):
    print(response.headers.getlist('Set-Cookie'))   # the raw cookies

    from scrapy.http.cookies import CookieJar       # the parsed cookies
    cookie_jar = CookieJar()
    cookie_jar.extract_cookies(response, response.request)
    for k, v in cookie_jar._cookies.items():
        for i, j in v.items():
            for m, n in j.items():
                self.cookie_dict[m] = n.value
    print(self.cookie_dict)

    req = Request(
        url='http://dig.chouti.com/login',
        method='POST',
        headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
        body='phone=8615131255089&password=pppppppp&oneMonth=1',
        cookies=self.cookie_dict,
        callback=self.check_login
    )
    yield req

def check_login(self, response):
    yield Request(
        url='http://dig.chouti.com/link/vote?linksId=%s' % (link_id,),
        method='POST',
        cookies=self.cookie_dict,
        callback=self.do_favor
    )

def do_favor(self, response):
    print(response.text)
4.3、Automatic cookie handling: meta={'cookiejar': True}
The settings file controls whether cookie handling is allowed:
    # Disable cookies (enabled by default)
    # COOKIES_ENABLED = False        # around line 36 of settings.py
class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/', ]

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse_index, meta={'cookiejar': True})

    def parse_index(self, response):
        req = Request(
            url='http://dig.chouti.com/login',
            method='POST',
            headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
            body='phone=8613121758648&password=woshiniba&oneMonth=1',
            callback=self.parse_check_login,
            meta={'cookiejar': True}
        )
        yield req

    def parse_check_login(self, response):
        # print(response.text)
        yield Request(
            url='https://dig.chouti.com/link/vote?linksId=19440976',
            method='POST',
            callback=self.parse_show_result,
            meta={'cookiejar': True},
        )

    def parse_show_result(self, response):
        print(response.text)
4.4、Other cookie usage
Simulating a browser login:
    start_requests() returns the spider's initial requests; they play the role of start_urls and
    replace the requests that would otherwise be generated from start_urls.
    Request() sends a GET request; url, cookies and the callback can all be set on it.
    FormRequest.from_response() submits a form via POST; its first, required argument is the response
    object carrying the cookies of the previous request, and it also accepts cookies, url, form data, etc.
    yield Request() hands a new request back to the engine for scheduling.

Cookie handling when sending requests:
    meta={'cookiejar': 1} turns on cookie recording; put it on the very first Request().
    meta={'cookiejar': response.meta['cookiejar']} reuses the cookies of the previous response;
    put it on the FormRequest.from_response() that performs the authorizing POST.
    meta={'cookiejar': True} sends the authorized cookies when visiting pages that require login.

Reading cookies in Scrapy:
    request cookies:  Cookie = response.request.headers.getlist('Cookie'); print(Cookie)
    response cookies: Cookie2 = response.headers.getlist('Set-Cookie'); print(Cookie2)
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request, FormRequest

class PachSpider(scrapy.Spider):                    # a spider class must inherit from scrapy.Spider
    name = 'pach'                                   # spider name
    allowed_domains = ['edu.iqianyue.com']          # allowed crawl domain
    # start_urls = ['http://edu.iqianyue.com/index_user_login.html']
    # start_urls only suits requests that need no login, because cookies etc. cannot be set on them
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0'}  # browser user agent

    def start_requests(self):                       # use start_requests() instead of start_urls
        """First request the login page with cookie recording enabled, so we receive cookies; set the callback."""
        return [Request('http://edu.iqianyue.com/index_user_login.html', meta={'cookiejar': 1}, callback=self.parse)]

    def parse(self, response):                      # callback for the login page
        data = {                                    # login fields, matching what packet capture shows
            'number': 'adc8868',
            'passwd': '279819',
            'submit': ''
        }
        # response cookies: what the backend wrote on the first visit to the login page
        Cookie1 = response.headers.getlist('Set-Cookie')
        print(Cookie1)
        print('logging in')
        """Second request: a form POST carrying the cookies, browser agent and login fields, to authorize the cookies."""
        return [FormRequest.from_response(response,
                                          url='http://edu.iqianyue.com/index_user_login',  # the real POST address
                                          meta={'cookiejar': response.meta['cookiejar']},
                                          headers=self.header,
                                          formdata=data,
                                          callback=self.next,
                                          )]

    def next(self, response):
        a = response.body.decode("utf-8")           # inspect the response after logging in
        # print(a)
        """After logging in, request a page that requires login (e.g. the user center) with the authorized cookies."""
        yield Request('http://edu.iqianyue.com/index_user_index.html', meta={'cookiejar': True}, callback=self.next2)

    def next2(self, response):
        # request cookies
        Cookie2 = response.request.headers.getlist('Cookie')
        print(Cookie2)
        body = response.body                        # page content as bytes
        unicode_body = response.body_as_unicode()   # page content as str
        a = response.xpath('/html/head/title/text()').extract()  # title of the user-center page
        print(a)
5、xpath
# from scrapy.selector import HtmlXPathSelector
# hxs = HtmlXPathSelector(response=response)
# div_tag = hxs.xpath("//div[@id='content-list']/div[@class='item']")
# You can call response.xpath() directly; there is no need to build an HtmlXPathSelector.
div_tag = response.xpath("//div[@id='content-list']/div[@class='item']")
for div in div_tag:
    a_text = div.xpath(".//div[@class='part1']/a[1]/text()").extract_first()
    a_href = div.xpath(".//div[@class='part1']/a[1]/@href").extract_first(default="not found")
    print(a_text, a_href)

    from ..items import *
    item = XianglongItem(title=a_text, href=a_href)
    yield item

pages = response.xpath("...").extract()   # selector for the pagination links (omitted in the original notes)
for page in pages:
    page_url = "xxxxx" + page
    yield Request(url=page_url, callback=self.parse)
    # This keeps crawling page after page; limit the depth with DEPTH_LIMIT = 1 in settings.py.
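The XianglongItem yielded above comes from items.py. A minimal sketch of what such an item class could look like, assuming the title/href fields used above (the class name is taken from the import; the file layout is the standard one scrapy startproject generates):

# items.py -- a minimal sketch matching the fields used above
import scrapy

class XianglongItem(scrapy.Item):
    title = scrapy.Field()
    href = scrapy.Field()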
6、Pipelines: process the item or raise DropItem
# settings.py ("projectname" is a placeholder for your project; pipeline keys are dotted class paths)
ITEM_PIPELINES = {
    'projectname.pipelines.TextPipeline': 300,
    'projectname.pipelines.MongoPipeline': 400,
}
MONGO_URL = "localhost"
MONGO_DB = "quotes"
from scrapy.exceptions import DropItem

class TextPipeline(object):
    def __init__(self):
        self.limit = 50

    def process_item(self, item, spider):
        if item["text"]:
            if len(item["text"]) > self.limit:
                item["text"] = item["text"][0:self.limit].rstrip() + "..."
            return item
        else:
            raise DropItem("Missing Text")
import pymongo

class MongoPipeline(object):
    def __init__(self, mongo_url, mongo_db):
        self.mongo_url = mongo_url
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Typically used to read settings; the same pattern also works inside a spider.
        return cls(
            mongo_url=crawler.settings.get("MONGO_URL"),
            mongo_db=crawler.settings.get("MONGO_DB"),
        )
        # The values fetched here are passed on as the __init__ arguments above.

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_url)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        name = item.__class__.__name__   # could be used as the collection name
        self.db["quotes"].insert(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
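As the comment in from_crawler notes, the same pattern works inside a spider too. A rough sketch, with a made-up spider name and reusing the MONGO_URL setting only as an example:

import scrapy

class SettingsAwareSpider(scrapy.Spider):
    name = "settings_aware"

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # Build the spider as usual, then attach a value read from settings.py.
        spider = super(SettingsAwareSpider, cls).from_crawler(crawler, *args, **kwargs)
        spider.mongo_url = crawler.settings.get("MONGO_URL")
        return spider

    def parse(self, response):
        print(self.mongo_url)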
import json

class JsonPipeline(object):
    def open_spider(self, spider):
        self.file = open("items.jl", "w")

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()
from scrapy.exceptions import DropItem

class DropPipeline(object):
    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        if item["id"] in self.ids_seen:
            raise DropItem("missing")
        else:
            self.ids_seen.add(item["id"])
            return item
7、dupefilter
7.1、Reading the source
By default Scrapy deduplicates requests with scrapy.dupefilter.RFPDupeFilter (scrapy.dupefilters.RFPDupeFilter in newer versions).
dupefilter.py (source) --- mind the return value of request_seen: True means the request has already been seen, False means it has not.
DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'
DUPEFILTER_DEBUG = False
JOBDIR = "directory for the seen-requests log, e.g. /root/"   # the final path becomes /root/requests.seen
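JOBDIR is also what enables pausing and resuming a crawl: when set, the seen-request fingerprints and the pending request queue are persisted under that directory, so a later run with the same JOBDIR continues where the previous one stopped. A usage sketch (spider name and directory are placeholders):

scrapy crawl chouti -s JOBDIR=crawls/chouti-1     # Ctrl-C pauses the crawl
scrapy crawl chouti -s JOBDIR=crawls/chouti-1     # same JOBDIR again resumes it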
RFPDupeFilter inherits from BaseDupeFilter and builds on request_fingerprint:
class RFPDupeFilter(BaseDupeFilter):
    def __init__(self, path=None, debug=False):
        self.fingerprints = set()                    # the set of fingerprints seen so far

    def request_seen(self, request):                 # the key method
        fp = self.request_fingerprint(request)       # the request's fingerprint
        if fp in self.fingerprints:
            return True                              # already seen: the request will not be scheduled again
        self.fingerprints.add(fp)
        return False

    def request_fingerprint(self, request):          # compute the fingerprint
        return request_fingerprint(request)

# The Redis-backed variant (e.g. in scrapy-redis) does the same thing against a Redis set:
#     added = self.server.sadd(self.key, fp)         # sadd returns how many new members were added
#     return added == 0                              # 0 means the fingerprint already existed, so the request is skipped
Extension: request_fingerprint builds a unique identifier for a request.
# The two URLs below differ only in the order of their query parameters,
# so request_fingerprint produces the same fingerprint for both:
#   http://www.oldboyedu.com?id=1&age=2
#   http://www.oldboyedu.com?age=2&id=1
from scrapy.utils.request import request_fingerprint
from scrapy.http import Request

u1 = Request(url='http://www.oldboyedu.com?id=1&age=2')
u2 = Request(url='http://www.oldboyedu.com?age=2&id=1')
result1 = request_fingerprint(u1)
result2 = request_fingerprint(u2)
print(result1, result2)
7.2、Writing your own dupefilter
class MyDupeFilter(BaseDupeFilter):
    def __init__(self):
        self.visited_url = set()

    @classmethod
    def from_settings(cls, settings):
        """Called when the dupefilter is created."""
        return cls()

    def request_seen(self, request):
        """Check whether this request has already been visited.
        Returns True if it has, False if it has not."""
        if request.url in self.visited_url:
            return True
        self.visited_url.add(request.url)
        return False

    def open(self):
        """Called when the crawl starts."""
        print('open replication')

    def close(self, reason):
        """Called when the crawl finishes."""
        print('close replication')

    def log(self, request, spider):
        """Log a duplicate request."""
        print('repeat', request.url)
DUPEFILTER_CLASS = 'xxxxxxx.dupe.MyDupeFilter'
from scrapy.dupefilter import BaseDupeFilter    # scrapy.dupefilters in newer Scrapy versions
from scrapy.utils.request import request_fingerprint

class MyDupeFilter(BaseDupeFilter):
    def __init__(self):
        self.record = set()

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):
        print('deduplicating', request.url)
        ident = request_fingerprint(request)
        if ident in self.record:
            print('already visited', request.url, ident)
            return True
        self.record.add(ident)
        return False
        # Why True/False? Because scheduler.py checks this return value (see section 8).

    def open(self):        # can return a deferred
        pass

    def close(self, reason):   # can return a deferred
        pass
8、scheduler
# Request objects can all go into a queue held in memory: self.q = deque()
# ... or into a queue maintained on disk via file operations (the scheduler's disk queues; the in-memory ones are its mqs).
from scrapy.core.scheduler import Scheduler

# (simplified methods of the Scheduler class)
def enqueue_request(self, request):
    if not request.dont_filter and self.df.request_seen(request):
        self.df.log(request, self.spider)
        return False
    dqok = self._dqpush(request)
    if dqok:
        pass
    else:
        pass
    return True

# How dont_filter works:
# request.dont_filter = False (the default):
#     self.df.request_seen(request) is consulted:
#     - True  (already visited): the if-branch runs and the request is dropped (return False)
#     - False (not visited yet): the if-branch is skipped and dqok = self._dqpush(request) enqueues it
# request.dont_filter = True:
#     the if-branch is always skipped, so every request is pushed into the scheduler

def next_request(self):
    pass
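In day-to-day spider code the flag is simply set on the request. A one-line sketch (URL and callback are placeholders) of deliberately bypassing the dupefilter for a page that has to be fetched again:

yield Request(url='http://dig.chouti.com/', callback=self.parse, dont_filter=True)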
9、Downloader middleware --- request headers, proxies, certificates, cookies
9.1、How to attach a request header to every request the spider sends?
Option 1: add the header to each Request object individually (see the sketch below). The alternatives that follow set it globally: option 2 uses a downloader middleware, option 3 the USER_AGENT setting.
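A minimal sketch of option 1, setting the header on a single Request (URL, header value and callback are placeholders):

yield Request(
    url='https://dig.chouti.com/',
    headers={'User-Agent': 'Mozilla/5.0 ...'},
    callback=self.parse,
)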
Option 2: a downloader middleware that stamps the header onto every request.

Configuration:
    DOWNLOADER_MIDDLEWARES = {
        'xianglong.middlewares.UserAgentDownloaderMiddleware': 543,
    }

The middleware class:
    class UserAgentDownloaderMiddleware(object):
        @classmethod
        def from_crawler(cls, crawler):
            s = cls()
            return s

        def process_request(self, request, spider):
            request.headers['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"

            # return None                            # continue with the remaining middlewares' process_request
            # from scrapy.http import Request
            # return Request(url='www.baidu.com')    # put a new request back into the scheduler; the current request stops here
            # from scrapy.http import HtmlResponse   # run every process_response, starting from the last middleware
            # return HtmlResponse(url='www.baidu.com', body=b'asdfuowjelrjaspdoifualskdjf;lajsdf')

        def process_response(self, request, response, spider):
            return response

        def process_exception(self, request, exception, spider):
            pass
Option 3: set a global default in the settings file:
    USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
9.2、How to add a proxy in Scrapy?
Using the built-in scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware, which picks the proxy up from environment variables:

import os
import scrapy
from scrapy.http import Request

class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']

    def start_requests(self):
        os.environ['HTTP_PROXY'] = "http://192.168.11.11"
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse)

    def parse(self, response):
        print(response)
Or a custom proxy middleware:

import random
import base64
import six

def to_bytes(text, encoding=None, errors='strict'):
    """Return the binary representation of `text`.
    If `text` is already a bytes object, return it as-is."""
    if isinstance(text, bytes):
        return text
    if not isinstance(text, six.string_types):
        raise TypeError('to_bytes must receive a unicode, str or bytes '
                        'object, got %s' % type(text).__name__)
    if encoding is None:
        encoding = 'utf-8'
    return text.encode(encoding, errors)

class MyProxyDownloaderMiddleware(object):
    def process_request(self, request, spider):
        proxy_list = [
            {'ip_port': '111.11.228.75:80', 'user_pass': 'xxx:123'},
            {'ip_port': '120.198.243.22:80', 'user_pass': ''},
            {'ip_port': '111.8.60.9:8123', 'user_pass': ''},
            {'ip_port': '101.71.27.120:80', 'user_pass': ''},
            {'ip_port': '122.96.59.104:80', 'user_pass': ''},
            {'ip_port': '122.224.249.122:8088', 'user_pass': ''},
        ]
        proxy = random.choice(proxy_list)
        request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
        if proxy['user_pass']:
            # base64-encode "user:password" for the Proxy-Authorization header
            encoded_user_pass = base64.b64encode(to_bytes(proxy['user_pass'])).decode('ascii')
            request.headers['Proxy-Authorization'] = to_bytes('Basic ' + encoded_user_pass)

Configuration:
    DOWNLOADER_MIDDLEWARES = {
        # 'xiaohan.middlewares.MyProxyDownloaderMiddleware': 543,
    }
9.3、Certificates
There are two cases when crawling over HTTPS:

1. The target site uses a certificate signed by a trusted CA (supported by default):
    DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
    DOWNLOADER_CLIENTCONTEXTFACTORY = "scrapy.core.downloader.contextfactory.ScrapyClientContextFactory"

2. The target site uses a custom certificate:
    # settings.py
    DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
    DOWNLOADER_CLIENTCONTEXTFACTORY = "projectname.https.MySSLFactory"   # replace "projectname" with your project

    # https.py
    from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
    from twisted.internet.ssl import (optionsForClientTLS, CertificateOptions, PrivateCertificate)

    class MySSLFactory(ScrapyClientContextFactory):
        def getCertificateOptions(self):
            from OpenSSL import crypto
            v1 = crypto.load_privatekey(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.key.unsecure', mode='r').read())
            v2 = crypto.load_certificate(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.pem', mode='r').read())
            return CertificateOptions(
                privateKey=v1,       # a PKey object
                certificate=v2,      # an X509 object
                verify=False,
                method=getattr(self, 'method', getattr(self, '_ssl_method', None))
            )

Related classes:
    scrapy.core.downloader.handlers.http.HttpDownloadHandler
    scrapy.core.downloader.webclient.ScrapyHTTPClientFactory
    scrapy.core.downloader.contextfactory.ScrapyClientContextFactory
10、Spider middleware
class XiaohanSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.
    def __init__(self):
        pass

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        return s

    # Runs after each download finishes, before the parse callback is executed.
    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        print('process_spider_input', response)
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        print('process_spider_output', response)
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    # Triggered when the spider starts and start_requests runs for the first time (only once).
    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        print('process_start_requests')
        for r in start_requests:
            yield r
11、Extensions and signals
11.1、A plain extension
# extends.py
class MyExtension(object):
    def __init__(self):
        pass

    @classmethod
    def from_crawler(cls, crawler):
        obj = cls()
        return obj

Configuration:
    EXTENSIONS = {
        'xiaohan.extends.MyExtension': 500,
    }
11.2、Extension + signals
from scrapy import signals

class MyExtension(object):
    def __init__(self):
        pass

    @classmethod
    def from_crawler(cls, crawler):
        obj = cls()
        # When a spider opens, trigger every function connected to the spider_opened signal: xxxxxxxxxxx1
        crawler.signals.connect(obj.xxxxxxxxxxx1, signal=signals.spider_opened)
        # When a spider closes, trigger every function connected to the spider_closed signal: uuuuuuuuuu
        crawler.signals.connect(obj.uuuuuuuuuu, signal=signals.spider_closed)
        return obj

    def xxxxxxxxxxx1(self, spider):
        print('open')

    def uuuuuuuuuu(self, spider):
        print('close')

Configuration (around line 62 of settings.py):
    EXTENSIONS = {
        'xiaohan.extends.MyExtension': 500,
    }
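Other built-in signals can be wired up the same way. A rough sketch (the class name, counter attribute and messages are made up for illustration) that uses the item_scraped signal to count how many items a spider produced:

from scrapy import signals

class ItemCounterExtension(object):
    def __init__(self):
        self.count = 0

    @classmethod
    def from_crawler(cls, crawler):
        obj = cls()
        # item_scraped fires once for every item that makes it through the pipelines
        crawler.signals.connect(obj.on_item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(obj.on_spider_closed, signal=signals.spider_closed)
        return obj

    def on_item_scraped(self, item, response, spider):
        self.count += 1

    def on_spider_closed(self, spider, reason):
        print('items scraped:', self.count)

# Register it in EXTENSIONS just like the extension above, e.g.
#     EXTENSIONS = {'xiaohan.extends.ItemCounterExtension': 500}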
12、Custom commands
Create a commands folder (with an __init__.py so it is importable) and inside it a crawlall.py -- the file name becomes the command name. Add the setting COMMANDS_MODULE = "sp3.commands" to settings.py, then run the command from the project directory: scrapy crawlall (straight from the shell).
from scrapy.commands import ScrapyCommand
from scrapy.utils.project import get_project_settings

class Command(ScrapyCommand):
    requires_project = True

    def syntax(self):
        return '[options]'

    def short_desc(self):
        return 'Runs all of the spiders'

    def run(self, args, opts):
        spider_list = self.crawler_process.spiders.list()
        for name in spider_list:
            self.crawler_process.crawl(name, **opts.__dict__)
        self.crawler_process.start()
def run(self, args, opts):
    from scrapy.crawler import CrawlerProcess
    # CrawlerProcess.crawl
    # CrawlerProcess.start
    # self.crawler_process keeps the running crawlers in: _active = {d, }
    self.crawler_process.crawl('chouti', **opts.__dict__)
    self.crawler_process.crawl('cnblogs', **opts.__dict__)
    self.crawler_process.start()