scrapy
I. Recursive parsing (handling pagination)
In the spider file:
import scrapy
from choutidemo.items import ChoutidemoItem

class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    start_urls = ['https://dig.chouti.com/r/scoff/hot/']
    page = 2

    def parse(self, response):
        div_list = response.xpath('//div[@id="content-list"]/div')
        for div in div_list:
            # instantiate a fresh item for every entry (see the note in section IV)
            item = ChoutidemoItem()
            actor = div.xpath('./div[2]/div[2]/a[4]/b/text() | ./div[3]/div[2]/a[4]/b/text()').extract_first()
            content = div.xpath('./div[2]/div[1]/a/text() | ./div[3]/div[1]/a/text()').extract_first()
            item["actor"] = actor
            item["content"] = content
            print(content)
            yield item

        # recursive parsing: handle the pagination
        if self.page <= 5:
            url = 'https://dig.chouti.com/r/scoff/hot/%d' % self.page
            self.page = self.page + 1
            yield scrapy.Request(url=url, callback=self.parse)  # send the request manually
In items.py:
import scrapy

class ChoutidemoItem(scrapy.Item):
    # define the fields for your item here like:
    actor = scrapy.Field()
    content = scrapy.Field()
In the pipeline file:
class ChoutidemoPipeline(object):
    f = None

    def open_spider(self, spider):
        print("Spider started!")
        self.f = open('./chouti1.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.f.write(item['actor'] + ':' + item['content'] + "\n")
        return item

    def close_spider(self, spider):
        print("Spider finished!")
        self.f.close()
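For the pipeline to actually run, it also has to be registered in settings.py. A minimal sketch, assuming the project is named choutidemo (as the import path in the spider suggests); 300 is just the usual default priority:

# settings.py -- register the item pipeline (lower number = higher priority)
ITEM_PIPELINES = {
    'choutidemo.pipelines.ChoutidemoPipeline': 300,
}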
II. POST requests (Baidu Translate as the example)
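A minimal sketch of a POST spider. By default the URLs in start_urls are fetched with GET, so start_requests() is overridden to yield a scrapy.FormRequest, which sends the form data in a POST body. The spider name, the /sug endpoint and the kw field are illustrative assumptions based on Baidu Translate's public suggestion interface, not values taken from a verified project:

import scrapy

class FanyiSpider(scrapy.Spider):
    name = 'fanyi'  # illustrative spider name
    start_urls = ['https://fanyi.baidu.com/sug']  # assumed suggestion endpoint

    # overriding start_requests() is what turns the default GET into a POST
    def start_requests(self):
        data = {'kw': 'dog'}  # form data carried in the POST body
        for url in self.start_urls:
            yield scrapy.FormRequest(url=url, formdata=data, callback=self.parse)

    def parse(self, response):
        # the endpoint answers with JSON; just print it here
        print(response.text)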
III. Cookie handling
With Scrapy you do not need to manage cookies yourself; the framework handles them for you. You do, however, have to send the login request first, so the session cookie exists before the protected pages are requested.
Usage: adjust the settings file
COOKIES_ENABLED = True   # this is already Scrapy's default; set it explicitly if it was disabled
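A minimal sketch of the usual login-first pattern. The URLs, form fields and spider name are placeholders, not taken from a real site; the point is only that the cookie set by the login response is attached to every follow-up request automatically:

import scrapy

class LoginDemoSpider(scrapy.Spider):
    name = 'login_demo'  # placeholder name
    start_urls = ['https://example.com/login']  # placeholder login page

    def parse(self, response):
        # 1. post the credentials; Scrapy keeps the session cookie it gets back
        yield scrapy.FormRequest(
            url='https://example.com/login',          # placeholder
            formdata={'user': 'xxx', 'pwd': 'xxx'},   # placeholder fields
            callback=self.after_login,
        )

    def after_login(self, response):
        # 2. this request carries the session cookie without any extra code
        yield scrapy.Request(url='https://example.com/profile', callback=self.parse_profile)

    def parse_profile(self, response):
        print(response.text)  # a page that is only reachable when logged in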
IV. Passing parameters between requests
In the spider file:
import scrapy
from getmovie.items import GetmovieItem

class MovieSpider(scrapy.Spider):
    name = 'movie'
    start_urls = ['https://www.4567tv.tv/frim/index6.html']

    def parse_detail(self, response):
        # 2. pull the item back out of the request meta
        item = response.meta["item"]
        content = response.xpath('//div[@class="stui-content__item clearfix"]/div[2]/p[5]/span[@class="detail-sketch"]/text()').extract_first()
        item["content"] = content
        yield item

    def parse(self, response):
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            # note: the item must be instantiated here, inside the loop;
            # otherwise item["name"] always ends up holding the last value
            item = GetmovieItem()
            name = li.xpath('./div/div/h4/a/text()').extract_first()
            href = 'https://www.4567tv.tv' + li.xpath('./div/div/h4/a/@href').extract_first()
            item["name"] = name
            # 1. pass the item along via the meta parameter
            yield scrapy.Request(url=href, callback=self.parse_detail, meta={"item": item})
In items.py:
import scrapy

class GetmovieItem(scrapy.Item):
    name = scrapy.Field()
    content = scrapy.Field()
In the pipeline file:
class GetmoviePipeline(object):
    def open_spider(self, spider):
        print("Spider started!")
        self.f = open('./movie.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.f.write(item['name'] + ':' + item['content'] + "\n")
        return item

    def close_spider(self, spider):
        print("Spider finished!")
        self.f.close()
V. Log levels
In the settings file:
LOG_LEVEL = 'ERROR'      # show errors only
LOG_LEVEL = 'WARNING'    # show warnings and errors
LOG_FILE = './log.txt'   # write the log output to a text file instead of the console
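These settings apply to the whole project. If only one spider should be quieter, Scrapy also supports per-spider overrides through the custom_settings attribute; a minimal sketch with an illustrative spider name and URL:

import scrapy

class QuietSpider(scrapy.Spider):
    name = 'quiet_demo'                    # illustrative name
    start_urls = ['https://example.com']   # placeholder
    # overrides the project-wide settings for this spider only
    custom_settings = {
        'LOG_LEVEL': 'ERROR',
    }

    def parse(self, response):
        pass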
VI. The five core components
Engine:
Handles the data flow of the whole system and triggers the events that drive it (the core of the framework).
Scheduler:
Accepts the requests handed over by the engine, pushes them onto a queue, and returns them when the engine asks again. Think of it as a priority queue of URLs: it decides which URL gets crawled next and also filters out duplicate URLs.
Downloader:
Downloads the page content and hands it back to the engine, which forwards it to the spider (the downloader is built on twisted, an efficient asynchronous model).
Spider:
Extracts the information you need from specific pages. It can also extract links, which are used to crawl the next pages.
Pipeline:
Processes the items the spider extracts from pages. Its main jobs are persisting items, validating them, and dropping unwanted data.
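A minimal sketch annotating which component touches each step of one crawl cycle; the spider name and URLs are placeholders, only the comments matter:

import scrapy

class FlowDemoSpider(scrapy.Spider):
    name = 'flow_demo'                        # Spider: the parsing logic lives here
    start_urls = ['https://example.com']      # the Engine pulls these and hands them to the Scheduler

    def parse(self, response):
        # the Downloader fetched the page; the Engine routed the response back here
        item = {'url': response.url}
        yield item                            # the Engine forwards the item to the Pipeline
        # a new request travels Engine -> Scheduler -> Downloader and comes back to parse()
        yield scrapy.Request(url='https://example.com/next', callback=self.parse)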
VII. Middleware
1. Intercepting requests (IP spoofing via proxies, UA spoofing)
from scrapy import signals
import random

class GetmovieDownloaderMiddleware(object):
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]
    PROXY_http = [
        '153.180.102.104:80',
        '195.208.131.189:56055',
    ]
    PROXY_https = [
        '120.83.49.90:9000',
        '95.189.112.214:35508',
    ]

    # intercepts every request that has not raised an exception
    def process_request(self, request, spider):
        request.headers["User-Agent"] = random.choice(self.user_agent_list)
        return None

    # intercepts every response
    def process_response(self, request, response, spider):
        return response

    # intercepts requests that raised an exception: retry them through a proxy
    def process_exception(self, request, exception, spider):
        # the proxy URL needs a scheme prefix, e.g. http://host:port
        if request.url.split(':')[0] == 'http':
            request.meta["proxy"] = 'http://' + random.choice(self.PROXY_http)
        else:
            request.meta["proxy"] = 'https://' + random.choice(self.PROXY_https)
        return request  # re-schedule the failed request with the new proxy

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
Enable the middleware in the settings file:
DOWNLOADER_MIDDLEWARES = {
    'getmovie.middlewares.GetmovieDownloaderMiddleware': 543,
}
2. Intercepting responses (dynamically loaded data)
In the spider file:
import scrapy
from selenium import webdriver

class DbanSpider(scrapy.Spider):
    name = 'dban'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://movie.douban.com/typerank?type_name=喜剧&type=24&interval_id=100:90&action=']

    # __init__ runs first, and only once
    def __init__(self):
        super().__init__()
        # 1. create the browser object here, because only one browser instance is needed
        self.bro = webdriver.Chrome(executable_path=r'C:\Users\Administrator\PycharmProjects\untitled3\pachong\day137\chromedriver.exe')

    def parse(self, response):
        # 4. grab the whole rendered page and persist it
        page = response.xpath('/html').extract_first()
        with open('./douban.html', 'w', encoding='utf-8') as f:
            f.write(page)

    # closed() runs once, after parse has finished and the middleware is done
    def closed(self, reason):
        # 5. shut down the browser
        self.bro.quit()
In the middleware file:
import time
from scrapy.http import HtmlResponse

class DoubanDownloaderMiddleware(object):
    def process_request(self, request, spider):
        return None

    def process_response(self, request, response, spider):
        # 2. use the browser held by the spider to fetch the dynamically loaded data
        bro = spider.bro
        bro.get(url=request.url)
        time.sleep(4)
        bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        time.sleep(3)
        bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        time.sleep(3)
        page_text = bro.page_source
        # 3. wrap the rendered page in a new HtmlResponse and return it
        return HtmlResponse(url=spider.bro.current_url, body=page_text, encoding='utf-8')

    def process_exception(self, request, exception, spider):
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
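As in part 1, this middleware only takes effect once it is enabled in settings.py. A minimal sketch, assuming the project is named douban (inferred from the class name above; the actual project name may differ):

# settings.py -- enable the response-intercepting downloader middleware
DOWNLOADER_MIDDLEWARES = {
    'douban.middlewares.DoubanDownloaderMiddleware': 543,
}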
Change the world, change yourself!