Scrapy
Getting Started with Scrapy
Scrapy workflow
1. Create a spider by subclassing scrapy.Spider or scrapy.CrawlSpider
2. List the pages to crawl in start_urls
3. Implement the parse callback that extracts data from the response
4. Define the data objects in items.py
5. Configure per-request headers and proxies in middlewares.py
6. Write the persistence code in pipelines.py (save to a database, file, FTP, ...)


# Scrapy docs: https://scrapy-chs.readthedocs.io/zh_CN/0.24/
# pip install scrapy
# pip install pywin32    # Windows only
# pip install twisted
# pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
# Create a Scrapy project
# scrapy startproject myproject
# Create a spider from the Spider template
# scrapy genspider baidu baidu.com
# Create a spider from the CrawlSpider template
# scrapy genspider -t crawl fanqienovel fanqienovel.com
# Run a spider
# scrapy crawl baidu
# To run or debug from a script, use: from scrapy.cmdline import execute
# execute('scrapy crawl baidu'.split())
# Export the scraped items with -o (.json, .csv, .xml)
# scrapy crawl baidu -o names.json

# Debug a request interactively
# scrapy shell www.baidu.com

from scrapy.cmdline import execute

def main():
    # note: .split() must be called, not passed as an unbound method
    execute('scrapy crawl baidu'.split())

if __name__ == '__main__':
    main()
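As an alternative to scrapy.cmdline.execute, a spider can be run from a plain script with CrawlerProcess, which keeps everything inside one Python process. A minimal sketch, assuming it is executed from the project root so that get_project_settings() can find scrapy.cfg:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def main():
    # Load the project's settings.py and run the spider named "baidu"
    process = CrawlerProcess(get_project_settings())
    process.crawl("baidu")   # same spider name as `scrapy crawl baidu`
    process.start()          # blocks until the crawl finishes

if __name__ == '__main__':
    main()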
Scrapy Configuration (settings.py)
Common Scrapy settings
# Enable/disable logging
LOG_ENABLED = False

# Project name; used to build the default User-Agent and for logging
BOT_NAME = "myproject"

# Where the project's spiders live
SPIDER_MODULES = ["myproject.spiders"]

# Where newly generated spiders are placed
NEWSPIDER_MODULE = "myproject.spiders"

# Default User-Agent
USER_AGENT = r'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.26 Safari/537.11'

# Obey robots.txt rules
# Whether to respect robots.txt when the site forbids crawling
ROBOTSTXT_OBEY = False

# Maximum number of concurrent requests performed by the downloader
#CONCURRENT_REQUESTS = 16

# Download delay (seconds): how long the downloader waits before fetching the next page from the same website
DOWNLOAD_DELAY = 3

# Maximum concurrent requests per domain
#CONCURRENT_REQUESTS_PER_DOMAIN = 16

# Maximum concurrent requests per IP
#CONCURRENT_REQUESTS_PER_IP = 16

# Enable/disable cookies
#COOKIES_ENABLED = False

# Default request headers
DEFAULT_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en",
}

# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# Enable downloader middlewares: set request headers, proxies or other per-request
# options before pages/files/images are downloaded; see the notes in middlewares.py
DOWNLOADER_MIDDLEWARES = {
    "myproject.middlewares.MyprojectDownloaderMiddleware": 1,
}

# Image/file pipeline configuration: https://docs.scrapy.org/en/latest/topics/media-pipeline.html
# Enable item pipelines; lower numbers run first
ITEM_PIPELINES = {
    "myproject.pipelines.MyprojectPipeline": 300,
    "myproject.pipelines.MyImagesPipeline": 300,
    # "scrapy.pipelines.images.ImagesPipeline": 1,  # built-in image pipeline; subclass it to customize
}

# Where downloaded images are stored
IMAGES_STORE = "./img"
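Project-wide settings can also be overridden for a single spider through the custom_settings class attribute, which avoids touching settings.py when only one spider needs different values. A minimal sketch; the spider name, URL and numbers are illustrative:

import scrapy

class SlowSpider(scrapy.Spider):
    # Illustrative spider that slows itself down without changing settings.py
    name = "slow"
    start_urls = ["https://example.com"]
    custom_settings = {
        "DOWNLOAD_DELAY": 5,       # applies to this spider only
        "CONCURRENT_REQUESTS": 4,
    }

    def parse(self, response):
        pass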
Scrapy Items
import scrapy


class ImgItem(scrapy.Item):
    # define the fields for your item here like:
    # If an item declares image_urls, the images pipeline downloads them
    # automatically (https://docs.scrapy.org/en/latest/topics/media-pipeline.html)
    image_urls = scrapy.Field()
    images = scrapy.Field()
    filename = scrapy.Field()


class BookChapter(scrapy.Item):
    # define the fields for your item here like:
    book_name = scrapy.Field()
    chapter_name = scrapy.Field()
    book_content = scrapy.Field()


class NameItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    age = scrapy.Field()


class MyprojectItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
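An Item behaves like a dict with a fixed set of fields: a spider fills the fields and yields the instance to the pipelines. A minimal sketch using the NameItem declared above; it assumes the spider file sits in myproject/spiders/ so the relative import resolves, and the URL is illustrative:

import scrapy
from ..items import NameItem   # assumes this spider lives in myproject/spiders/


class NamesSpider(scrapy.Spider):
    name = "names"
    start_urls = ["https://example.com"]   # illustrative URL

    def parse(self, response):
        # Assigning to a field that was not declared in NameItem raises
        # KeyError, which catches typos earlier than a plain dict would.
        item = NameItem()
        item['name'] = 'Zhang San'
        item['age'] = 18
        yield item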
Scrapy Pipelines
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline
from scrapy import Request
from urllib.parse import urlparse
from os.path import basename, splitext


class MyImagesPipeline(ImagesPipeline):

    # Decide the file name for each downloaded image
    def file_path(self, request, response=None, info=None, *, item=None):
        path = urlparse(request.url)
        filename = basename(path.path)
        # keep the original extension, but use the name passed through request.meta
        filename = request.meta['filename'] + splitext(filename)[1]
        return filename

    def get_media_requests(self, item, info):
        # Pair each image URL with its target file name via request.meta
        for url, name in zip(item['image_urls'], item['filename']):
            yield Request(url, meta={'filename': name})
        # Alternatively, read the URLs through ItemAdapter:
        # urls = ItemAdapter(item).get(self.images_urls_field, [])
        # return [Request(u, meta={'filename': n}) for u, n in zip(urls, item['filename'])]


class MyprojectPipeline:
    # Called when the spider starts
    def open_spider(self, spider):
        if spider.name == "8z5":
            self.files_8z5 = {}

    # Called when the spider closes
    def close_spider(self, spider):
        if hasattr(self, "files_8z5") and self.files_8z5:
            for keyname in self.files_8z5:
                self.files_8z5[keyname].close()

    # Called for every item received from parse()
    def process_item(self, item, spider):
        if spider.name == "8z5":
            filename = item['book_name'] + '.txt'
            if filename not in self.files_8z5:
                self.files_8z5[filename] = open(filename, 'a', encoding='utf-8')
            self.files_8z5[filename].write(item['chapter_name'])
            self.files_8z5[filename].write(item['book_content'])
        return item
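For structured output, a pipeline can serialize each item as one JSON object per line instead of writing raw text; this follows the JSON-lines pattern from the Scrapy item pipeline docs. A minimal sketch; the output file name items.jl is arbitrary, and the class still has to be registered in ITEM_PIPELINES:

import json

from itemadapter import ItemAdapter


class JsonLinesPipeline:
    # Write every item as a single JSON line (illustrative pipeline)

    def open_spider(self, spider):
        self.file = open('items.jl', 'w', encoding='utf-8')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        # ItemAdapter works for dicts, Items and dataclasses alike
        line = json.dumps(ItemAdapter(item).asdict(), ensure_ascii=False) + '\n'
        self.file.write(line)
        return item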
Scrapy middlewares
from scrapy import signals
from fake_useragent import UserAgent

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class MyprojectSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


# Sets a random User-Agent and a proxy before each download
class MyprojectDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Set a random User-Agent
        request.headers.setdefault(b'User-Agent', UserAgent().random)
        # Set an HTTP proxy for this request
        request.meta['proxy'] = 'http://47.92.248.86:10000'
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
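If a single hard-coded proxy is not enough, the same process_request hook can rotate through a pool. A minimal sketch; the proxy addresses are placeholders, and the class must be registered in DOWNLOADER_MIDDLEWARES just like the middleware above:

import random


class RotatingProxyMiddleware:
    # Illustrative downloader middleware: pick a proxy at random per request
    PROXIES = [
        'http://10.0.0.1:8000',   # placeholder addresses
        'http://10.0.0.2:8000',
    ]

    def process_request(self, request, spider):
        request.meta['proxy'] = random.choice(self.PROXIES)
        return None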
Scrapy scrapy.Spider
import scrapy
# from ..items import NameItem


class BaiduSpider(scrapy.Spider):
    name = "baidu"
    allowed_domains = ["baidu.com"]         # domains the spider is allowed to crawl
    start_urls = ["https://www.baidu.com"]  # not needed if start_requests is overridden

    # def start_requests(self):
    #     for url in self.start_urls:
    #         yield scrapy.Request(url, dont_filter=True)

    def parse(self, response):
        # Regular expressions are applied on a selector:
        # response.xpath(...).re(...)
        # Select elements with CSS selectors:
        # response.css(...)
        # Select elements with XPath:
        # response.xpath(...)
        # First match as a Unicode string:
        # response.xpath(...).extract_first()
        # All matches as a list of Unicode strings:
        # response.xpath(...).extract()
        # Example: if response.url is www.baidu.com/abc.txt, then
        # response.urljoin('next_url.txt') returns www.baidu.com/next_url.txt
        print(response.text)

        # Yield the data for the pipelines to handle,
        # or export it with: scrapy crawl baidu -o names.json
        # return [{'name': '张三', 'age': 18}, {'name': 'lisi', 'age': 19}, {'name': '王五', 'age': 20}]

        data_list = [{'name': '张三', 'age': 18}, {'name': 'lisi', 'age': 19}, {'name': '王五', 'age': 20}]
        for rec in data_list:
            # item = NameItem()
            item = {}   # create a new item per record; reusing one dict would yield the same object repeatedly
            item['name'] = rec['name']
            item['age'] = rec['age']
            yield item

        # Follow the next page: callback names the method that parses it
        # yield scrapy.Request(response.urljoin('next_url'), callback=self.parse)
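For pagination, the newer response.follow helper is convenient because it resolves relative URLs itself. A short sketch based on the standard Scrapy tutorial spider (quotes.toscrape.com), not on the baidu example above:

import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/page/1/"]

    def parse(self, response):
        for quote in response.css("div.quote"):
            yield {
                "text": quote.css("span.text::text").get(),
                "author": quote.css("small.author::text").get(),
            }

        # Follow the "next" link until there is none; response.follow
        # resolves the relative href and schedules the request
        next_page = response.css("li.next a::attr(href)").get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)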
Scrapy CrawlSpider
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from urllib.parse import urlparse
from os.path import basename
from ..items import BookChapter


class FanqienovelSpider(CrawlSpider):
    name = "8z5"
    allowed_domains = ["8z5.net"]
    start_urls = ["http://www.8z5.net/mfxs/15867/19733461.html"]
    # Rule
    # class Rule:
    #     def __init__(
    #         self,
    #         link_extractor=None,  # defines which links to extract
    #         callback=None,        # called for each response generated from the extracted links
    #         # Note: avoid using parse as the callback. CrawlSpider uses the parse method
    #         # to implement its own logic; overriding it breaks the spider.
    #         cb_kwargs=None,       # extra kwargs passed to the callback
    #         follow=None,          # whether links extracted from these responses should be followed in turn
    #         process_links=None,   # mainly used to filter the links returned by link_extractor
    #         process_request=None, # mainly used to filter the requests built from this rule
    #         errback=None,         # called when an error occurs
    #     ):
    # LinkExtractor:
    # class LxmlLinkExtractor:
    #     _csstranslator = HTMLTranslator()
    #
    #     def __init__(
    #         self,
    #         allow=(),            # only URLs matching these regexes are extracted; empty means match everything
    #         deny=(),             # URLs matching these regexes (or list of regexes) are never extracted
    #         allow_domains=(),    # only links on these domains are extracted
    #         deny_domains=(),     # links on these domains are never extracted
    #         restrict_xpaths=(),  # XPath expressions; combined with allow to restrict where links are taken from (select nodes, not attributes)
    #         tags=("a", "area"),  # tags to extract links from
    #         attrs=("href",),     # attributes to extract links from
    #         canonicalize=False,
    #         unique=True,         # deduplicate extracted links
    #         process_value=None,  # every extracted value is passed through this filter
    #         deny_extensions=None,
    #         restrict_css=(),     # CSS selectors
    #         strip=True,          # strip surrounding whitespace
    #         restrict_text=None,
    #     ):

    def process_value(value):
        # Plain helper (no self): referenced directly in `rules` below.
        # Drop links whose last path segment has no file extension.
        path = urlparse(value)
        filename = basename(path.path)
        if '.' not in filename:
            return None
        return value

    rules = (
        Rule(
            LinkExtractor(restrict_xpaths=r"//div[@class='bottem1']/a[3]", process_value=process_value),
            callback="parse_item",
            follow=True,
        ),
    )

    def parse_item(self, response):
        book_name = response.xpath(r"//div[@class='con_top']/a[2]/text()").extract_first()
        chapter_name = response.xpath(r"//div[@class='bookname']/h1[1]/text()").extract_first()
        book_content = response.xpath(r"//div[@id='content'][1]").extract_first()
        book_content = book_content.replace('<br><br>', '\n')
        item = BookChapter()
        item['book_name'] = book_name
        item['chapter_name'] = chapter_name
        item['book_content'] = book_content
        return item
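Besides process_value, a Rule can filter the whole link list after extraction with process_links. A minimal sketch; the domain, URL and XPath are placeholders:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class FilteredCrawlSpider(CrawlSpider):
    name = "filtered"
    allowed_domains = ["example.com"]          # placeholder domain
    start_urls = ["https://example.com/"]

    def keep_html_links(self, links):
        # process_links receives the list of Link objects extracted from a response
        return [link for link in links if link.url.endswith(".html")]

    rules = (
        Rule(
            LinkExtractor(restrict_xpaths="//div[@class='list']//a"),
            callback="parse_item",
            process_links="keep_html_links",   # a callable or the name of a spider method
            follow=True,
        ),
    )

    def parse_item(self, response):
        yield {"url": response.url, "title": response.xpath("//title/text()").get()}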
Scrapy: Downloading Images
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import ImgItem


class TupianSpider(CrawlSpider):
    name = "tupian"
    allowed_domains = ["desk.zol.com.cn"]
    start_urls = ["https://desk.zol.com.cn/bizhi/9109_111584_2.html"]

    rules = (
        Rule(
            LinkExtractor(restrict_xpaths=r"//div[@class='photo-set-list']//li/a[1]"),
            callback="parse_item",
            follow=True,
        ),
    )

    def parse_item(self, response):
        item = ImgItem()
        item['image_urls'] = [response.xpath(r"//img[@id='bigImg']/@src").extract_first()]
        item['filename'] = [response.xpath(r"//span[@class='current-num']/text()").extract_first()]
        return item


class Tupian1Spider(scrapy.Spider):
    name = "tupian1"
    allowed_domains = ["desk.zol.com.cn"]
    start_urls = ["https://desk.zol.com.cn/bizhi/9109_111584_2.html"]

    def parse_image(self, response):
        item = ImgItem()
        item['image_urls'] = [response.xpath(r"//img[@id='bigImg']/@src").extract_first()]
        item['filename'] = [response.xpath(r"//span[@class='current-num']/text()").extract_first()]
        return item

    def parse(self, response):
        img_urls = response.xpath(r"//div[@class='photo-set-list']//li/a[1]/@href").extract()
        for url in img_urls:
            yield scrapy.Request(response.urljoin(url), callback=self.parse_image)
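The image pipeline only runs when it is enabled in settings.py (see the configuration section above). Scrapy also offers a few optional image settings, such as thumbnails and size filters; a minimal sketch with illustrative values:

# settings.py additions for image downloads (values are illustrative)
ITEM_PIPELINES = {
    "myproject.pipelines.MyImagesPipeline": 300,
}
IMAGES_STORE = "./img"

# Optional: generate thumbnails next to the full-size images
IMAGES_THUMBS = {
    "small": (50, 50),
    "big": (270, 270),
}

# Optional: skip images smaller than these dimensions (pixels)
IMAGES_MIN_HEIGHT = 110
IMAGES_MIN_WIDTH = 110

# Optional: do not re-download images fetched within the last 90 days
IMAGES_EXPIRES = 90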