scrapy抓取校花网图片
一:基础版(抓取首页图片)
爬虫py文件代码:
# -*- coding: utf-8 -*-
import scrapy
import sys
import io
import re
from scrapy.selector import Selector
from scrapy.http import Request
from ..items import Day96XiaohuaItem

# Force UTF-8 stdout so Chinese image titles print correctly on Windows consoles.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')


class XiaohuaSpider(scrapy.Spider):
    """Scrape (title, image URL) pairs from the front page of xueshengmai.com.

    Yields one Day96XiaohuaItem per <img> tag found in the gallery grid;
    the item pipeline is responsible for downloading the images.
    """
    name = 'xiaohua'
    # FIX: allowed_domains must contain bare domain names, never a URL path.
    # The original 'www.xueshengmai.com/hua/' cannot match any request domain,
    # so OffsiteMiddleware would filter follow-up requests.
    allowed_domains = ['www.xueshengmai.com']
    start_urls = ['http://www.xueshengmai.com/hua/']

    def parse(self, response):
        # ------------ persist scraped data ------------
        # Each extracted string is the raw HTML of one <img ...> tag.
        img_tags = Selector(response=response).xpath(
            "//div[@class='item_t']/div[@class='img']/a/img").extract()
        for tag in img_tags:
            # The alt text becomes the file name; src is site-relative,
            # so prefix the scheme+host to build an absolute URL.
            title = re.findall("alt=(.*) src=", tag)[0].strip('"') + ".jpg"
            src = "http://www.xueshengmai.com%s" % re.findall("src=(.*)>", tag)[0].strip('"')
            print(title, src)
            item_obj = Day96XiaohuaItem(title=title, src=src)
            yield item_obj
items.py 代码:
import scrapy


class Day96XiaohuaItem(scrapy.Item):
    """Container for a single scraped image."""

    # File name the pipeline saves the image under (e.g. "somename.jpg").
    title = scrapy.Field()
    # Absolute URL the image is downloaded from.
    src = scrapy.Field()
pipelines代码:
import os

import requests


class Day96XiaohuaPipeline(object):
    """Download each scraped image and save it under the local imgs/ folder."""

    def process_item(self, item, spider):
        # FIX: create the target directory on demand; the original crashed
        # with FileNotFoundError when imgs/ did not already exist.
        os.makedirs("imgs", exist_ok=True)
        file_path = "imgs/%s" % item["title"]
        file_src = item["src"]
        response = requests.get(file_src)
        # FIX: use a context manager so the file handle is closed even if
        # the download or the write raises (the original leaked it on error).
        with open(file_path, "wb") as f:
            f.write(response.content)
        # FIX: a Scrapy pipeline must return the item (or raise DropItem)
        # so that later pipelines and the engine keep receiving it.
        return item
二:分页抓取校花网图片
下面代码和上面基本一致,这里只列出有区别的代码块:
# -*- coding: utf-8 -*-
import scrapy
import sys
import io
import re
from scrapy.selector import Selector
from scrapy.http import Request
from ..items import Day96XiaohuaItem

# Force UTF-8 stdout so Chinese log text prints correctly on Windows consoles.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')


class XiaohuaSpider(scrapy.Spider):
    """Scrape (title, image URL) pairs from xueshengmai.com, following pagination.

    Yields one Day96XiaohuaItem per image on the current page, then schedules
    every numbered pagination link for the same callback.
    """
    name = 'xiaohua'
    # FIX: allowed_domains must contain bare domain names, never a URL path.
    # With 'www.xueshengmai.com/hua/' the OffsiteMiddleware would filter the
    # paging requests to /list-1-N.html.
    allowed_domains = ['www.xueshengmai.com']
    start_urls = ['http://www.xueshengmai.com/hua/']

    def parse(self, response):
        # ------------ persist scraped data ------------
        img_tags = Selector(response=response).xpath(
            "//div[@class='item_t']/div[@class='img']/a/img").extract()
        for tag in img_tags:
            # alt text -> file name; site-relative src -> absolute URL.
            title = re.findall("alt=(.*) src=", tag)[0].strip('"') + ".jpg"
            src = "http://www.xueshengmai.com%s" % re.findall("src=(.*)>", tag)[0].strip('"')
            print("正在努力抓取校花网图片...", title, src)
            item_obj = Day96XiaohuaItem(title=title, src=src)
            yield item_obj

        # ------------ follow pagination links ------------
        page_links = Selector(response=response).xpath(
            "//div[@class='page_num']/a").extract()
        for link in page_links:
            label = re.findall(">(.*)<", link)[0]
            # Skip the relative navigation anchors; only numbered pages
            # identify new URLs to crawl.
            if label not in ["首页", "上一页", "下一页", "尾页"]:
                url = "http://www.xueshengmai.com/list-1-%s.html" % label
                # Hand the new URL back to the scheduler. dont_filter=True
                # disables the dupe filter, so DEPTH_LIMIT in settings is
                # what actually stops the crawl (see note below the listing).
                yield Request(url=url, callback=self.parse, dont_filter=True)
同时应该在settings中加上一句:
DEPTH_LIMIT = 1
否则程序将会一直深入下载,直到网站图片资源穷尽...