scrapy爬个小网站

本文使用scrapy对某一个网站静态数据进行了抓取

# -*- coding: utf-8 -*-
"""Scrapy spider (Python 2) that crawls a static picture site.

It enumerates ~400 listing pages, follows each detail link found on them,
and downloads every image on the detail page into d:/dd/<page title>/.
"""
import scrapy
from scrapy.http import request  # NOTE(review): unused and mis-cased (class is Request); kept to avoid dropping an import
import requests
import os
import sys

# Python 2-only hack: allow non-ASCII (Chinese) titles to be used in file
# paths without UnicodeDecodeError. Has no Python 3 equivalent.
reload(sys)
sys.setdefaultencoding('utf-8')


class spider(scrapy.Spider):
    name = 'picSpider'
    allowed_domains = []

    # Build the listing-page URLs: page 1 has no suffix, pages 2..399
    # use the p_<n>.html pattern.
    urls = []
    for i in range(1, 400):
        if i == 1:
            urls.append('http://www.***.com/pic/12/')
        else:
            urls.append('http://www.***.com/pic/12/p_' + str(i) + '.html')
    start_urls = urls

    def parse(self, response):
        """Extract detail-page links from a listing page and follow each one.

        Yields a scrapy.Request per link, handled by parse_page (2nd level).
        """
        link = response.xpath("//div[@class='box list channel']/ul/li/a/@href").extract()
        for l in link:
            url = 'http://www.***.com' + l
            # FIX: original bound the request to a local named `re`,
            # shadowing the stdlib regex module — yield it directly instead.
            yield scrapy.Request(url, callback=self.parse_page)

    def parse_page(self, response):
        """Download every image on a detail page into d:/dd/<title>/."""
        title = response.xpath("//h1/text()").extract()
        # FIX: guard against pages without an <h1>; the original crashed
        # with IndexError on title[0].
        if not title:
            return
        path = os.path.join('d:/dd', title[0])
        if not os.path.exists(path):
            # FIX: makedirs also creates d:/dd itself if missing
            # (os.mkdir would raise).
            os.makedirs(path)
        for src in response.xpath("//div[@class='post']/img/@src").extract():
            name = os.path.join(path, src.split('/')[-1])
            pic = requests.get(src, timeout=10)
            # FIX: context manager guarantees the file handle is closed
            # even if write() raises; the original leaked on error.
            with open(name, 'wb') as f:
                f.write(pic.content)

  

posted @ 2017-05-28 17:16  不起泡沫的洗衣粉  阅读(201)  评论(0)  编辑  收藏  举报