Scrapy crawlers
a. Configuration file
# settings.py
DEPTH_LIMIT = 1         # how many levels of "recursion" to follow when crawling links
ROBOTSTXT_OBEY = False  # robots.txt declares which URLs the site allows crawlers to visit; False means do not obey it
b. Selectors
.//                  # anywhere among the node's descendants
./                   # direct children only
./div                # div tags among the direct children
./div[@id='i1']      # div tags among the direct children with id='i1'
obj.extract()        # convert every object in the list to a string  => returns a list
obj.extract_first()  # convert to a string and return only the first element of the list
//div/text()         # get the text of a tag
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from scrapy.selector import Selector, HtmlXPathSelector
from scrapy.http import HtmlResponse

html = """<!DOCTYPE html>
<html>
    <head lang="en">
        <meta charset="UTF-8">
        <title></title>
    </head>
    <body>
        <ul>
            <li class="item-"><a id='i1' href="link.html">first item</a></li>
            <li class="item-0"><a id='i2' href="llink.html">first item</a></li>
            <li class="item-1"><a href="llink2.html">second item<span>vv</span></a></li>
        </ul>
        <div><a href="llink2.html">second item</a></div>
    </body>
</html>
"""
response = HtmlResponse(url='http://example.com', body=html, encoding='utf-8')

# hxs = HtmlXPathSelector(response)
# print(hxs)
# hxs = Selector(response=response).xpath('//a')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[2]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@id]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@id="i1"]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@href="link.html"][@id="i1"]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[contains(@href, "link")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[starts-with(@href, "link")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/text()').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/@href').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('/html/body/ul/li/a/@href').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('//body/ul/li/a/@href').extract_first()
# print(hxs)

# ul_list = Selector(response=response).xpath('//body/ul/li')
# for item in ul_list:
#     v = item.xpath('./a/span')
#     # or
#     # v = item.xpath('a/span')
#     # or
#     # v = item.xpath('*/a/span')
#     print(v)
c. Structured processing (items and pipelines)
# settings.py
ITEM_PIPELINES = {
    'day96.pipelines.Day96Pipeline': 300,
}
DB = "....."
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exceptions import DropItem


class Day96Pipeline(object):

    def __init__(self, conn_str):
        self.conn_str = conn_str

    @classmethod
    def from_crawler(cls, crawler):
        """
        Called at initialization time to create the pipeline object
        :param crawler:
        :return:
        """
        conn_str = crawler.settings.get('DB')
        return cls(conn_str)

    def open_spider(self, spider):
        """
        Called when the spider starts
        :param spider:
        :return:
        """
        self.conn = open(self.conn_str, 'a')

    def close_spider(self, spider):
        """
        Called when the spider is closed
        :param spider:
        :return:
        """
        self.conn.close()

    def process_item(self, item, spider):
        """
        Called every time an item needs to be persisted
        :param item:
        :param spider:
        :return:
        """
        # if spider.name == 'chouti':
        tpl = "%s\n%s\n\n" % (item['title'], item['href'])
        self.conn.write(tpl)
        # hand the item to the next pipeline
        return item
        # to drop the item so it is not passed on:
        # raise DropItem()
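The pipeline above reads item['title'] and item['href'], but the matching items.py is not shown. A minimal sketch of what it could look like, assuming the class is named Day96Item (the class name is illustrative; only the two field names are implied by the pipeline):

import scrapy

# items.py -- field names inferred from the pipeline above; the class name is an assumption
class Day96Item(scrapy.Item):
    title = scrapy.Field()
    href = scrapy.Field()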
d. Common commands
scrapy startproject sp1
cd sp1
scrapy genspider baidu baidu.com   # create a spider
scrapy crawl baidu
scrapy crawl baidu --nolog
e. Project layout
sp1
- scrapy.cfg          # initial configuration file
- sp1
    - spiders         # spiders directory
    - items.py        # item definitions (structuring)
    - pipelines.py    # persistence
    - middlewares.py  # middleware
    - settings.py     # settings
Examples
# -*- coding: utf-8 -*-
import scrapy
import sys
import io
from scrapy.selector import Selector, HtmlXPathSelector


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']

    def parse(self, response):
        hxs = Selector(response=response).xpath('//div[@id="content-list"]/div[@class="item"]')
        for obj in hxs:
            a = obj.xpath('.//a[@class="show-content color-chag"]/text()').extract_first()
            print(a.strip())
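The spider above only prints the titles. To hand the data to the pipeline from section c it has to yield items instead; a sketch assuming the hypothetical Day96Item defined earlier (the XPath expressions are the same ones used above, the module path day96.items is an assumption):

# -*- coding: utf-8 -*-
# Sketch only: same page, but yielding items so ITEM_PIPELINES receives them.
import scrapy
from scrapy.selector import Selector
from day96.items import Day96Item   # assumed module path and item class

class ChoutiItemSpider(scrapy.Spider):
    name = 'chouti_items'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']

    def parse(self, response):
        hxs = Selector(response=response).xpath('//div[@id="content-list"]/div[@class="item"]')
        for obj in hxs:
            a = obj.xpath('.//a[@class="show-content color-chag"]')
            title = a.xpath('./text()').extract_first()
            href = a.xpath('./@href').extract_first()
            if title:
                # each yielded item is passed to Day96Pipeline.process_item
                yield Day96Item(title=title.strip(), href=href)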
# -*- coding: utf-8 -*-
import scrapy
import sys
import io
from scrapy.selector import Selector, HtmlXPathSelector


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']
    visited_urls = set()

    def parse(self, response):
        # get the URLs of all page-number links on the current page
        hxs = Selector(response=response).xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for url in hxs:
            md5_url = self.md5(url)
            if md5_url in self.visited_urls:
                print('already seen', url)
            else:
                self.visited_urls.add(md5_url)
                print(url)

    def md5(self, url):
        import hashlib
        obj = hashlib.md5()
        obj.update(bytes(url, encoding='utf-8'))
        return obj.hexdigest()
# -*- coding: utf-8 -*-
import scrapy
import sys
import io
from scrapy.http import Request
from scrapy.selector import Selector, HtmlXPathSelector


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']
    visited_urls = set()

    def parse(self, response):
        # get the URLs of all page-number links on the current page
        hxs = Selector(response=response).xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for url in hxs:
            md5_url = self.md5(url)
            if md5_url in self.visited_urls:
                pass
            else:
                print(url)
                self.visited_urls.add(md5_url)
                url = "http://dig.chouti.com%s" % url
                # hand the new URL to the scheduler
                yield Request(url=url, callback=self.parse)

    def md5(self, url):
        import hashlib
        obj = hashlib.md5()
        obj.update(bytes(url, encoding='utf-8'))
        return obj.hexdigest()
a. Avoiding duplicate URLs
# settings.py
DUPEFILTER_CLASS = "day96.duplication.RepeatFilter"
class RepeatFilter(object):

    def __init__(self):
        self.visited_set = set()

    @classmethod
    def from_settings(cls, settings):
        print('...')
        return cls()

    def request_seen(self, request):
        if request.url in self.visited_set:
            return True
        self.visited_set.add(request.url)
        return False

    def open(self):  # can return deferred
        print('open')
        pass

    def close(self, reason):  # can return a deferred
        print('close')
        pass

    def log(self, request, spider):  # log that a request has been filtered
        # print('log....')
        pass
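A quick standalone check of how request_seen behaves (illustrative only; during a real crawl the scheduler calls it for every request once DUPEFILTER_CLASS points at this class):

from scrapy.http import Request

f = RepeatFilter()
r = Request(url='http://dig.chouti.com/')
print(f.request_seen(r))   # False -- first time this URL is seen, the request is scheduled
print(f.request_seen(r))   # True  -- duplicate, the scheduler drops it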
# -*- coding: utf-8 -*-
import scrapy
import sys
import io
from scrapy.http import Request
from scrapy.selector import Selector, HtmlXPathSelector


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']

    from scrapy.dupefilter import RFPDupeFilter  # the built-in dupe filter, for reference

    def parse(self, response):
        print(response.url)
        # get the URLs of all page-number links on the current page
        hxs = Selector(response=response).xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for url in hxs:
            url = "http://dig.chouti.com%s" % url
            # hand the new URL to the scheduler
            yield Request(url=url, callback=self.parse)

    def md5(self, url):
        import hashlib
        obj = hashlib.md5()
        obj.update(bytes(url, encoding='utf-8'))
        return obj.hexdigest()
itcast (传智播客)
# Scrape the names of the teachers on itcast.cn
# scrapy startproject mySpider

# cat /Users/huaixiaozi/PycharmProjects/mySpider/mySpider/items.py
import scrapy

class ItcastItem(scrapy.Item):
    name = scrapy.Field()
    title = scrapy.Field()
    info = scrapy.Field()


# cat /Users/huaixiaozi/PycharmProjects/mySpider/mySpider/spiders/itcastspider.py
import scrapy
from mySpider.items import ItcastItem

# define a spider class
class ItcastSpider(scrapy.Spider):
    # spider name
    name = "itcast"
    # domains the spider is allowed to crawl
    allowed_domains = ["itcast.cn"]
    # start URL of the spider
    start_urls = ["http://www.itcast.cn/channel/teacher.shtml#"]

    def parse(self, response):
        # use scrapy's built-in xpath to get the root node of every teacher
        teacher_list = response.xpath('//div[@class="li_txt"]')
        teacherItem = []
        # iterate over the root nodes
        for each in teacher_list:
            item = ItcastItem()
            name = each.xpath('./h3/text()').extract()
            title = each.xpath('./h4/text()').extract()
            info = each.xpath('./p/text()').extract()
            print("--------------", type(name))
            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]
            teacherItem.append(item)
        return teacherItem


# save to a json file
scrapy crawl itcast -o itcast.json
# save to a csv file
scrapy crawl itcast -o itcast.csv