<Scrapy spider> Crawling quotes.toscrape.com
1. Create the Scrapy project
Enter in a DOS (command prompt) window:
scrapy startproject quote
cd quote
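A minimal sketch of the skeleton that startproject generates (the exact file list can differ slightly between Scrapy versions):

quote/
    scrapy.cfg              # deploy / project configuration
    quote/
        __init__.py
        items.py            # item definitions (step 2)
        middlewares.py
        pipelines.py        # item pipelines (step 5)
        settings.py         # project settings (step 6)
        spiders/
            __init__.py     # spiders created by genspider live here (step 3)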
2. Edit items.py (this is essentially the template: the fields to scrape are defined here)
import scrapy


class QuoteItem(scrapy.Item):
    # define the fields for your item here like:
    text = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()
3. Create the spider file
Enter in a DOS (command prompt) window:
scrapy genspider myspider quotes.toscrape.com
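genspider creates quote/spiders/myspider.py with a stub roughly like the one below; step 4 replaces the empty parse() with the real extraction logic:

# -*- coding: utf-8 -*-
import scrapy


class MyspiderSpider(scrapy.Spider):
    name = 'myspider'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        pass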
4. Edit myspider.py (receive the responses and extract the data)
# -*- coding: utf-8 -*-
import scrapy

from quote.items import QuoteItem


class MyspiderSpider(scrapy.Spider):
    name = 'myspider'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        for each in response.xpath('//div[@class="quote"]'):
            item = QuoteItem()
            item['text'] = each.xpath('./span/text()').extract_first()
            item['author'] = each.xpath('.//small/text()').extract_first()
            # a list cannot be stored in MySQL directly, so join the tags into a string
            tags = each.xpath('.//a[@class="tag"]/text()').extract()
            item['tags'] = '/'.join(tags)
            yield item

        # follow the "Next" link; it is absent on the last page
        next_page = response.xpath('//li[@class="next"]/a/@href').extract_first()
        if next_page:
            url = response.urljoin(next_page)
            yield scrapy.Request(url=url, callback=self.parse)
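Before running the spider, the XPath expressions can be checked interactively in the Scrapy shell (the sample results in the comments are illustrative):

scrapy shell "http://quotes.toscrape.com/"
>>> quote = response.xpath('//div[@class="quote"]')[0]
>>> quote.xpath('./span/text()').extract_first()                     # the quote text
>>> quote.xpath('.//small/text()').extract_first()                   # the author name
>>> quote.xpath('.//a[@class="tag"]/text()').extract()               # list of tags
>>> response.xpath('//li[@class="next"]/a/@href').extract_first()    # e.g. '/page/2/'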
5. Edit pipelines.py (store the data)
Storing to MySQL
import pymysql.cursors


class QuotePipeline(object):
    def __init__(self):
        self.connect = pymysql.connect(
            host='localhost',
            user='root',
            password='',
            database='quotes',
            charset='utf8',
        )
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        item = dict(item)
        sql = 'insert into quote(text, author, tags) values(%s, %s, %s)'
        self.cursor.execute(sql, (item['text'], item['author'], item['tags']))
        self.connect.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.connect.close()
Improved version:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql.cursors


class QuotePipeline(object):
    def __init__(self):
        self.connect = pymysql.connect(
            host='localhost',
            user='root',
            password='',
            database='quotes',
            charset='utf8',
        )
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        item = dict(item)
        table = 'quote'
        # build the INSERT statement from whatever fields the item carries
        keys = ','.join(item.keys())
        values = ','.join(['%s'] * len(item))
        sql = 'insert into {table}({keys}) values({values})'.format(
            table=table, keys=keys, values=values)
        try:
            self.cursor.execute(sql, tuple(item.values()))
            self.connect.commit()
            print("Successful!")
        except pymysql.MySQLError:
            print("Failed!")
            self.connect.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.connect.close()
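Both versions above assume that the quotes database and the quote table already exist. A one-off setup sketch using pymysql (the column types and lengths are my own assumptions, adjust as needed):

import pymysql

# connect without selecting a database so the database itself can be created first
connect = pymysql.connect(host='localhost', user='root', password='', charset='utf8')
cursor = connect.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS quotes DEFAULT CHARACTER SET utf8")
cursor.execute("""
    CREATE TABLE IF NOT EXISTS quotes.quote (
        id INT AUTO_INCREMENT PRIMARY KEY,
        text VARCHAR(1000),
        author VARCHAR(100),
        tags VARCHAR(255)
    )
""")
connect.commit()
cursor.close()
connect.close()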
Storing to MongoDB
1. Add two settings in settings.py
MONGO_URI = 'localhost'
MONGO_DB = 'study'

# enable the MongoDB pipeline
ITEM_PIPELINES = {
    # 'quote.pipelines.QuotePipeline': 300,
    'quote.pipelines.MongoPipeline': 300,
}
2. pipelines.py
import pymongo


class MongoPipeline(object):
    # collection (table) name
    collection = 'student'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # pull the connection settings defined in settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB'),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # insert the item into MongoDB (insert_one replaces the deprecated insert)
        self.db[self.collection].insert_one(dict(item))
        return item
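After a crawl, a quick check with pymongo confirms that the documents were written (uses the MONGO_URI/MONGO_DB values from the settings above; count_documents needs pymongo 3.7+):

import pymongo

client = pymongo.MongoClient('localhost')
db = client['study']
print(db['student'].count_documents({}))   # total number of stored quotes
print(db['student'].find_one())            # inspect a single document
client.close()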
6. Edit settings.py (headers, pipelines, and other settings)
robots.txt protocol
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
headers
DEFAULT_REQUEST_HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
}
pipelines
ITEM_PIPELINES = {
    'quote.pipelines.QuotePipeline': 300,
}
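Both pipelines can also be enabled together; items flow through them in ascending order of the number, so with the values below the item reaches MySQL first (400 is just an example priority):

ITEM_PIPELINES = {
    'quote.pipelines.QuotePipeline': 300,   # MySQL, runs first (lower number = higher priority)
    'quote.pipelines.MongoPipeline': 400,   # MongoDB, runs second
}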
7. Run the spider
Enter in a DOS (command prompt) window:
scrapy crawl myspider
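For a quick sanity check without any pipeline, Scrapy's built-in feed export can also write the items straight to a file:

scrapy crawl myspider -o quotes.json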
Result of the run