Basic usage of the Scrapy module
In the spider file
1. Store the parsed page data in an item object
2. Use the yield keyword to submit the item to the pipeline for processing
import scrapy
from scrapy_qiu123.items import ScrapyQiu123Item


class F1Spider(scrapy.Spider):
    name = 'f1'
    # allowed_domains = ['www.qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        div_list = response.xpath('//div[@id="content-left"]/div')
        for i in div_list:
            # xpath returns Selector objects; use extract()/extract_first() to get the text
            author = i.xpath('./div/a[2]/h2/text()').extract()[0]
            content = i.xpath('.//div[@class="content"]/span/text()').extract_first()
            # 1. Store the parsed values (author and content) in an item
            item = ScrapyQiu123Item()
            item['author'] = author
            item['content'] = content
            # 2. Submit the item to the pipeline
            yield item
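As a side note, here is a minimal sketch of the difference between extract() and extract_first(), built on a hand-written HTML snippet (the markup below is made up, not taken from the target site):

from scrapy.selector import Selector

sel = Selector(text='<div class="content"><span>hello</span></div>')
spans = sel.xpath('//div[@class="content"]/span/text()')
print(spans.extract())        # ['hello']  -- list of every matched string
print(spans.extract_first())  # 'hello'    -- first match, or None if nothing matched

extract_first() is the safer choice when a node may be missing, because extract()[0] raises IndexError on an empty result.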
In items.py
import scrapy


class ScrapyQiu123Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    author = scrapy.Field()
    content = scrapy.Field()
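An Item works like a dict, but only the declared fields can be assigned; a quick sketch with placeholder values:

item = ScrapyQiu123Item()
item['author'] = 'someone'      # OK: declared field
item['content'] = 'some text'   # OK: declared field
print(dict(item))               # {'author': 'someone', 'content': 'some text'}
# item['title'] = 'x'           # would raise KeyError: 'title' is not a declared field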
3. Write code in the pipeline file to persist the data
Storing to a local file in pipelines.py
class ScrapyQiu123Pipeline(object):
    fp = None

    def open_spider(self, spider):
        # Called exactly once, when the spider starts
        self.fp = open('./qiubai.text', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Receives each item submitted by the spider and persists its page data.
        # Called once for every item the spider yields.
        author = item['author']
        content = item['content']
        # Persist the data to the file
        self.fp.write(author + ":" + content + "\n\n\n")
        return item

    def close_spider(self, spider):
        # Called exactly once, when the spider closes
        self.fp.close()
Storing to MySQL in pipelines.py
import pymysql


class ScrapyByMysql(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        # Connect to the database
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root',
                                    password='', db='scrapy_data')

    def process_item(self, item, spider):
        # Write the item to the database
        author = item['author']
        content = item['content']
        # Parameterized query: pymysql handles quoting/escaping of the values
        sql = 'insert into Scrapy_data values (%s, %s)'
        self.cursor = self.conn.cursor()
        try:
            self.cursor.execute(sql, (author, content))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        # Called exactly once, when the spider closes
        self.cursor.close()
        self.conn.close()
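The MySQL pipeline assumes that the scrapy_data database already contains a two-column table named Scrapy_data. A one-off setup sketch (the column names and types are assumptions chosen to match the INSERT above):

import pymysql

conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='')
cursor = conn.cursor()
cursor.execute('CREATE DATABASE IF NOT EXISTS scrapy_data')
# Column names/types are guesses matching the INSERT in process_item
cursor.execute(
    'CREATE TABLE IF NOT EXISTS scrapy_data.Scrapy_data '
    '(author VARCHAR(255), content TEXT)'
)
cursor.close()
conn.close()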
Then enable both pipelines in settings.py:
ITEM_PIPELINES = {
'scrapy_qiu123.pipelines.ScrapyQiu123Pipeline': 300,
'scrapy_qiu123.pipelines.ScrapyByMysql': 400,
}
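The numbers are priorities: pipelines run in ascending order (values are conventionally in the 0-1000 range), so the file pipeline (300) runs before the MySQL pipeline (400). Because process_item returns the item, each pipeline passes it on to the next one.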
4. Run the spider from the terminal
scrapy crawl f1 --nolog
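--nolog suppresses Scrapy's console log. While debugging it is usually easier to drop the flag, or to set LOG_LEVEL = 'ERROR' in settings.py so that only errors are printed.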