1. Create the data template
# items.py
import scrapy

# Item template class: declare one Field for every value the spider extracts
class ChoutiItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    photo_url = scrapy.Field()
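Before wiring up the pipelines, it can help to see how the template behaves. The snippet below is a minimal, hypothetical quick check (the file name and example values are assumptions, not data from the site): a scrapy.Item is assigned like a dict, but only the declared fields are accepted.

# quick_check.py (hypothetical) -- sketch of how ChoutiItem is used
from scrapy_0805.items import ChoutiItem

item = ChoutiItem()
item['title'] = 'example title'        # example values for illustration only
item['url'] = 'http://dig.chouti.com/'
item['photo_url'] = ''

print(dict(item))          # an Item converts cleanly to a plain dict
# item['author'] = 'x'     # would raise KeyError: 'author' is not a declared Field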
2. Define the pipeline classes
# pipelines.py
import pymysql


class ChoutiFilePipeline(object):
    # Called once when the spider opens, before any items arrive
    def open_spider(self, spider):
        print('start processing items')
        # Open the output file once and keep the file object
        self.file = open('chouti.txt', 'wt', encoding='utf-8')

    # Main processing logic, called once per item until crawling finishes
    def process_item(self, item, spider):
        print('processed one item')
        self.file.write(item['title'] + '\n')
        self.file.write(item['url'] + '\n')
        self.file.write(item['photo_url'] + '\n')
        # Return the item so the next pipeline can keep working with it
        return item

    # Called once when the spider closes, after all items are processed
    def close_spider(self, spider):
        print('finished processing items')
        self.file.close()


class ChoutiMysqlPipeline(object):
    def open_spider(self, spider):
        # Get the database connection object
        self.conn = pymysql.connect(host='127.0.0.1', user='root', password="111",
                                    port=3306, database='scrapy_0805_mysql')

    def process_item(self, item, spider):
        # Insert one row per item
        cursor = self.conn.cursor(cursor=pymysql.cursors.DictCursor)
        sql = 'insert into article (title,url,photo_url) values (%s,%s,%s)'
        cursor.execute(sql, (item['title'], item['url'], item['photo_url']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.conn.close()
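ChoutiMysqlPipeline assumes the scrapy_0805_mysql database already contains an article table whose columns match the INSERT statement. The one-off helper below is a sketch under that assumption; the script name and the column types/lengths are my own guesses and should be adjusted to the real data.

# create_table.py (hypothetical helper), reusing the pipeline's connection settings
import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', password="111",
                       port=3306, database='scrapy_0805_mysql')
cursor = conn.cursor()
# Column types are assumptions; tune the VARCHAR lengths to your data.
cursor.execute("""
    CREATE TABLE IF NOT EXISTS article (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255),
        url VARCHAR(512),
        photo_url VARCHAR(512)
    )
""")
conn.commit()
conn.close()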
3. Register the pipeline classes
# settings.py
...
ITEM_PIPELINES = {
    # Register the pipeline classes and declare their priority:
    # the smaller the number, the higher the priority, and pipelines run in priority order
    'scrapy_0805.pipelines.ChoutiFilePipeline': 300,
    'scrapy_0805.pipelines.ChoutiMysqlPipeline': 305,
}
...
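Because each pipeline's process_item returns the item, the file pipeline (300) hands every item on to the MySQL pipeline (305). A pipeline can also stop that chain by raising DropItem. The class below is a hypothetical example of such a filter, not part of the project above; if you wanted it to run first, you would register it with a number smaller than 300.

# pipelines.py (hypothetical addition) -- drop items that have no title
from scrapy.exceptions import DropItem

class EmptyTitleFilterPipeline(object):
    def process_item(self, item, spider):
        # Items without a title are discarded; later pipelines never see them.
        if not item.get('title'):
            raise DropItem('missing title')
        return item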
4. Write the spider
# chouti.py
import scrapy
from scrapy.http.request import Request
from bs4 import BeautifulSoup
from scrapy_0805.items import ChoutiItem


# The spider class must inherit from Spider
class ChoutiSpider(scrapy.Spider):
    # Spider name
    name = 'chouti'
    # Allowed domains
    allowed_domains = ['dig.chouti.com']
    # Start URLs
    start_urls = ['http://dig.chouti.com/']

    # The spider's callback logic goes in the parse method.
    # If this spider were meant to keep crawling:
    # def parse(self, response):
    #     # parse the response
    #     # suppose a new url was extracted
    #     # url = ...
    #     # return a Request object to hand back to the engine
    #     return Request(url, dont_filter=True)

    # Persistence option 1
    # def parse(self, response):
    #     ll = []
    #     div_list = response.xpath('//div[contains(@class,"link-item")]')
    #     for div in div_list:
    #         title = div.css('.link-title::text').extract_first()
    #         url = div.css('.link-title::attr(href)').extract_first()
    #         photo_url = div.css('.image-scale::attr(src)').extract_first()
    #         # With option 1, parse must return a list of dicts
    #         ll.append({'title': title, 'url': url, 'photo_url': photo_url})
    #     return ll
    # ① Run: scrapy crawl chouti -o chouti.csv
    # ② The results are written to chouti.csv, which can be opened with Excel

    # Persistence option 2
    def parse(self, response):
        div_list = response.xpath('//div[contains(@class,"link-item")]')
        for div in div_list:
            # Create an item template object
            item = ChoutiItem()
            title = div.css('.link-title::text').extract_first()
            url = div.css('.link-title::attr(href)').extract_first()
            photo_url = div.css('.image-scale::attr(src)').extract_first()
            if not photo_url:
                photo_url = ''
            item['title'] = title
            item['url'] = url
            item['photo_url'] = photo_url
            # Must use yield here, not return, so every item reaches the pipelines
            yield item
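The spider is normally started from the project directory with scrapy crawl chouti. If you prefer to launch it from a plain Python script (for debugging in an IDE, for example), the sketch below uses Scrapy's CrawlerProcess; the file name run.py is an assumption, and it presumes the project's settings module is the scrapy_0805 package created above.

# run.py (hypothetical) -- start the spider without the command line
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy_0805.spiders.chouti import ChoutiSpider

process = CrawlerProcess(get_project_settings())
process.crawl(ChoutiSpider)
process.start()  # blocks until the crawl is finished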