The Scrapy framework

A few common Scrapy commands:
  scrapy startproject xxxx          # create a new project
  scrapy genspider xxx www.ooo.com  # generate a spider named xxx for that domain
  scrapy crawl xxx                  # run the spider named xxx
Pipeline-based persistent storage:
  1. Parse the data.
  2. Define the relevant fields in the item class.
  3. Instantiate an item object inside the parse method.
  4. Store the parsed data in the item object.
  5. Submit the item object to the pipeline with yield item.
  6. Receive the data in process_item and persist it there.
  7. Enable the pipeline in the settings file (steps 2 and 7 are sketched right below).
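The spider code below covers steps 1 and 3-5. For steps 2 and 7, here is a minimal sketch of what the item class and the settings entry would look like; the field names match the spider, but the exact file contents and the priority value 300 are assumptions (the usual Scrapy defaults), not taken from the original project.

# qiubai01/items.py  (step 2: define the fields the spider fills in)
import scrapy


class Qiubai01Item(scrapy.Item):
    author = scrapy.Field()
    content = scrapy.Field()


# qiubai01/settings.py  (step 7: enable the pipeline; the number is its priority)
ITEM_PIPELINES = {
    'qiubai01.pipelines.Qiubai01Pipeline': 300,
}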
# -*- coding: utf-8 -*-
import scrapy
from qiubai01.items import Qiubai01Item


class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    # def parse(self, response):
    #     # response is the response object for the request
    #     div_list = response.xpath('//*[@id="content-left"]/div')
    #     for div in div_list:
    #         # author = div.xpath("./div[1]/a[2]/h2/text()")[0].extract()
    #         # the line below is an equivalent way to write it
    #         author = div.xpath("./div[1]/a[2]/h2/text()").extract_first()
    #         content = div.xpath('./a/div/span//text()').extract()
    #         content = "".join(content)
    #         print(author)
    #         print()
    #         print(content)

    # Terminal-based storage:  scrapy crawl qiubai -o qiushi.csv
    # def parse(self, response):
    #     # response is the response object for the request
    #     div_list = response.xpath('//*[@id="content-left"]/div')
    #     all_data_list = []
    #     for div in div_list:
    #         # author = div.xpath("./div[1]/a[2]/h2/text()")[0].extract()
    #         # the line below is an equivalent way to write it
    #         author = div.xpath("./div[1]/a[2]/h2/text()").extract_first()
    #         content = div.xpath('./a/div/span//text()').extract()
    #         content = "".join(content)
    #         dic = {}
    #         dic['author'] = author
    #         dic['content'] = content
    #         all_data_list.append(dic)
    #     return all_data_list

    # Pipeline-based persistent storage

    def parse(self, response):
        """
        1. Parse the data.
        2. Define the relevant fields in the item class.
        3. Instantiate an item object inside the parse method.
        4. Store the parsed data in the item object.
        5. Submit the item object to the pipeline with yield item.
        6. Receive the data in process_item and persist it there.
        7. Enable the pipeline in the settings file.
        """
        # response is the response object for the request
        div_list = response.xpath('//*[@id="content-left"]/div')

        for div in div_list:
            # author = div.xpath("./div[1]/a[2]/h2/text()")[0].extract()
            # the line below is an equivalent way to write it
            author = div.xpath("./div[1]/a[2]/h2/text()").extract_first()
            content = div.xpath('./a/div/span//text()').extract()
            content = "".join(content)
            item = Qiubai01Item()
            item['author'] = author
            item['content'] = content
            # submit the item to the pipeline
            yield item

Full-site (multi-page) data crawling:

1. The spider that parses the data pages

# -*- coding: utf-8 -*-
import scrapy
from choutipro.items import ChoutiproItem


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    # allowed_domains = ['https://dig.chouti.com/']
    start_urls = ['https://dig.chouti.com/all/hot/recent/1']

    pageNum = 1
    url = "https://dig.chouti.com/all/hot/recent/%s"

    def parse(self, response):
        div_list = response.xpath('//*[@id="content-list"]/div')
        for div in div_list:
            # //*[@id="newsContent26168656"]/div[1]/a[1]
            # //*[@id="newsContent26168656"]/div[1]
            # //*[@id="newsContent26168656"]
            content = div.xpath('./div[4]/div[1]/a[1]/text()').extract_first()
            author = div.xpath('./div[4]/div[2]/a[4]/b/text()').extract_first()
            item = ChoutiproItem()
            item['author'] = author
            item['content'] = content
            yield item
        # manually build the URL of the next list page and feed it back
        # into the same parse callback until page 120 has been crawled
        if self.pageNum <= 120:
            self.pageNum += 1
            new_url = self.url % str(self.pageNum)
            print(new_url)
            yield scrapy.Request(url=new_url, callback=self.parse)

2. The item class definition

import scrapy


class ChoutiproItem(scrapy.Item):
    # define the fields for your item here like:
    author = scrapy.Field()
    content = scrapy.Field()

3. The pipeline files

class ChoutiproPipeline(object):
    file = None

    def open_spider(self, spider):
        self.file = open("./test.txt", "a", encoding='utf-8')

    def process_item(self, item, spider):
        # author = item['author']
        content = item['content'].strip()
        try:
            self.file.write(content + '\n')
        except:
            pass
        return item

    def close_spider(self, spider):
        self.file.close()
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql


class Qiubai01Pipeline(object):
    fp = None

    def open_spider(self, spider):
        self.fp = open('./qiubai.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        author = item['author']
        content = item['content']
        print(type(author), type(content))
        try:
            self.fp.write(author + ":" + content)
        except:
            pass
        return item

    def close_spider(self, spider):
        self.fp.close()


class MySql01Pipeline(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='', password='', db='spider')

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        try:
            self.cursor.execute('insert into qiubqi values ("%s","%s")' % (item['author'], item['content']))
            self.conn.commit()
        except:
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
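Since this file defines two pipeline classes (one writing to a text file, one to MySQL), both need to be registered in ITEM_PIPELINES, and each process_item must return the item so the next pipeline receives it. A minimal settings.py sketch; the priority numbers are typical values, not taken from the original project:

# qiubai01/settings.py
ITEM_PIPELINES = {
    'qiubai01.pipelines.Qiubai01Pipeline': 300,  # lower number = runs earlier
    'qiubai01.pipelines.MySql01Pipeline': 301,   # receives the item returned by the pipeline above
}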

4. Handling the log level

  Set LOG_LEVEL = 'ERROR' in settings.py to only log errors, or set LOG_FILE = 'log.txt' to send the log to a file; either works (sketched below).
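A minimal settings.py sketch of the two options:

# settings.py
LOG_LEVEL = 'ERROR'      # only messages at ERROR level or above are logged
# LOG_FILE = 'log.txt'   # or write the full log to a file instead of the console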

5. Passing data between requests: when the data for one item is spread across more than one page, pass what has already been parsed along with the follow-up request; the main change is that the callback has to receive those extra values, which are carried on the request itself (a sketch follows).
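A minimal sketch of this pattern; it is not from the original post, and the spider name, URLs, XPath expressions, and the MovieproItem fields ('name', 'detail') are all hypothetical. The technique itself is standard Scrapy: carry the half-filled item to the next callback through Request.meta.

# -*- coding: utf-8 -*-
import scrapy
from moviepro.items import MovieproItem  # hypothetical item with 'name' and 'detail' fields


class MovieSpider(scrapy.Spider):
    name = 'movie'
    start_urls = ['https://www.example.com/movies']  # placeholder URL

    def parse(self, response):
        # list page: grab the name and the link to the detail page
        for li in response.xpath('//ul[@class="movie-list"]/li'):
            item = MovieproItem()
            item['name'] = li.xpath('./a/text()').extract_first()
            detail_url = li.xpath('./a/@href').extract_first()
            # pass the half-filled item to the next callback through meta
            yield scrapy.Request(url=detail_url, callback=self.parse_detail,
                                 meta={'item': item})

    def parse_detail(self, response):
        # detail page: pull the item back out of meta and finish filling it
        item = response.meta['item']
        item['detail'] = response.xpath('//div[@class="detail"]//text()').extract_first()
        yield item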
