The Scrapy framework
# Create a project
scrapy startproject demo
# Generate a spider inside the project
cd demo
scrapy genspider first www.baidu.com
# In settings.py, ignore robots.txt and set a browser User-Agent
ROBOTSTXT_OBEY = False
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
# Run the spider (suppress log output)
scrapy crawl first --nolog
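For reference, scrapy genspider drops a template spider into demo/spiders/first.py that looks roughly like this (the exact boilerplate varies a little between Scrapy versions):

import scrapy

class FirstSpider(scrapy.Spider):
    name = 'first'                        # unique name, used by: scrapy crawl first
    allowed_domains = ['www.baidu.com']   # requests outside these domains are filtered out
    start_urls = ['http://www.baidu.com/']

    def parse(self, response):
        # called with the response of every URL in start_urls
        pass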
# Persistent storage, the simple way: feed export
scrapy crawl first -o a.json
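Scrapy picks the feed format from the output file's extension, so the same command covers the other built-in exporters as well:

scrapy crawl first -o a.csv   # CSV
scrapy crawl first -o a.xml   # XML
scrapy crawl first -o a.jl    # JSON lines, one object per line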
# Extracting values from selectors
text = div.xpath('./div[1]//h2/text()')[0].extract()   # extract() on a single Selector returns its string
word = div.xpath('./a//span/text()').extract_first()   # first match as a string, or None if nothing matched
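Recent Scrapy versions also expose get() and getall() on selectors as more readable aliases; a minimal sketch, assuming the same div selector as above:

text = div.xpath('./div[1]//h2/text()').get()      # alias of extract_first(): first match or None
words = div.xpath('./a//span/text()').getall()     # alias of extract(): list of all matched strings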
# Instantiate an item object
item = DemoItem()
# Copy the parsed values into the item's fields
item['title'] = title
item['salary'] = salary
item['company'] = company

# Hand the item over to the pipeline for persistent storage
yield item
# In items.py
class DemoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    salary = scrapy.Field()
    company = scrapy.Field()
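Item objects behave like restricted dicts: only declared fields may be assigned, which catches typos early. A quick illustration, assuming the DemoItem above:

from demo.items import DemoItem

item = DemoItem()
item['title'] = 'python developer'   # fine: 'title' is a declared field
try:
    item['city'] = 'Beijing'         # 'city' is not declared, so this raises KeyError
except KeyError as e:
    print(e)
print(dict(item))                    # items convert cleanly to plain dicts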
# In pipelines.py
class DemoPipeline(object):
    fp = None
    # Runs exactly once, when the spider starts
    def open_spider(self, spider):
        print('Spider started!')
        self.fp = open('./job.txt', 'w', encoding='utf-8')
    # Runs once for each item the spider yields
    def process_item(self, item, spider):
        self.fp.write(item['title'] + '\t' + item['salary'] + '\t' + item['company'] + '\n')
        return item
    # Runs exactly once, when the spider finishes
    def close_spider(self, spider):
        print('Spider finished!')
        self.fp.close()

# Note: pipelines are disabled by default and must be enabled in settings.py:

ITEM_PIPELINES = {
    'demo.pipelines.DemoPipeline': 300,
}
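The number 300 is the pipeline's priority: when several pipelines are registered, items flow through them in ascending order of this value, and each process_item must return the item so the next pipeline receives it. A sketch with a hypothetical second pipeline named MysqlPipeline:

ITEM_PIPELINES = {
    'demo.pipelines.DemoPipeline': 300,    # runs first (lower value = earlier)
    'demo.pipelines.MysqlPipeline': 301,   # hypothetical: receives whatever DemoPipeline returns
}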
Crawling data across multiple pages
import scrapy
from demo.items import DemoItem

class FirstSpider(scrapy.Spider):
    name = 'first'
    # allowed_domains = ['www.baidu.com']
    start_urls = ['https://www.zhipin.com/c101010100/?query=python&page=1']
    # page counter and URL template for the pages that follow page 1
    page = 1
    page_model = 'https://www.zhipin.com/c101010100/?query=python&page=%d'

    def parse(self, response):
        li_list = response.xpath('//div[@class="job-list"]/ul/li')
        for li in li_list:
            title = li.xpath('.//div[@class="info-primary"]/h3[@class="name"]/a/div/text()').extract_first()
            salary = li.xpath('.//div[@class="info-primary"]/h3[@class="name"]/a/span/text()').extract_first()
            company = li.xpath('.//div[@class="company-text"]/h3/a/text()').extract_first()
            demoitem = DemoItem()
            demoitem['title'] = title
            demoitem['salary'] = salary
            demoitem['company'] = company
            yield demoitem
        # paginate: build the next page's URL and feed it back into parse
        # (this recursion requests pages 2 through 6 after the initial page 1)
        if self.page <= 5:
            self.page += 1
            new_url = self.page_model % self.page
            yield scrapy.Request(url=new_url, callback=self.parse)
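Since the page URLs are known up front, the counter-based recursion above could also be replaced by enumerating the URLs directly; a minimal alternative sketch using the same page_model:

start_urls = [
    'https://www.zhipin.com/c101010100/?query=python&page=%d' % n
    for n in range(1, 7)   # pages 1-6, matching the counter logic above
]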