Scrapy: a first example (crawling Tencent recruitment job listings)
import scrapy
import json


class TzcSpider(scrapy.Spider):
    # spider name, must be unique within the project
    name = 'tzc'
    # start URL
    start_urls = ['https://hr.tencent.com/position.php?keywords=python&tid=0&lid=2268']

    # called with the response of every crawled URL
    def parse(self, response):
        # job rows alternate between class "even" and class "odd"
        tr = response.xpath(
            '//table[@class="tablelist"]/tr[@class="even"] | //table[@class="tablelist"]/tr[@class="odd"]')
        with open('info.json', 'a', encoding='utf-8') as f:
            for i in tr:
                data = {
                    "jobName": i.xpath('./td[1]/a/text()').extract_first(),
                    "jobType": i.xpath('./td[2]/text()').extract_first(),
                    "Num": i.xpath('./td[3]/text()').extract_first(),
                    "Place": i.xpath('./td[4]/text()').extract_first(),
                    "Time": i.xpath('./td[5]/text()').extract_first()
                }
                # one JSON object per line; ensure_ascii=False keeps Chinese text readable
                data = json.dumps(data, ensure_ascii=False)
                f.write(data)
                f.write('\n')

        # find the "next page" link
        url_next = response.xpath('//a[@id="next"]/@href').extract_first()
        # the href is a relative path; on the last page it may be missing or inactive
        if url_next and url_next != 'javascript:;':
            # prepend the domain and return a Request for the next page;
            # Scrapy will follow it and call parse() again, so the crawl recurses page by page
            return scrapy.Request('https://hr.tencent.com/{}'.format(url_next))
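The spider above is normally saved as spiders/tzc.py inside a Scrapy project and started with the command "scrapy crawl tzc". For a quick experiment it can also be run as a standalone script with Scrapy's CrawlerProcess. The sketch below assumes the TzcSpider class is defined in the same file; the settings shown (DOWNLOAD_DELAY, LOG_LEVEL) are illustrative choices, not part of the original example.

    # minimal runner sketch, assuming TzcSpider is defined above in this file
    from scrapy.crawler import CrawlerProcess

    if __name__ == '__main__':
        process = CrawlerProcess(settings={
            'DOWNLOAD_DELAY': 1,   # illustrative: wait between requests to be polite
            'LOG_LEVEL': 'INFO',   # illustrative: keep console output readable
        })
        process.crawl(TzcSpider)   # schedule the spider class defined above
        process.start()            # block until the crawl finishes

Either way, each crawled page appends one JSON object per job row to info.json, and the Request returned from parse() keeps the crawl going until no next-page link is found.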