腾讯招聘爬虫代码

from scrapy_plus.core.spider import Spider
from scrapy_plus.http.request import Request


class TencentSpider(Spider):
    """Spider for the Tencent recruitment site.

    ``parse`` walks the paginated job list, emits one detail-page request
    per job row (carrying the partially filled item in ``meta``) and then
    follows the "next page" link. ``parse_detail`` completes the item with
    the job description and yields it.

    NOTE(review): the code assumes ``xpath()`` returns a list-like sequence
    of results (it is sliced, measured with ``len`` and indexed) -- confirm
    against the scrapy_plus response implementation.
    """

    name = 'tencent'
    start_urls = ['https://hr.tencent.com/position.php']

    # Prefix prepended to the relative hrefs found in the pages.
    _base_url = 'https://hr.tencent.com/'

    @staticmethod
    def _first(results, default=None):
        """Return the first XPath result, or ``default`` when there is none."""
        return results[0] if results else default

    def parse(self, response):
        """Parse one list page.

        Yields a ``Request`` for every job row that has a detail link, then
        a ``Request`` for the next list page when one exists.
        """
        print(response.url + '*****')
        # The first and last rows of the table are skipped.
        tr_list = response.xpath('//*[@class="tablelist"]//tr')[1:-1]
        print(len(tr_list))

        for tr in tr_list:
            detail_href = self._first(tr.xpath('./td[1]/a/@href'))
            if detail_href is None:
                # Without a detail link no detail request can be built; skip
                # the row instead of aborting the whole page with IndexError.
                continue

            # Data available on the list page; missing cells become None.
            item = {
                'name': self._first(tr.xpath('./td[1]/a/text()')),
                'address': self._first(tr.xpath('./td[4]/text()')),
                'time': self._first(tr.xpath('./td[5]/text()')),
            }

            # Request the detail page, handing the item over through meta.
            detail_url = self._base_url + detail_href
            print(detail_url)
            yield Request(
                detail_url,
                parse='parse_detail',
                meta=item,  # meta takes a dict
            )

        # Pagination: evaluate the "next page" XPath once and tolerate its
        # absence rather than raising IndexError.
        next_href = self._first(response.xpath('//a[text()="下一页"]/@href'))
        if next_href is None:
            return
        print(next_href)
        # 'javascript:;' is treated as "no further page" and is not followed.
        if next_href != 'javascript:;':
            yield Request(self._base_url + next_href, parse='parse')

    def parse_detail(self, response):
        """Parse a job detail page and yield the completed item.

        The item started in ``parse`` arrives via ``response.meta``; the
        first text node under the ``squareli`` element is stored as
        ``job_content`` (``None`` when the page has no such node).
        """
        item = response.meta  # dict passed in through the request's meta
        item['job_content'] = self._first(
            response.xpath('//*[@class="squareli"]//text()')
        )
        print(item)
        yield item
posted on 2018-08-23 16:48  老韩来了  阅读(171)  评论(0)  编辑  收藏  举报