Scraping Tencent Recruitment Job Postings with the Scrapy Framework

1. Spider: Tencent job postings
# -*- coding: utf-8 -*-
import scrapy
from lxml import etree
from ..items import TencentTtem


class TxSpider(scrapy.Spider):
    name = 'tx'
    allowed_domains = ['hr.tencent.com']

    # build all 55 list-page URLs up front (10 postings per page)
    start_urls = []
    url = 'https://hr.tencent.com/position.php?lid=&tid=&keywords=python&start=%d'
    for page in range(55):
        start_urls.append(url % (page * 10))

    def parse(self, response):
        content = response.body.decode('utf-8')
        # debug helper: dump the raw page to a local file
        # with open('11111.html', 'w', encoding='utf-8') as fp:
        #     fp.write(content)
        tree = etree.HTML(content)
        tr_list = tree.xpath('//table[@class="tablelist"]/tr')
        tr_list.pop()      # drop the trailing pagination row
        tr_list.pop(0)     # drop the header row
        for tr in tr_list:
            item = TencentTtem()
            # job title
            item['name'] = tr.xpath('./td[1]/a/text()')[0]
            # job category
            item['type'] = tr.xpath('./td[2]/text()')[0]
            # number of openings
            item['num'] = tr.xpath('./td[3]/text()')[0]
            # work location
            item['address'] = tr.xpath('./td[4]/text()')[0]
            # publish date
            item['time'] = tr.xpath('./td[5]/text()')[0]

            # detail-page URL (relative), joined onto the domain
            deile = tr.xpath('./td[1]/a/@href')
            item['deile'] = deile[0]
            detail_url = 'https://hr.tencent.com/' + deile[0]

            # hand the partially filled item to the detail callback via meta
            yield scrapy.Request(detail_url, callback=self.deile_parse,
                                 meta={'data': item}, dont_filter=False)

    def deile_parse(self, response):
        content = response.body.decode('utf-8')
        tree = etree.HTML(content)
        item = response.meta['data']
        # job duties: all text nodes in the 3rd row of the detail table
        duty_list1 = tree.xpath('//table[@class="tablelist textl"]/tr[3]//text()')
        item['duty'] = ''.join(t.strip() for t in duty_list1)
        # job requirements: all text nodes in the 4th row
        duty_list2 = tree.xpath('//table[@class="tablelist textl"]/tr[4]//text()')
        item['duty1'] = ''.join(t.strip() for t in duty_list2)
        yield item
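As a side note, the same list-page extraction can be written without importing lxml at all, using Scrapy's built-in selectors on the response. A minimal sketch assuming the same page structure (the spider name and the single start URL here are placeholders, not from the original post):

import scrapy

class TxSelectorSpider(scrapy.Spider):
    name = 'tx_selector'  # hypothetical variant name
    allowed_domains = ['hr.tencent.com']
    start_urls = ['https://hr.tencent.com/position.php?keywords=python&start=0']

    def parse(self, response):
        # slice off the header row and the trailing pagination row
        for tr in response.xpath('//table[@class="tablelist"]/tr')[1:-1]:
            yield {
                'name': tr.xpath('./td[1]/a/text()').extract_first(),
                'type': tr.xpath('./td[2]/text()').extract_first(),
                'num': tr.xpath('./td[3]/text()').extract_first(),
                'address': tr.xpath('./td[4]/text()').extract_first(),
                'time': tr.xpath('./td[5]/text()').extract_first(),
            }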
 
2. Pipelines (pipelines.py)
import pymongo

class TencentPipeline(object):
    def __init__(self):
        # open the connection to the local MongoDB server
        self.client = pymongo.MongoClient('localhost')
        # database: Tencent
        self.db = self.client['Tencent']
        # collection: Tencentjob
        self.table = self.db['Tencentjob']

    def process_item(self, item, spider):
        # insert_one replaces the deprecated Collection.insert
        self.table.insert_one(dict(item))
        return item
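Scrapy also provides open_spider/close_spider hooks on pipelines, which tie the MongoDB client's lifetime to the crawl and close the connection cleanly when the spider finishes. A minimal sketch of the same pipeline using those hooks (class name is hypothetical; database and collection names as above):

import pymongo

class TencentMongoPipeline(object):
    def open_spider(self, spider):
        # called once when the crawl starts
        self.client = pymongo.MongoClient('localhost', 27017)
        self.table = self.client['Tencent']['Tencentjob']

    def process_item(self, item, spider):
        self.table.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        # called once when the crawl ends
        self.client.close()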
3. Items (items.py)
import scrapy

class TencentTtem(scrapy.Item):
    name = scrapy.Field()      # job title
    type = scrapy.Field()      # job category
    num = scrapy.Field()       # number of openings
    address = scrapy.Field()   # work location
    time = scrapy.Field()      # publish date
    deile = scrapy.Field()     # detail-page URL
    duty = scrapy.Field()      # job duties
    duty1 = scrapy.Field()     # job requirements
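For reference, a scrapy.Item behaves like a dict but only accepts the fields declared above; assigning any undeclared key raises a KeyError, which catches field-name typos early:

item = TencentTtem()
item['name'] = 'Python Engineer'   # declared field: OK
print(dict(item))                  # {'name': 'Python Engineer'}
item['salary'] = '20k'             # undeclared field: raises KeyError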
4. Settings (settings.py)
 
# -*- coding: utf-8 -*-
 
# Scrapy settings for tengxu project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 
BOT_NAME = 'tengxu'
 
SPIDER_MODULES = ['tengxu.spiders']
NEWSPIDER_MODULE = 'tengxu.spiders'
 
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'tengxu (+http://www.yourdomain.com)'
 
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
 
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
 
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
 
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
 
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
 
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
 
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'tengxu.middlewares.TengxuSpiderMiddleware': 543,
#}
 
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'tengxu.mymiddleware.TengxuSpiderMiddleware': 543,
# 'tengxu.mymiddleware.ProxyMiddleware': 1,
# }
 
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
 
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'tengxu.pipelines.TencentPipeline': 300,
    # 'tengxu.pipelines.MeijuPipeline': 300,  # leftover pipeline from another project (US TV shows)
}
 
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
 
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
 
# Proxy IP pool for a proxy middleware (left empty in the post)
# PROXIES = [
#
# ]
 
 
LOG_FILE = 'tx.log'       # write log output to this file
LOG_ENABLED = True        # enable logging
LOG_ENCODING = 'UTF-8'    # log file encoding
LOG_LEVEL = 'DEBUG'       # log level
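The commented-out DOWNLOADER_MIDDLEWARES entry above points at tengxu.mymiddleware.ProxyMiddleware, which the post does not show. A minimal sketch of what such a middleware could look like, assuming PROXIES in settings.py is filled with 'http://host:port' strings:

# mymiddleware.py -- hypothetical sketch, not from the original post
import random

class ProxyMiddleware(object):
    def __init__(self, proxies):
        self.proxies = proxies

    @classmethod
    def from_crawler(cls, crawler):
        # read the PROXIES list defined in settings.py
        return cls(crawler.settings.getlist('PROXIES'))

    def process_request(self, request, spider):
        # route each request through a randomly chosen proxy
        if self.proxies:
            request.meta['proxy'] = random.choice(self.proxies)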
 
 
 
 
 
 