Scraping Tencent Recruitment Job Postings with the Scrapy Framework

1. Spider: Tencent job postings
# -*- coding: utf-8 -*-
import scrapy
from lxml import etree
from ..items import TencentTtem


class TxSpider(scrapy.Spider):
    name = 'tx'
    allowed_domains = ['hr.tencent.com']

    # build all 55 list-page URLs up front (10 postings per page)
    start_urls = []
    url = 'https://hr.tencent.com/position.php?lid=&tid=&keywords=python&start=%d'
    for page in range(55):
        start_urls.append(url % (page * 10))

    def parse(self, response):
        content = response.body.decode('utf-8')
        # debug helper: dump the raw page to a local file
        # with open('11111.html', 'w', encoding='utf-8') as fp:
        #     fp.write(content)
        tree = etree.HTML(content)
        tr_list = tree.xpath('//table[@class="tablelist"]/tr')
        tr_list.pop()      # drop the trailing pagination row
        tr_list.pop(0)     # drop the header row
        for tr in tr_list:
            item = TencentTtem()
            # job title
            item['name'] = tr.xpath('./td[1]/a/text()')[0]
            # job category
            item['type'] = tr.xpath('./td[2]/text()')[0]
            # number of openings
            item['num'] = tr.xpath('./td[3]/text()')[0]
            # work location
            item['address'] = tr.xpath('./td[4]/text()')[0]
            # publish date
            item['time'] = tr.xpath('./td[5]/text()')[0]

            # detail-page URL (relative), joined onto the domain
            deile = tr.xpath('./td[1]/a/@href')
            item['deile'] = deile[0]
            detail_url = 'https://hr.tencent.com/' + deile[0]

            # hand the partially filled item to the detail callback via meta
            yield scrapy.Request(detail_url, callback=self.deile_parse,
                                 meta={'data': item}, dont_filter=False)

    def deile_parse(self, response):
        content = response.body.decode('utf-8')
        tree = etree.HTML(content)
        item = response.meta['data']
        # job duties: all text nodes in the 3rd row of the detail table
        duty_list1 = tree.xpath('//table[@class="tablelist textl"]/tr[3]//text()')
        item['duty'] = ''.join(t.strip() for t in duty_list1)
        # job requirements: all text nodes in the 4th row
        duty_list2 = tree.xpath('//table[@class="tablelist textl"]/tr[4]//text()')
        item['duty1'] = ''.join(t.strip() for t in duty_list2)
        yield item
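As a side note, the same list-page extraction can be written without importing lxml at all, using Scrapy's built-in selectors on the response. A minimal sketch assuming the same page structure (the spider name and the single start URL here are placeholders, not from the original post):

import scrapy

class TxSelectorSpider(scrapy.Spider):
    name = 'tx_selector'  # hypothetical variant name
    allowed_domains = ['hr.tencent.com']
    start_urls = ['https://hr.tencent.com/position.php?keywords=python&start=0']

    def parse(self, response):
        # slice off the header row and the trailing pagination row
        for tr in response.xpath('//table[@class="tablelist"]/tr')[1:-1]:
            yield {
                'name': tr.xpath('./td[1]/a/text()').extract_first(),
                'type': tr.xpath('./td[2]/text()').extract_first(),
                'num': tr.xpath('./td[3]/text()').extract_first(),
                'address': tr.xpath('./td[4]/text()').extract_first(),
                'time': tr.xpath('./td[5]/text()').extract_first(),
            }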
 
2. Pipelines (pipelines.py)
import pymongo

class TencentPipeline(object):
    def __init__(self):
        # open the connection to the local MongoDB server
        self.client = pymongo.MongoClient('localhost')
        # database: Tencent
        self.db = self.client['Tencent']
        # collection: Tencentjob
        self.table = self.db['Tencentjob']

    def process_item(self, item, spider):
        # insert_one replaces the deprecated Collection.insert
        self.table.insert_one(dict(item))
        return item
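Scrapy also provides open_spider/close_spider hooks on pipelines, which tie the MongoDB client's lifetime to the crawl and close the connection cleanly when the spider finishes. A minimal sketch of the same pipeline using those hooks (class name is hypothetical; database and collection names as above):

import pymongo

class TencentMongoPipeline(object):
    def open_spider(self, spider):
        # called once when the crawl starts
        self.client = pymongo.MongoClient('localhost', 27017)
        self.table = self.client['Tencent']['Tencentjob']

    def process_item(self, item, spider):
        self.table.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        # called once when the crawl ends
        self.client.close()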
3. Items (items.py)
import scrapy

class TencentTtem(scrapy.Item):
    name = scrapy.Field()      # job title
    type = scrapy.Field()      # job category
    num = scrapy.Field()       # number of openings
    address = scrapy.Field()   # work location
    time = scrapy.Field()      # publish date
    deile = scrapy.Field()     # detail-page URL
    duty = scrapy.Field()      # job duties
    duty1 = scrapy.Field()     # job requirements
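For reference, a scrapy.Item behaves like a dict but only accepts the fields declared above; assigning any undeclared key raises a KeyError, which catches field-name typos early:

item = TencentTtem()
item['name'] = 'Python Engineer'   # declared field: OK
print(dict(item))                  # {'name': 'Python Engineer'}
item['salary'] = '20k'             # undeclared field: raises KeyError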
4. Settings (settings.py)
 
# -*- coding: utf-8 -*-
 
# Scrapy settings for tengxu project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 
BOT_NAME = 'tengxu'
 
SPIDER_MODULES = ['tengxu.spiders']
NEWSPIDER_MODULE = 'tengxu.spiders'
 
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'tengxu (+http://www.yourdomain.com)'
 
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
 
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
 
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
 
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
 
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
 
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
 
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'tengxu.middlewares.TengxuSpiderMiddleware': 543,
#}
 
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'tengxu.mymiddleware.TengxuSpiderMiddleware': 543,
# 'tengxu.mymiddleware.ProxyMiddleware': 1,
# }
 
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
 
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'tengxu.pipelines.TencentPipeline': 300,
    # 'tengxu.pipelines.MeijuPipeline': 300,  # leftover pipeline from another project (US TV shows)
}
 
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
 
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
 
# Proxy IP pool for a proxy middleware (left empty in the post)
# PROXIES = [
#
# ]
 
 
LOG_FILE = 'tx.log'       # write log output to this file
LOG_ENABLED = True        # enable logging
LOG_ENCODING = 'UTF-8'    # log file encoding
LOG_LEVEL = 'DEBUG'       # log level
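The commented-out DOWNLOADER_MIDDLEWARES entry above points at tengxu.mymiddleware.ProxyMiddleware, which the post does not show. A minimal sketch of what such a middleware could look like, assuming PROXIES in settings.py is filled with 'http://host:port' strings:

# mymiddleware.py -- hypothetical sketch, not from the original post
import random

class ProxyMiddleware(object):
    def __init__(self, proxies):
        self.proxies = proxies

    @classmethod
    def from_crawler(cls, crawler):
        # read the PROXIES list defined in settings.py
        return cls(crawler.settings.getlist('PROXIES'))

    def process_request(self, request, spider):
        # route each request through a randomly chosen proxy
        if self.proxies:
            request.meta['proxy'] = random.choice(self.proxies)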
 
 
 
 
 
 