Python: scraping a company group's job postings and posting details with Scrapy's CrawlSpider
For job postings organized like this (a paginated listing page plus a detail page per job), Scrapy's CrawlSpider is a very good fit.
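The reason it fits is that CrawlSpider makes link following declarative: you describe which links to keep crawling and which to hand to a callback, using Rule and LinkExtractor, and Scrapy schedules the requests for you. A minimal sketch of that pattern follows; the class name and URL regexes below are placeholders for illustration only, the real spider for this site is in gxx.py further down.

# Illustrative CrawlSpider skeleton (placeholder names and URL patterns).
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ListDetailSpider(CrawlSpider):
    name = "list_detail_demo"                              # hypothetical spider name
    start_urls = ["https://example.com/jobs/?page=1"]      # placeholder URL

    rules = (
        # Follow pagination links; no callback, so these pages are only crawled for more links.
        Rule(LinkExtractor(allow=r"/jobs/\?page=\d+"), follow=True),
        # Hand each detail page to parse_item.
        Rule(LinkExtractor(allow=r"/jobs/detail/\d+"), callback="parse_item"),
    )

    def parse_item(self, response):
        yield {"title": response.xpath("//h1/text()").extract_first()}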
1. settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for gosuncn project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'gosuncn'

SPIDER_MODULES = ['gosuncn.spiders']
NEWSPIDER_MODULE = 'gosuncn.spiders'

LOG_LEVEL = "WARNING"
LOG_FILE = "./gxx.log"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'gosuncn (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'gosuncn.middlewares.GosuncnSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'gosuncn.middlewares.GosuncnDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'gosuncn.pipelines.GosuncnPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
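The only lines changed from the generated template are LOG_LEVEL/LOG_FILE (only WARNING and above get written, to ./gxx.log) and ITEM_PIPELINES, which enables the cleaning pipeline below. The value 300 is a priority: items pass through enabled pipelines in ascending order, so a second pipeline could be chained after the cleaning one. A small sketch of that idea; the MongoPipeline entry is hypothetical and not part of this project:

# Illustrative only: chaining a second, hypothetical pipeline after the cleaning one.
# Lower numbers run earlier, so GosuncnPipeline cleans each item before it is stored.
ITEM_PIPELINES = {
    'gosuncn.pipelines.GosuncnPipeline': 300,   # cleaning pipeline from this post
    'gosuncn.pipelines.MongoPipeline': 400,     # hypothetical storage pipeline
}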
2. pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import logging
import re

logger = logging.getLogger(__name__)


class GosuncnPipeline(object):
    def process_item(self, item, spider):
        """
        Data cleaning is done here in the pipeline.
        :param item:
        :param spider:
        :return:
        """
        # Strip the <p> wrapper, <br> tags, tabs and surrounding line breaks
        # from the raw job_responsible HTML fragment extracted by the spider.
        item["job_responsible"] = re.sub(r"<p>\r\n ", "", item["job_responsible"])
        item["job_responsible"] = re.sub(r"\r\n </p>", "", item["job_responsible"])
        item["job_responsible"] = re.sub(r"(<br>){1,2}", "", item["job_responsible"])
        item["job_responsible"] = re.sub(r"\t", "", item["job_responsible"])
        logger.warning(item)
        print(item)
        return item
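The chained re.sub calls only handle the exact tags and whitespace seen on this particular page. If the markup shifts, a more tolerant option is to strip all tags and collapse whitespace in one helper. A sketch of that alternative, assuming w3lib (which Scrapy already installs as a dependency); the helper name clean_html_fragment is my own:

# Sketch: a more tolerant cleanup than the hard-coded re.sub chain (assumes w3lib).
import re
from w3lib.html import remove_tags


def clean_html_fragment(fragment):
    """Strip tags such as <p>/<br> and collapse runs of whitespace."""
    if fragment is None:
        return None
    text = remove_tags(fragment)               # drop all HTML tags
    return re.sub(r"\s+", " ", text).strip()   # collapse \r\n, \t and repeated spaces

Used inside process_item, it would replace the four re.sub lines for job_responsible with a single call.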
3. gxx.py (the spider)
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import re
import logging

logger = logging.getLogger(__name__)


class GxxSpider(CrawlSpider):
    name = 'gxx'
    allowed_domains = ['gosuncn.zhiye.com']
    start_urls = ['https://gosuncn.zhiye.com/social/?PageIndex=1']

    rules = (
        # Job detail pages -> parse_item
        Rule(LinkExtractor(allow=r'/zpdetail/\d+\?PageIndex=\d'), callback='parse_item',),
        # Pagination: keep following the listing pages
        Rule(LinkExtractor(allow=r'/social/\?PageIndex=\d+'), follow=True),
    )

    def parse_item(self, response):
        item = {}
        # Job title
        item["job_name"] = response.xpath("//div[@class='boxSupertitle']/span/text()").extract_first()
        ul_list = response.xpath("//div[@class='xiangqingcontain']/ul[1]")
        for ul in ul_list:
            # Recruitment type
            item["recuirt_type"] = ul.xpath("./li[2]/text()").extract_first()
            item["recuirt_type"] = re.sub("\r\n ", "", item["recuirt_type"])
            item["recuirt_type"] = re.sub("\r\n ", "", item["recuirt_type"])
            # Job type
            item["job_type"] = ul.xpath("./li[4]/text()").extract_first()
            item["job_type"] = re.sub("\r\n ", "", item["job_type"])
            item["job_type"] = re.sub("\r\n ", "", item["job_type"])
            # Salary
            item["pay_money"] = ul.xpath("./li[6]/text()").extract_first()
            item["pay_money"] = re.sub("\r\n ", "", item["pay_money"])
            item["pay_money"] = re.sub("\r\n ", "", item["pay_money"])
            # Publish date
            item["publish_time"] = re.findall(r"20\d+-\d+-\d+", response.body.decode())[0]
            # Number of openings
            item["recuirt_num"] = ul.xpath("./li[8]/text()").extract_first()
            item["recuirt_num"] = re.sub("\r\n ", "", item["recuirt_num"])
            item["recuirt_num"] = re.sub("\r\n ", "", item["recuirt_num"])
            # Work location
            item["job_place"] = response.xpath("//div[@class='xiangqingcontain']/ul[3]/li[2]/text()").extract_first()
            item["job_place"] = re.sub("\r\n\r\n ", "", item["job_place"])
            item["job_place"] = re.sub("\r\n ", "", item["job_place"])
            # Job responsibilities: raw HTML fragment, cleaned later in the pipeline
            item["job_responsible"] = response.xpath("//div[@class='xiangqingtext']/p[2]").extract_first()
            yield item
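Running scrapy crawl gxx from the project root starts the spider, and the warning output lands in ./gxx.log because of the settings above. If you prefer launching it from an IDE or debugger, a small runner script in the project directory (next to scrapy.cfg) also works. A sketch; the filename start.py is arbitrary:

# start.py - run the spider from Python instead of the command line (sketch).
from scrapy.cmdline import execute

if __name__ == "__main__":
    # Equivalent to running "scrapy crawl gxx" inside the project directory.
    execute(["scrapy", "crawl", "gxx"])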
This article is from cnblogs (博客园). Author: 小白啊小白,Fighting. Please credit the original link when reposting: https://www.cnblogs.com/ywjfx/p/11097970.html