scrapy项目3
# -*- coding: utf-8 -*-
import scrapy

# Item class declared in this project's items.py.
from tencent.items import TencentItem


class HrSpider(scrapy.Spider):
    """Spider that walks the Tencent HR position list page by page.

    Each table row is turned into a ``TencentItem`` and the "next page"
    link is followed until it stops pointing at a real URL.
    """

    name = 'hr'
    allowed_domains = ['tencent.com']
    start_urls = ['https://hr.tencent.com/position.php']

    def parse(self, response):
        """Yield one item per listing row, then a request for the next page.

        Args:
            response: the downloaded listing page.

        Yields:
            ``TencentItem`` objects with ``title``, ``position`` and
            ``publish_date`` set, followed by at most one
            ``scrapy.Request`` for the next listing page.
        """
        print("=========")

        # Drop the first and the last <tr> of the table (presumably the
        # header row and the pager row -- confirm against the page markup).
        tr_list = response.xpath("//table[@class='tablelist']/tr")[1:-1]
        for tr in tr_list:
            # The keys used here must match the fields declared on the item
            # class in items.py. A plain dict (item = {}) would also work if
            # item classes are not used.
            item = TencentItem()
            item["title"] = tr.xpath("./td[1]/a/text()").extract_first()
            item["position"] = tr.xpath("./td[4]/text()").extract_first()
            item["publish_date"] = tr.xpath("./td[5]/text()").extract_first()
            yield item

        # On the last page the "next" anchor looks like:
        #   <a href="javascript:;" class="noactive" id="next">下一页</a>
        # so its href is "javascript:;", not the bare string "javascript".
        # Comparing with != "javascript" was therefore always true and the
        # spider requested a bogus URL; a missing anchor (None) crashed the
        # string concatenation. Check the prefix and guard against None.
        next_url = response.xpath("//a[@id='next']/@href").extract_first()
        if next_url and not next_url.startswith("javascript"):
            # urljoin resolves the relative href against the current page URL.
            yield scrapy.Request(
                response.urljoin(next_url),
                callback=self.parse,
            )
pipelines.py
# MongoDB storage (currently disabled). To enable it, uncomment:
# from pymongo import MongoClient
# client = MongoClient()
# collection = client["tencent"]["hr"]

# Item class declared in this project's items.py.
from tencent.items import TencentItem


class TencentPipeline(object):
    """Item pipeline that prints every ``TencentItem`` passing through."""

    def process_item(self, item, spider):
        """Print the item when it is a ``TencentItem`` and pass it on.

        Args:
            item: the item produced by a spider.
            spider: the spider that produced the item (unused here;
                ``spider.name`` can be inspected to tell spiders apart).

        Returns:
            The same ``item``, unchanged, so later pipelines still receive it.
        """
        # print(spider.name)
        if not isinstance(item, TencentItem):
            # Items that belong to other spiders are forwarded untouched.
            return item

        print(item)
        # collection.insert_one(dict(item))
        return item
items.py
import scrapy


# Several item classes can live side by side, one per crawl target
# (for example JD, Chouti, Autohome); the pipeline then tells them apart
# with isinstance checks.


class TencentItem(scrapy.Item):
    """Fields collected for one Tencent job posting.

    ``scrapy.Item`` is used like a dictionary: values are read and written
    with ``item["field"]``.
    """

    # Each scrapy.Field() is itself a dict.
    num = scrapy.Field()
    title = scrapy.Field()
    position = scrapy.Field()
    publish_date = scrapy.Field()


class ChoutiItem(scrapy.Item):
    """Fields collected for one Chouti entry."""

    title = scrapy.Field()
    position = scrapy.Field()
    publish_date = scrapy.Field()


class JdItem(scrapy.Item):
    """Fields collected for one JD entry."""

    title = scrapy.Field()
    position = scrapy.Field()
    publish_date = scrapy.Field()
settings.py
# Only log records at WARNING level or above are emitted.
LOG_LEVEL = "WARNING"

# Crawl responsibly by identifying yourself (and your website) on the
# user-agent. This project sends a desktop Chrome user-agent string.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/67.0.3396.87 Safari/537.36'
)
项目地址:https://github.com/CH-chen/tencent