Crawling IT之家 with Scrapy
Create the project
scrapy startproject ithome
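This generates the standard project skeleton; the exact files vary slightly by Scrapy version, but the layout looks roughly like:

ithome/
    scrapy.cfg            # deploy configuration
    ithome/
        __init__.py
        items.py          # item field definitions
        middlewares.py
        pipelines.py      # item pipelines
        settings.py       # project-wide settings
        spiders/          # spider modules live here
            __init__.py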
Create the CrawlSpider
scrapy genspider -t crawl it ithome.com
items.py
import scrapy


class IthomeItem(scrapy.Item):
    title = scrapy.Field()    # article headline
    content = scrapy.Field()  # list of body-paragraph strings
it.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ithome.items import IthomeItem


class ItSpider(CrawlSpider):
    name = 'it'
    allowed_domains = ['ithome.com']
    start_urls = ['https://it.ithome.com/ityejie/']

    rules = (
        # Follow channel and pagination links under ityejie/.
        Rule(LinkExtractor(allow=r'ityejie/'), follow=True),
        # Article pages look like html/it/123456.htm; the dot is escaped
        # so it matches a literal '.', and restrict_xpaths limits link
        # extraction to the article list block.
        Rule(LinkExtractor(allow=r'html/it/\d+\.htm',
                           restrict_xpaths='//*[@id="wrapper"]//*[@class="block"]'),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = IthomeItem()
        # extract_first() returns None instead of raising IndexError
        # when the headline XPath matches nothing.
        item['title'] = response.xpath('//*[@id="wrapper"]/div[1]/div[2]/h1/text()').extract_first()
        item['content'] = response.xpath('//*[@id="paragraph"]/p/text()').extract()
        yield item
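Before trusting those XPaths, it helps to check them interactively in scrapy shell against a real article page. The URL below is a placeholder matching the html/it/\d+\.htm pattern; substitute any live article:

scrapy shell 'https://www.ithome.com/html/it/123456.htm'
>>> response.xpath('//*[@id="wrapper"]/div[1]/div[2]/h1/text()').extract_first()
>>> response.xpath('//*[@id="paragraph"]/p/text()').extract()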
pipelines.py
import json


class IthomePipeline(object):

    def __init__(self):
        # utf-8 is required: with ensure_ascii=False the Chinese text
        # is written verbatim rather than as \uXXXX escapes.
        self.file = open('it.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + ',\n'
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()
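If the trailing-comma JSON format is not a hard requirement, the custom pipeline can be skipped entirely: Scrapy's built-in feed export writes a valid JSON array on its own (add FEED_EXPORT_ENCODING = 'utf-8' to settings.py to keep the Chinese text unescaped):

scrapy crawl it -o it.json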
settings.py
BOT_NAME = 'ithome'

SPIDER_MODULES = ['ithome.spiders']
NEWSPIDER_MODULE = 'ithome.spiders'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    'ithome.pipelines.IthomePipeline': 300,
}
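With ROBOTSTXT_OBEY turned off, it is polite to throttle the crawl yourself. A minimal sketch of optional additions to settings.py; the values here are suggestions, not part of the original post:

# Optional politeness settings (assumed values, tune as needed)
DOWNLOAD_DELAY = 1                  # pause between requests to the same site
CONCURRENT_REQUESTS_PER_DOMAIN = 8  # cap parallel requests per domain
AUTOTHROTTLE_ENABLED = True         # let Scrapy adapt the delay dynamically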
Run the spider
scrapy crawl it
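Items stream into it.json as the pipeline writes them. The crawl can also be launched from a plain Python script via Scrapy's CrawlerProcess API; a minimal sketch (run.py is a hypothetical helper, not part of the original project; run it from the project root so get_project_settings() can find scrapy.cfg):

# run.py -- hypothetical launcher script
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('it')   # spider name, as defined by ItSpider.name
process.start()       # blocks until the crawl finishes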