Scraping China.com Tech News
Target: http://tech.china.com/articles/
Crawl every page of the news list and scrape each article's detail page, extracting the title, body text, publish time, source, and other fields.
Creating the project
scrapy startproject China
scrapy genspider -t crawl chinatech tech.china.com
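These two commands produce roughly the skeleton below (the exact set of files varies slightly by Scrapy version); the crawl template pre-fills chinatech.py with a CrawlSpider subclass and an empty rules tuple:

China/
├── scrapy.cfg
└── China/
    ├── items.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        └── chinatech.py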
items.py
from scrapy import Field, Item


class ChinaItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    title = Field()     # article headline
    text = Field()      # article body
    datetime = Field()  # publish time
    source = Field()    # original source of the article
    url = Field()       # detail-page URL
    website = Field()   # site name, fixed to '中华网'
chinatech.py
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from China.items import ChinaItem
from China.loaders import ChinaLoader


class ChinatechSpider(CrawlSpider):
    name = 'chinatech'
    allowed_domains = ['tech.china.com']
    start_urls = ['http://tech.china.com/articles/']

    rules = (
        # Follow links to article detail pages and parse them.
        Rule(LinkExtractor(allow=r'article/.*\.html',
                           restrict_xpaths='//div[@id="left_side"]//div[@class="con_item"]'),
             callback='parse_item'),
        # Follow the "下一页" (next page) link to walk through the list pages.
        Rule(LinkExtractor(restrict_xpaths='//div[@id="pageStyle"]//a[contains(., "下一页")]'))
    )

    def parse_item(self, response):
        loader = ChinaLoader(item=ChinaItem(), response=response)
        loader.add_xpath('title', '//h1[@id="chan_newsTitle"]/text()')
        loader.add_value('url', response.url)
        loader.add_xpath('text', '//div[@id="chan_newsDetail"]//text()')
        loader.add_xpath('datetime', '//div[@id="chan_newsInfo"]/text()',
                         re=r'(\d+-\d+-\d+\s\d+:\d+:\d+)')
        loader.add_xpath('source', '//div[@id="chan_newsInfo"]/text()',
                         re='来源:(.*)')
        loader.add_value('website', '中华网')
        yield loader.load_item()
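The two re arguments pull the timestamp and the source out of the news-info line instead of mapping a whole node. A minimal standalone sketch of what they match, using a made-up sample of the chan_newsInfo text (the live markup may differ):

import re

# Hypothetical text node from //div[@id="chan_newsInfo"]; the real page may differ.
info = '2018-01-01 12:00:00  来源:中华网科技'

print(re.search(r'(\d+-\d+-\d+\s\d+:\d+:\d+)', info).group(1))  # 2018-01-01 12:00:00
print(re.search('来源:(.*)', info).group(1))                     # 中华网科技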
loaders.py
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, Join, Compose


class NewsLoader(ItemLoader):
    # By default, keep only the first extracted value for each field.
    default_output_processor = TakeFirst()


class ChinaLoader(NewsLoader):
    # Join the many text nodes of the body into one string, then trim it.
    text_out = Compose(Join(), lambda s: s.strip())
    source_out = Compose(Join(), lambda s: s.strip())
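To see what the Compose(Join(), ...) chain does, here is a tiny sketch with made-up fragments: Join() concatenates the extracted text nodes with spaces, and the lambda then strips leading and trailing whitespace (interior whitespace is kept):

from scrapy.loader.processors import Join, Compose

process = Compose(Join(), lambda s: s.strip())
print(process([' Hello', 'world ']))  # -> 'Hello world'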
pipelines.py
import json


class ChinaPipeline(object):

    def __init__(self):
        # ensure_ascii=False below writes raw Chinese characters, so the
        # file must be opened with an explicit UTF-8 encoding.
        self.file = open('china.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        text = json.dumps(dict(item), ensure_ascii=False) + ',\n'
        self.file.write(text)
        return item

    def close_spider(self, spider):
        self.file.close()
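Note that the trailing commas leave china.json one bracket pair short of valid JSON; the pipeline is shown mainly to illustrate process_item. Scrapy's built-in feed export can write well-formed JSON Lines without any pipeline:

scrapy crawl chinatech -o china.jl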
settings.py
BOT_NAME = 'China'

SPIDER_MODULES = ['China.spiders']
NEWSPIDER_MODULE = 'China.spiders'

ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    'China.pipelines.ChinaPipeline': 300,
}
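With the pipeline enabled in ITEM_PIPELINES, run the spider from the project root:

scrapy crawl chinatech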