Crawling all Jianshu articles site-wide with Scrapy
Here we use the CrawlSpider template and let its link-filtering rules drive the crawl, then store the scraped results in MySQL. The code follows:
jianshu_spider.py
```python
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jianshu.items import JianshuItem
import html  # only needed if the html.escape() line below is re-enabled


class JianshuSpiderSpider(CrawlSpider):
    name = 'jianshu_spider'
    allowed_domains = ['jianshu.com']
    start_urls = ['http://jianshu.com/']

    # Article URLs look like /p/<12-char slug>; follow every page
    # and parse the ones that match
    rules = (
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'),
             callback='parse_article', follow=True),
    )

    def parse_article(self, response):
        # The slug after /p/ uniquely identifies the article
        article_code = response.url.split("?")[0].split("/")[-1]
        title = response.xpath('//h1[@class="title"]/text()').get().strip()
        author = response.xpath('//div[contains(@class, "author")]/div[@class="info"]//span[@class="name"]/a/text()').get().strip()
        head_img = response.xpath('//div[contains(@class, "author")]/a[@class="avatar"]/img/@src').get()
        pub_time = response.xpath('//span[@class="publish-time"]/text()').get().strip().replace('*', '')
        # The avatar src is protocol-relative, so prepend the scheme
        head_img_url = "http:{}".format(head_img)
        # If '/' needs escaping before storing in the database:
        # content = html.escape(response.xpath('//div[@class="show-content"]').get())
        content = response.xpath('//div[@class="show-content"]').get()

        yield JianshuItem(
            article_code=article_code,
            title=title,
            author=author,
            head_img_url=head_img_url,
            content=content,
            pub_time=pub_time,
        )
```
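The only rule that matters here is the LinkExtractor pattern: Jianshu article pages live under `/p/` followed by a 12-character lowercase alphanumeric slug. A quick standalone check of that pattern (a minimal sketch; the sample URLs are made up for illustration):

```python
import re

# Same pattern as in the Rule above
ARTICLE_RE = re.compile(r'.*/p/[0-9a-z]{12}.*')

# Hypothetical URLs, purely for illustration
print(bool(ARTICLE_RE.match('https://www.jianshu.com/p/0123456789ab')))  # True: article page
print(bool(ARTICLE_RE.match('https://www.jianshu.com/u/1f21e30weiy4')))  # False: user page, no /p/
```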
items.py
```python
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class JianshuItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    article_code = scrapy.Field()
    title = scrapy.Field()
    author = scrapy.Field()
    pub_time = scrapy.Field()
    head_img_url = scrapy.Field()
    content = scrapy.Field()
```
pipelines.py
```python
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

from jianshu import model


class JianshuPipeline(object):
    def __init__(self):
        # One session per pipeline instance, reused for every item
        self.session = model.DBSession()

    def process_item(self, item, spider):
        # item is dict-like, so it unpacks straight into the model
        article = model.Article(**item)
        try:
            self.session.add(article)
            self.session.commit()
        except Exception as e:
            print("=" * 100)
            print("INSERT ERROR:", e)
            self.session.rollback()
        return item

    def open_spider(self, spider):
        pass

    def close_spider(self, spider):
        self.session.close()
```
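As the boilerplate comment says, the pipeline only runs once it is registered in settings.py. A minimal excerpt (the priority 300 is just the conventional middle value; lower numbers run first):

```python
# settings.py (excerpt)
ITEM_PIPELINES = {
    'jianshu.pipelines.JianshuPipeline': 300,
}
```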
model.py
```python
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import String, Text, DateTime, Column, Integer, VARCHAR
from sqlalchemy.orm import sessionmaker

# Create the database engine
engine = create_engine(
    "mysql+pymysql://jianshu:jianshu@localhost:3306/jianshu?charset=utf8mb4",
    echo=False)

# Declarative base: the class that maps ORM models onto actual database
# tables and maintains the class-to-table registry
Base = declarative_base()


class Article(Base):
    __tablename__ = "jianshu_article"

    id = Column(Integer, autoincrement=True, primary_key=True)
    article_code = Column(String(16), nullable=False)
    title = Column(Text)
    author = Column(String(16))
    # The scraped publish time includes the date, so DateTime rather than Time
    pub_time = Column(DateTime)
    head_img_url = Column(VARCHAR(256))
    content = Column(Text)


DBSession = sessionmaker(bind=engine)

if __name__ == '__main__':
    # Create the table if it does not exist yet
    Base.metadata.create_all(engine)
```
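Run `python model.py` once before crawling so that `create_all` builds the jianshu_article table. After a crawl, the stored rows can be inspected through the same session factory; a minimal sketch, assuming the jianshu database and user from the connection string above exist:

```python
from jianshu import model

session = model.DBSession()
try:
    # How many articles have been stored so far
    total = session.query(model.Article).count()
    print("articles stored:", total)
    # Show the five most recently inserted titles
    for article in (session.query(model.Article)
                    .order_by(model.Article.id.desc())
                    .limit(5)):
        print(article.article_code, article.title)
finally:
    session.close()
```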