简书网站 爬取所有文章(同步方式保存数据库)
import re

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from jianshu.items import JianshuItem

# Compiled once at import time; used for every counter on every page.
_DIGITS = re.compile(r'\d+')


def _first_number(text):
    """Return the first run of digits in *text* as a string, or None.

    *text* may be None (the selector matched nothing); in that case, or when
    the text holds no digits, None is returned instead of raising.
    """
    if not text:
        return None
    match = _DIGITS.search(text)
    return match.group() if match else None


class JsSpider(CrawlSpider):
    """Crawl jianshu.com and yield one ``JianshuItem`` per article page."""

    name = 'js'
    allowed_domains = ['jianshu.com']
    start_urls = ['https://www.jianshu.com/']

    rules = (
        # Match article links and keep following links found on those pages.
        Rule(
            LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'),
            callback='parse_detail',
            follow=True,
        ),
    )

    def parse_detail(self, response):
        """Extract the article fields from *response* and yield an item.

        Any field whose node is missing from the page is stored as None
        rather than aborting the callback with an exception.
        """
        title = response.xpath('//h1[@class="title"]/text()').get()
        avatar = response.xpath('//a[@class="avatar"]/img/@src').get()
        author = response.xpath('//span[@class="name"]//text()').get()
        content = response.xpath('//div[@class="show-content"]').get()

        # .get() returns None when nothing matches, so guard before .replace().
        pub_time = response.xpath('//span[@class="publish-time"]/text()').get()
        if pub_time is not None:
            pub_time = pub_time.replace('*', '')

        # Keep only the numeric part of each counter label.
        read_count = _first_number(
            response.xpath('//span[@class="views-count"]/text()').get())
        comment_count = _first_number(
            response.xpath('//span[@class="comments-count"]/text()').get())
        like_count = _first_number(
            response.xpath('//span[@class="likes-count"]/text()').get())
        # NOTE(review): this value is read from the "wordage" span although it
        # is stored as rewards_count -- confirm that this is the intended node.
        rewards_count = _first_number(
            response.xpath('//span[@class="wordage"]/text()').get())

        yield JianshuItem(
            title=title,
            avatar=avatar,
            author=author,
            content=content,
            pub_time=pub_time,
            read_count=read_count,
            comment_count=comment_count,
            like_count=like_count,
            rewards_count=rewards_count,
        )
items:
import scrapy


class JianshuItem(scrapy.Item):
    """Scrapy item declaring the nine fields stored for one article record."""

    # Textual / descriptive fields.
    title = scrapy.Field()
    avatar = scrapy.Field()
    author = scrapy.Field()
    content = scrapy.Field()
    pub_time = scrapy.Field()

    # Counter fields.
    read_count = scrapy.Field()
    comment_count = scrapy.Field()
    like_count = scrapy.Field()
    rewards_count = scrapy.Field()
pipeline:
class JianShuPipeline(object):
    """Item pipeline that synchronously inserts each item into MySQL.

    One connection and one cursor are opened when the pipeline is created,
    reused for every item, and released in ``close_spider``.
    """

    def __init__(self):
        # Local import so the class works even when the module header does
        # not import pymysql.
        import pymysql

        db_params = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': '',
            'database': 'jianshu',
            'charset': 'utf8',
        }
        self.conn = pymysql.connect(**db_params)
        self.cursor = self.conn.cursor()
        self._sql = None

    def process_item(self, item, spider):
        """Insert *item* into the ``article`` table and return it unchanged.

        The insert is committed immediately. If executing or committing
        fails, the transaction is rolled back and the exception re-raised.
        """
        params = (
            item['title'],
            item['content'],
            item['author'],
            item['avatar'],
            item['pub_time'],
            item['like_count'],
            item['read_count'],
            item['comment_count'],
            item['rewards_count'],
        )
        try:
            self.cursor.execute(self.sql, params)
            self.conn.commit()
        except Exception:
            # Do not leave a half-finished transaction on the shared
            # connection; the error itself still propagates to the caller.
            self.conn.rollback()
            raise
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection."""
        self.cursor.close()
        self.conn.close()

    @property
    def sql(self):
        """Return the parameterized INSERT statement, built on first access."""
        if not self._sql:
            self._sql = """
                INSERT INTO article(id, title, content, author, avatar,
                                    pub_time, like_count, read_count,
                                    comment_count, rewards_count)
                VALUES (null, %s, %s, %s, %s, %s, %s, %s, %s, %s);
            """
        return self._sql