cnblogs_spider.py
普通 scrapy
# -*- coding: utf-8 -*-
import scrapy
from ..items import TttItem
class ChoutiSpider(scrapy.Spider):
    """Crawl the cnblogs.com post listing and the page of every listed post.

    ``parse`` reads a listing page, builds one ``TttItem`` per post entry and
    requests the post page; ``get_detail`` adds the post body to the item and
    emits it.
    """

    name = 'chouti'  # spider name
    start_urls = ['https://www.cnblogs.com']

    def parse(self, response):
        """Yield a detail request per listed post, then the next listing page.

        Args:
            response: The downloaded listing page.

        Yields:
            scrapy.Request: One request per post (handled by ``get_detail``,
            with the partially filled item in ``meta['item']``), followed by
            a request for the next listing page when a pager link exists.
        """
        for div in response.xpath('//div[@class="post_item_body"]'):
            url = div.xpath('./h3/a/@href').extract_first()
            if not url:
                # A Request cannot be built without a URL; skip the entry
                # instead of aborting the whole listing page.
                continue

            # The original takes the last text node of the summary; an entry
            # without summary text now yields None instead of an IndexError.
            summary_parts = div.css('.post_item_summary::text').extract()

            item = TttItem()
            item['title'] = div.xpath('./h3/a/text()').extract_first()
            item['outline'] = summary_parts[-1] if summary_parts else None
            item['author'] = div.xpath(
                './div[@class="post_item_foot"]/a/text()'
            ).extract_first()
            item['url'] = url
            # urljoin leaves absolute URLs untouched and resolves relative ones.
            yield scrapy.Request(
                response.urljoin(url),
                callback=self.get_detail,
                meta={'item': item},
            )

        print(response.url)

        # The last <a> inside the pager is taken as the "next page" link.
        next_url = response.xpath('//div[@class="pager"]/a[last()]/@href').extract_first()
        print('next_url', next_url)
        if next_url:
            # No pager link (e.g. on the final page) means there is nothing
            # left to follow; concatenating None would raise TypeError.
            yield scrapy.Request(response.urljoin(next_url), callback=self.parse)

    def get_detail(self, response):
        """Attach the post body to the item carried in the request meta.

        Args:
            response: The downloaded post page; ``response.meta['item']`` is
                the item created in ``parse``.

        Yields:
            The item with its ``content`` field set (None when no body was
            found).
        """
        content = response.xpath('//div[@id="cnblogs_post_body"]').extract_first()
        if not content:
            # NOTE(review): the CSS selector 'content' matches <content>
            # elements, not a class or id - confirm this fallback is intended.
            content = response.css('content').extract_first()
        item = response.meta.get('item')
        item['content'] = content
        yield item
pipelines.py
import pymysql
class CnblogsSaveMysqlPipline(object):
    """Item pipeline that inserts every scraped post into the MySQL table ``cnb``."""

    def open_spider(self, spider):
        """Open the MySQL connection used for the lifetime of the spider."""
        # NOTE(review): credentials are hard-coded here; consider reading them
        # from the project settings or the environment instead.
        self.conn = pymysql.connect(user='root', password='123123', db='cnblogs')

    def close_spider(self, spider):
        """Close the MySQL connection opened in ``open_spider``."""
        self.conn.close()

    def process_item(self, item, spider):
        """Insert one item into ``cnb`` and hand it on to the next pipeline.

        Args:
            item: Mapping with the keys ``title``, ``outline``, ``author``,
                ``url`` and ``content``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so that later pipelines still receive it.

        Raises:
            Exception: Whatever the database driver raises; the open
                transaction is rolled back before the error is re-raised.
        """
        sql = '''insert into cnb (title, outline, author, url, content) values (%s,%s,%s,%s,%s)'''
        row = (
            item['title'],
            item['outline'],
            item['author'],
            item['url'],
            item['content'],
        )
        try:
            # The context manager closes the cursor even when execute fails.
            with self.conn.cursor() as cursor:
                cursor.execute(sql, args=row)
            self.conn.commit()
        except Exception:
            # Do not leave a half-finished transaction on the shared connection.
            self.conn.rollback()
            raise
        return item
分布式爬取
cnblogs_spider.py
# -*- coding: utf-8 -*-
import scrapy
from ..items import TttItem
from scrapy.http import Request
from scrapy_redis.spiders import RedisSpider
class ChoutiSpider(RedisSpider):
    """Distributed variant of the cnblogs crawler, fed with start URLs from Redis.

    Start URLs are read from the Redis key given by ``redis_key``. ``parse``
    reads a listing page, builds one ``TttItem`` per post entry and requests
    the post page; ``get_detail`` adds the post body to the item and emits it.
    """

    name = 'chouti'  # spider name
    allowed_domains = ['www.cnblogs.com']
    redis_key = 'myspider:start_urls'

    def parse(self, response):
        """Yield a detail request per listed post, then the next listing page.

        Args:
            response: The downloaded listing page.

        Yields:
            Request: One request per post (handled by ``get_detail``, with the
            partially filled item in ``meta['item']``), followed by a request
            for the next listing page when a pager link exists.
        """
        for div in response.xpath('//div[@class="post_item_body"]'):
            url = div.xpath('./h3/a/@href').extract_first()
            if not url:
                # A Request cannot be built without a URL; skip the entry
                # instead of aborting the whole listing page.
                continue

            # The original takes the last text node of the summary; an entry
            # without summary text now yields None instead of an IndexError.
            summary_parts = div.css('.post_item_summary::text').extract()

            item = TttItem()
            item['title'] = div.xpath('./h3/a/text()').extract_first()
            item['outline'] = summary_parts[-1] if summary_parts else None
            item['author'] = div.xpath(
                './div[@class="post_item_foot"]/a/text()'
            ).extract_first()
            item['url'] = url
            # urljoin leaves absolute URLs untouched and resolves relative ones.
            yield Request(
                response.urljoin(url),
                callback=self.get_detail,
                meta={'item': item},
            )

        print(response.url)

        # The last <a> inside the pager is taken as the "next page" link.
        # (Renamed from ``next`` so the builtin is no longer shadowed.)
        next_href = response.css('div.pager a:last-child::attr(href)').extract_first()
        print('----爬取下一页地址', next_href)
        if next_href:
            # urljoin avoids both the TypeError on a missing link and the
            # doubled slash that 'https://www.cnblogs.com/' + '/path' produced.
            # Without an explicit callback the request is handled by parse.
            yield Request(response.urljoin(next_href))

    def get_detail(self, response):
        """Attach the post body to the item carried in the request meta.

        Args:
            response: The downloaded post page; ``response.meta['item']`` is
                the item created in ``parse``.

        Yields:
            The item with its ``content`` field set (None when no body was
            found).
        """
        content = response.xpath('//div[@id="cnblogs_post_body"]').extract_first()
        if not content:
            # NOTE(review): the CSS selector 'content' matches <content>
            # elements, not a class or id - confirm this fallback is intended.
            content = response.css('content').extract_first()
        item = response.meta.get('item')
        item['content'] = content
        yield item
settings.py
# Extra Redis connection parameters (here: the server password).
REDIS_PARAMS = dict(password='redis123')

# Share one duplicates filter between all spiders by keeping it in Redis.
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'

# Store the scheduler's request queue in Redis.
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'