全站爬取cnblogs
- 创建
"""
# 1 scrapy startproject cnblogs_crawl
# 2 scrapy genspider cnblogs www.cnblogs.com
"""
- 代码演示
"""
# -*- coding: utf-8 -*-
import scrapy
from cnblogs_crawl.items import CnblogsCrawlItem
from scrapy.http import Request
class CnblogsSpider(scrapy.Spider):
    """Crawl cnblogs.com list pages, follow each post to fetch its full
    content, and paginate through the whole site."""
    name = 'cnblogs'
    allowed_domains = ['www.cnblogs.com']
    start_urls = ['http://www.cnblogs.com/']

    def parse(self, response):
        """Parse one list page.

        Yields a detail-page Request per post (carrying the partially
        filled item in ``meta``) and, if present, a Request for the
        next list page.
        """
        for div in response.css('.post_item'):
            item = CnblogsCrawlItem()
            item['title'] = div.css('h3>a::text').extract_first()
            url = div.css('h3>a::attr(href)').extract_first()
            item['url'] = url
            item['author'] = div.css('.post_item_foot a::text').extract_first()
            # The summary node can contain several text fragments; the last
            # one is the description. Guard against an empty selection
            # (the original `extract()[-1]` raised IndexError there).
            summary = div.css('.post_item_summary::text').extract()
            item['desc'] = summary[-1] if summary else None
            if url:
                # Without an explicit callback Scrapy would reuse `parse`;
                # the detail page needs its own parser, and the item is
                # handed over via meta.
                yield Request(url, callback=self.parse_detail, meta={'item': item})
        # Follow pagination. The href may be relative (e.g. '/sitehome/p/2'),
        # so resolve it against the current response instead of naive string
        # concatenation; on the last page the selector returns None, which
        # previously crashed with a TypeError.
        next_page = response.css('div.pager a:last-child::attr(href)').extract_first()
        if next_page:
            yield Request(response.urljoin(next_page))

    # Detail pages need their own parsing because post templates differ.
    def parse_detail(self, response):
        """Extract the full post body and emit the completed item."""
        item = response.meta.get('item')
        content = response.css('#cnblogs_post_body').extract_first()
        if not content:
            # Fallback for posts rendered with a different template.
            # NOTE(review): 'content' selects a literal <content> tag —
            # this was likely meant to be '#content' or '.content'; confirm.
            content = response.css('content').extract_first()
        item['content'] = content
        yield item
"""
- 脚本方式执行(在 IDE 中直接右键运行即可)
"""
# Programmatic equivalent of running `scrapy crawl cnblogs` in a shell —
# convenient for launching the spider from an IDE run/debug configuration.
from scrapy import cmdline

cmdline.execute(['scrapy', 'crawl', 'cnblogs'])
"""
- 爬取数据持久化到数据库
"""
import pymysql
# 写入数据,持久化
class CnblogsCrawlPipeline(object):
    """Persist crawled articles into the MySQL `article` table."""

    def open_spider(self, spider):
        # One connection for the whole spider run; closed in close_spider.
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, db='cnblogs',
                                    user='root', password='123456')

    def process_item(self, item, spider):
        """Insert one article row and pass the item on.

        Returns the item so later pipeline stages still receive it
        (the original returned None, silently dropping the item for any
        pipeline registered after this one).
        """
        cursor = self.conn.cursor()
        try:
            # Parameterized query: the driver escapes the scraped values,
            # so no SQL injection. `desc` is a MySQL reserved word, hence
            # the backticks.
            sql = '''insert into article (title, url, `desc`, content, author) values (%s, %s, %s, %s, %s)'''
            cursor.execute(sql, args=(item['title'], item['url'], item['desc'],
                                      item['content'], item['author']))
            self.conn.commit()
        finally:
            # The original leaked one open cursor per item.
            cursor.close()
        return item

    def close_spider(self, spider):
        self.conn.close()
"""
Only you can control your future
You're not alone. You still have family, people who care for you and want to save you.