Web Crawler Final Project

# -*- coding:utf-8 -*-
# Third-party libraries
import scrapy
from scrapy.spiders import Spider
from lxml import etree
import re
import jieba
from BoKeYuan.items import BokeyuanItem

class BlogYuanSpider(Spider):
    name = 'blog_yuan'
    start_urls = ['https://www.cnblogs.com/']
    extra = '/#p{}'  # appended to the start URL to build listing-page URLs

    def start_requests(self):
        yield scrapy.Request(self.start_urls[0], callback=self.parse)

    @staticmethod
    def get_num(response):
        # Read the largest page number from the pager on the home page.
        html = response.body
        selector = etree.HTML(html)
        page_num = int(selector.xpath('string(//a[@class="p_200 last"])'))
        return page_num

    @staticmethod
    def get_info(response):
        # Extract the post body, fill the item and print the top-20 words of the post.
        html = response.body
        item = BokeyuanItem()
        selector = etree.HTML(html)
        i = selector.xpath('string(//div[@class="blogpost-body"])')
        info = re.sub(r'[\s+\n\t]', '', i)
        item['info'] = info
        item['url'] = response.url
        d = {}
        text = ''
        text += ' '.join(jieba.lcut(item['info']))
        t = re.sub(r'[\,\'\:\/\)\.\;\}\(\,\{]', '', text).split()
        for v in t:
            d[v] = item['info'].count(v)
        e = list(d.items())
        e.sort(key=lambda x: x[1], reverse=True)
        for k, v in enumerate(e):
            if k < 20:
                print(v)
        yield item

    def get_page_url(self, response):
        # Follow every post link found on a listing page.
        selector = etree.HTML(response.body)
        page_url = selector.xpath('//a[@class="titlelnk"]/@href')
        for p in page_url:
            yield scrapy.Request(url=p, callback=self.get_info)

    def parse(self, response):
        # Home page: follow the post links, then request the remaining listing pages.
        html = response.body
        selector = etree.HTML(html)
        page_url = selector.xpath('//a[@class="titlelnk"]/@href')
        for p in page_url:
            yield scrapy.Request(url=p, callback=self.get_info)
        page_num = self.get_num(response)
        for n in range(2, page_num):
            yield scrapy.Request(self.start_urls[0] + self.extra.format(n), callback=self.get_page_url)

Starting from the entry URL, the spider collects the links to the individual posts on the home page and reads the largest page number from the pager. Iterating over the page numbers and then over the post links on each listing page gives a crawl of depth 3. Each post is yielded as an item, and the 20 most frequent words of its body are printed as a simple word-frequency statistic.
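The word-frequency step in get_info can be tested on its own. The sketch below is a stand-alone version of that logic (the sample_text string is a placeholder, not scraped data); it uses collections.Counter, which counts token occurrences rather than the substring counts of the original loop, but produces the same kind of top-20 list.

# -*- coding:utf-8 -*-
# Stand-alone sketch of the top-20 word-frequency step; sample_text is a placeholder.
import re
from collections import Counter

import jieba

sample_text = 'Scrapy是一个用于抓取网站并提取结构化数据的应用框架,Scrapy用途十分广泛。'

# Strip whitespace and punctuation before segmenting, as get_info does.
cleaned = re.sub(r"[\s,.:;(){}'。,、]", '', sample_text)
words = jieba.lcut(cleaned)

# Counter.most_common(20) replaces the manual dict + sort + enumerate loop.
for word, count in Counter(words).most_common(20):
    print(word, count)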


# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field


class BokeyuanItem(Item):
# define the fields for your item here like:
# name = scrapy.Field()
info = Field()
url = Field()

This items module defines the fields of the Scrapy item (the post body and its URL) that carry the scraped values to the pipeline.
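For reference, a minimal usage sketch of the item (the values are placeholders, not scraped data; it assumes it runs inside the project so the BoKeYuan package is importable):

# Minimal usage sketch of BokeyuanItem; values are placeholders.
from BoKeYuan.items import BokeyuanItem

item = BokeyuanItem()
item['info'] = '博文正文文本'  # cleaned post body, as extracted in get_info
item['url'] = 'https://www.cnblogs.com/example/p/12345.html'
print(dict(item))  # {'info': '博文正文文本', 'url': 'https://www.cnblogs.com/example/p/12345.html'}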

# -*- coding: utf-8 -*-

# Scrapy settings for BoKeYuan project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'BoKeYuan'

SPIDER_MODULES = ['BoKeYuan.spiders']
NEWSPIDER_MODULE = 'BoKeYuan.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'BoKeYuan (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'BoKeYuan.middlewares.BokeyuanSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'BoKeYuan.middlewares.BokeyuanDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'BoKeYuan.pipelines.BokeyuanPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

This settings file sets the default request headers and the download delay, and registers the item pipeline.
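Scrapy also allows a single spider to override these project-wide settings through the custom_settings class attribute; a minimal sketch (the values here are examples, not the project's actual configuration):

# Per-spider override of project settings via custom_settings (values are examples only).
from scrapy.spiders import Spider

class BlogYuanSpider(Spider):
    name = 'blog_yuan'
    custom_settings = {
        'DOWNLOAD_DELAY': 1,  # would override the project-wide value of 3 for this spider only
        'DEFAULT_REQUEST_HEADERS': {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9',  # example value
        },
    }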

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Third-party libraries
import jieba
import re
import os
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# scipy.misc.imread was removed in newer SciPy releases; imageio.imread is used here as a drop-in replacement.
from imageio import imread


class BokeyuanPipeline(object):
    def __init__(self):
        pass

    def process_item(self, item, spider):
        text = ''
        name = item['url'].split('/')[-1].replace('.html', '')
        # Directory containing this pipelines.py; word-cloud images go into a 'pic' subfolder.
        p = os.path.dirname(os.path.abspath(__file__))
        path = os.path.join(p, 'pic')
        if not os.path.exists(path):
            os.mkdir(path)  # os.mkdir returns None, so keep the path string separately
        info = re.sub(r'[\s+\、\(\)\(\)\{\}\_\,\.\。\“\”\;\!\?]', '', item['info'])
        text += ' '.join(jieba.lcut(info))
        background_image = imread(os.path.join(p, 'ju.PNG'))
        wc = WordCloud(
            width=500,
            height=500,
            margin=2,
            background_color='white',  # background colour
            mask=background_image,  # mask image that shapes the cloud
            font_path=r'C:\Windows\Fonts\STZHONGS.TTF',  # a Chinese-capable font is required, otherwise Chinese characters render as boxes
            max_words=2000,  # maximum number of words displayed
            stopwords=STOPWORDS,  # stopword list
            max_font_size=150,  # maximum font size
            random_state=42  # number of random colour schemes
        )
        wc.generate_from_text(text)
        wc.to_file(path + '\\{}.jpg'.format(name))
        return item

    def close_spider(self, spider):
        pass

The pipeline segments each post body with jieba and renders the result as a word-cloud image, one per crawled post.
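The same segmentation and rendering can be tried outside Scrapy; the sketch below is a minimal stand-alone version (the sample text, font path and output filename are placeholders, and no mask image is used, so the image-reading dependency is not needed here):

# -*- coding: utf-8 -*-
# Stand-alone word-cloud sketch; sample text and output filename are placeholders.
import jieba
from wordcloud import WordCloud, STOPWORDS

sample = 'Scrapy是一个为了爬取网站数据、提取结构化数据而编写的应用框架。'
text = ' '.join(jieba.lcut(sample))

wc = WordCloud(
    width=500,
    height=500,
    background_color='white',
    font_path=r'C:\Windows\Fonts\STZHONGS.TTF',  # any Chinese-capable font available on the machine
    stopwords=STOPWORDS,
)
wc.generate_from_text(text)
wc.to_file('test_wordcloud.jpg')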

