from collections import Counter

import requests
from bs4 import BeautifulSoup
import jieba
import matplotlib.pyplot as plt
from scipy.misc import imread
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator


def get_url(urls, pages=100):
    """Append cnblogs news-list page URLs to *urls* and return it.

    Args:
        urls: list to extend in place.
        pages: number of pages to generate (pages 0 .. pages-1);
            defaults to 100 to match the original hard-coded range.

    Returns:
        The same *urls* list, extended with one URL per page.
    """
    for n in range(pages):
        urls.append('https://news.cnblogs.com/n/page/' + str(n) + '/')
    return urls


def get_info(url, content):
    """Fetch one news-list page and append its headline texts to *content*.

    Args:
        url: cnblogs news page URL to scrape.
        content: list to extend in place; each appended element is the
            list of lines of one headline's text.

    Returns:
        The same *content* list.
    """
    # A timeout prevents the scraping loop from hanging forever on a
    # stalled connection (requests has no default timeout).
    res = requests.get(url, timeout=10)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    news = soup.select('div[class="content"] h2')
    for item in news:
        content.append(item.get_text().strip('\n').split('\n'))
    return content


# Shared accumulators for the (currently disabled) scraping pipeline.
urls = []
single_content = []
all_content = []
# One-off scrape: build the page URLs, collect every headline, then dump
# the first line of each headline to blog.txt for later tokenisation.
# urls = get_url(urls)
# for u in urls:
#     all_content.append(get_info(u, single_content))
# with open('blog.txt', 'w', encoding='utf-8') as name:
#     for cc in all_content[0]:
#         name.write(str(cc[0]) + '\n')


def jieba_split():
    """Tokenise blog.txt with jieba and write the space-joined tokens
    to blog_split.txt."""
    with open('blog.txt', encoding='utf-8') as src:
        raw_text = src.read()
    segmented = " ".join(jieba.cut(raw_text))
    with open('blog_split.txt', 'w', encoding='utf-8') as dst:
        dst.write(segmented)

# jieba_split()

def wordcouter():
    """Count word frequencies in blog_split.txt and write them to counter.txt.

    Each output line has the form '<word>:<count>次'.  (Name kept as-is —
    'wordcouter' — for backward compatibility with existing callers.)
    """
    word_lists = []
    with open('blog_split.txt', 'r', encoding='utf-8') as f:
        for line in f:
            # The file is already space-separated, but re-cutting mirrors
            # the original behaviour exactly.
            word_lists.extend(jieba.cut(line))

    # Counter makes this a single O(n) pass; the original called
    # list.count() once per distinct word, which is O(n * distinct).
    counts = Counter(word_lists)
    couter = [w + u':' + str(c) + u"次\n" for w, c in counts.items()]
    with open('counter.txt', 'w', encoding='utf-8') as f:
        f.writelines(couter)
# wordcouter()

def word_cloud():
    """Render counter.txt as a word cloud shaped by hellokity.JPG.

    Reads the frequency file, re-tokenises it with jieba, and writes the
    rendered image to kity.JPG.
    """
    # Use a context manager so the file handle is not leaked.
    with open('counter.txt', 'r', encoding='utf-8') as f:
        s_words = f.read()
    words = jieba.cut(s_words, cut_all=True)
    words_split = " ".join(words)
    print(words_split)
    # scipy.misc.imread was removed from SciPy (>= 1.2); matplotlib's
    # imread is a drop-in replacement for loading the mask image.
    background_pic = plt.imread('hellokity.JPG')
    word_c = WordCloud(
        width=1000,
        height=1000,
        margin=2,
        background_color='white',
        mask=background_pic,
        # Raw string so backslashes in the Windows path can never be
        # interpreted as escape sequences.
        font_path=r'C:\Windows\Fonts\STZHONGS.TTF',
        stopwords=STOPWORDS,
        max_font_size=100,
        random_state=100
    )
    word_c.generate_from_text(words_split)
    word_c.to_file('kity.JPG')

# Script entry point: renders the cloud from counter.txt. Requires the
# commented-out scrape/split/count steps above to have been run first.
word_cloud()

# posted on 2018-04-24 21:51  135陈若倩  阅读(171)  评论(0) 编辑 收藏 举报