一个完整的大作业

import re
from collections import Counter

import jieba
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Download the news index page and collect the text of every headline.
# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() stops us from silently parsing an HTTP error page.
html = requests.get('http://www.kejixun.com/news/index.html', timeout=10)
html.raise_for_status()

# Override the encoding used to decode the response body.
html.encoding = 'gb2312'

soup = BeautifulSoup(html.text, 'html.parser')

# Join headlines with a newline so the tokeniser cannot form a word that
# spans the end of one headline and the start of the next.
news = "\n".join(
    p.get_text() for p in soup.find_all("figcaption", class_='title')
)

# Tokenise the scraped text and keep only tokens that are not a single
# character (single characters, punctuation and whitespace are dropped).
words = jieba.lcut(news)
ls = [word for word in words if len(word) != 1]

# Frequency table of the kept tokens, most frequent first. most_common()
# performs the same stable descending sort by count as the manual version.
counts = Counter(ls)
items = counts.most_common()

# Print at most ten entries; slicing avoids an IndexError when the page
# yielded fewer than ten distinct tokens.
for word, count in items[:10]:
    print("{:<10}{:>5}".format(word, count))

# Persist the kept tokens to ms.txt and read them back as the word-cloud
# input text. Tokens are written space-separated rather than as the Python
# list repr, so the quotes and brackets of the repr never reach WordCloud's
# tokeniser. An explicit UTF-8 encoding makes the round trip independent of
# the platform default, and the context managers close both handles.
with open('ms.txt', 'w', encoding='utf-8') as token_file:
    token_file.write(' '.join(ls))

with open('ms.txt', 'r', encoding='utf-8') as token_file:
    wz = token_file.read()

# Build the word cloud from the text in `wz`, using cloud.jpg both as the
# mask passed to WordCloud and as the colour source for recolouring.
mask_image = plt.imread('cloud.jpg')

cloud_options = dict(
    background_color='white',
    mask=mask_image,
    max_words=2000,
    stopwords=STOPWORDS,
    font_path='C:/Users/Windows/fonts/msyh.ttf',
    max_font_size=200,
    random_state=30,
)
cloud = WordCloud(**cloud_options)
cloud.generate(wz)

# Recolour the generated cloud with colours taken from the mask image.
cloud.recolor(color_func=ImageColorGenerator(mask_image))

# Display the result without axes.
plt.imshow(cloud)
plt.axis('off')
plt.show()

 

 

捕获关键词

 

生成词云

posted @ 2017-11-02 15:53  41钱嘉信  阅读(218)  评论(0)  编辑  收藏  举报