
Crawling the novel 《三国演义》 with Python, counting word frequencies, and generating a word cloud



Notes:

  1. The regular expression used to extract each chapter's body has to match across line breaks, so dot-matches-newline mode must be enabled with (?s): book_content_re = re.compile(r'(?s)<div.*?id="htmlContent">(.*?)</div>') (a short sketch follows below).
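
A minimal sketch of what (?s) changes, using a hypothetical HTML snippet (not taken from the target site):

import re

html = '<div id="htmlContent">First paragraph.\nSecond paragraph.</div>'

# Without (?s), '.' does not match '\n', so the multi-paragraph body is not captured
print(re.findall(r'<div.*?id="htmlContent">(.*?)</div>', html))      # []
# With (?s) (DOTALL mode), '.' also matches '\n', so the whole body is captured
print(re.findall(r'(?s)<div.*?id="htmlContent">(.*?)</div>', html))  # ['First paragraph.\nSecond paragraph.']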


Source code (practice)

 
import re
import requests
import jieba
import jieba.analyse
import codecs
from collections import Counter
import wordcloud
import matplotlib.pyplot as plt

# URL of the chapter index page to crawl
url = 'http://www.janpn.com/book/sanguoyanyi2.html'


def get_content(url):
    # Fetch the raw bytes first, then decode them
    txt = requests.get(url).content.decode('utf-8')
    # print(txt)

    book_title = re.compile(r'<h3 class="bookTitle">(.+)</h3>')
    # book_title.findall(txt) returns a list; take book_title.findall(txt)[0] for the title itself
    # print(book_title.findall(txt))

    # book_chapters_re = re.compile(r'<li><a href="(.+\.html)">([第|回].+)</a></li>')
    # book_chapters_re = re.compile(r'<ul class="panel-body panel-chapterlist"><li><a href="(.+)">(.*)</a></li></ul>')
    # Important: use non-greedy quantifiers so each match stops at the specific ".html" ending
    book_chapters_re = re.compile(r'<li><a href="(.*?\.html)".*?>(.+?)</a></li>')
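    # For example (hypothetical one-line HTML, not taken from the target site): against
    # '<li><a href="a.html">Ch1</a></li><li><a href="b.html">Ch2</a></li>', the greedy variant
    # r'<li><a href="(.+\.html)">(.+)</a></li>' swallows both entries in a single match, while
    # the non-greedy pattern above yields [('a.html', 'Ch1'), ('b.html', 'Ch2')].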

    book_chapters = book_chapters_re.findall(txt)
    # Enable dot-matches-newline mode with (?s): the chapter body spans many paragraphs
    book_content_re = re.compile(r'(?s)<div.*?id="htmlContent">(.*?)</div>')

    # Strip \r\n from the body text
    m3 = re.compile(r'\r\n')
    # Strip &nbsp; from the body text
    m4 = re.compile(r'&nbsp;')
    # Strip <br /> from the body text
    m5 = re.compile(r'<br />')

    print(book_chapters)
    # gbk here matches the encoding used when the file is read back further below
    with open('三国演义.txt', 'a', encoding='gbk') as f:
        for i in book_chapters:
            print([i[0], i[1]])
            print(i[0])
            i_url = i[0]
            print("正在下载--->%s" % i[1])
            # Fetch each chapter page by its URL: raw bytes first, then decode
            content_html = requests.get(i_url).content.decode('utf-8')
            # Extract the chapter body
            content = book_content_re.findall(content_html)[0]
            print(content)
            content = m3.sub('', content)
            content = m4.sub('', content)
            content = m5.sub('', content)
            print(content)
            f.write('\n' + i[1] + '\n')
            f.write(content)


# =================================================
# Build the stop-word list
def stopwordlist():
    # readlines() (not readline()) so we iterate over the lines of the file, not the characters of one line
    stopwords = [line.strip() for line in open('../结巴分词/hit_stopwords.txt', encoding='UTF-8').readlines()]
    return stopwords


# Segment a sentence into Chinese words with jieba and remove stop words
def seg_depart(sentence):
    print('正在分词')
    sentence_depart = jieba.cut(sentence.strip())

    # Build the stop-word list
    stopwords = stopwordlist()
    # Accumulate the result in outstr
    outstr = ''
    # Drop stop words
    for word in sentence_depart:
        if word not in stopwords:
            if word != '\t':
                outstr += word
                outstr += ' '
    return outstr


# Read the novel from file and generate the word cloud
filepath = '三国演义.txt'

def create_word_cloud(filepath):
    # Read the file contents
    content = codecs.open(filepath, 'r', 'gbk').read()

    # Remove stop words (seg_depart also segments with jieba)
    content = seg_depart(content)

    # Segment with jieba and join the words with spaces for WordCloud
    wordlist = jieba.cut(content)
    wl = ' '.join(wordlist)

    print(wl)

    # Configure the word cloud
    wc = wordcloud.WordCloud(
        # Background color
        background_color='white',
        # Maximum number of words to display
        max_words=100,
        # Path to a font that can render Chinese characters
        font_path=r'C:\Windows\Fonts\msyh.ttc',
        # Canvas size
        height=1200,
        width=1600,
        # Maximum font size
        max_font_size=300,
        # Random seed for the layout/color scheme
        random_state=50
    )

    # Generate the word cloud
    myword = wc.generate(wl)

    # Display the word cloud
    plt.imshow(myword)
    plt.axis('off')
    plt.show()


# =================================================
# Start the crawler
# get_content(url)
# Generate the word cloud
create_word_cloud(filepath)

# ===================================================


# Read the file and count word frequencies
def count_from_file(filepath, top_limit=0):
    with codecs.open(filepath, 'r', 'gbk') as f:
        content = f.read()

        # Collapse runs of whitespace into a single space
        content = re.sub(r'\s+', r' ', content)
        content = re.sub(r'\.+', r' ', content)
        # Remove stop words
        content = seg_depart(content)

        return count_from_str(content, top_limit)


def count_from_str(content, top_limit=0):
    if top_limit <= 0:
        top_limit = 100
    # Extract the article's keywords
    tags = jieba.analyse.extract_tags(content, topK=100)
    print("关键词:")
    print(tags)

    words = jieba.cut(content)

    counter = Counter()

    # Count only the words that were selected as keywords
    for word in words:
        if word in tags:
            counter[word] += 1

    return counter.most_common(top_limit)


# =====================================
print("打印词频统计")

# Print the word-frequency statistics
result = count_from_file(filepath)
print(result)
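
# count_from_file returns the output of Counter.most_common: a list of (word, count) tuples
# sorted by descending count. An optional sketch (commented out) to print one pair per line:
# for word, freq in result:
#     print('%s\t%d' % (word, freq))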


def test(url):
    # Enable dot-matches-newline mode with (?s): the body contains newlines that '.' must match
    book_content_re = re.compile(r'(?s)<div.*?id="htmlContent">(.*?)</div>')
    content_html = requests.get(url).content.decode('gbk')
    print(content_html)
    content = book_content_re.findall(content_html)
    print(content)

# test("http://www.janpn.com/book/171/171182/35225767.html")

 
