python大作业
词云---利用python对电影评价的爬取
一、抓取网页数据
1:网页爬取一些数据的前期工作
from urllib import request

# Download the Douban "now playing" page for Hangzhou and decode it as UTF-8.
# The context manager closes the HTTP response once the body has been read.
with request.urlopen('https://movie.douban.com/nowplaying/hangzhou/') as resp:
    html_data = resp.read().decode('utf-8')
2:爬取得到的html解析
from bs4 import BeautifulSoup as bs

# Parse the downloaded HTML with the built-in parser.
soup = bs(html_data, 'html.parser')
# The movie entries are <li class="list-item"> elements inside <div id="nowplaying">.
nowplaying_movie = soup.find_all('div', id='nowplaying')
nowplaying_movie_list = nowplaying_movie[0].find_all('li', class_='list-item')
从上图中可以看到,data-subject属性里保存着电影的id,而img标签的alt属性里是电影的名字,因此可以通过这两个属性来获得电影的id和名称。
# Build one {'id': ..., 'name': ...} dict per movie entry.
nowplaying_list = []
for i in nowplaying_movie_list:
    nowplaying_dict = {}
    # The movie id is stored in the <li> element's data-subject attribute.
    nowplaying_dict['id'] = i['data-subject']
    # The movie name is read from the alt attribute of the <img> tag(s);
    # if there are several, the last one wins.
    for tag_img_item in i.find_all('img'):
        nowplaying_dict['name'] = tag_img_item['alt']
    # Append once per movie entry, after its images have been scanned.
    nowplaying_list.append(nowplaying_dict)
二、数据的处理
# Concatenate every comment (stripped of surrounding whitespace) into one string.
comments = ''.join(str(comment).strip() for comment in eachCommentList)
三、词云生成图片
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
from wordcloud import WordCloud#词云包
wordcloud=WordCloud(font_path="simhei.ttf",background_color="white",max_font_size=80)
word_frequence = {x[0]:x[1] for x in words_stat.head(1000).values}
word_frequence_list = []
for key in word_frequence:
temp = (key,word_frequence[key])
word_frequence_list.append(temp)
wordcloud=wordcloud.fit_words(word_frequence_list)
plt.imshow(wordcloud)
# -*- coding: utf-8 -*-
# Full source code: build a word cloud from the Douban short comments of the
# first movie on the "now playing" page.
import warnings
warnings.filterwarnings("ignore")
import jieba  # Chinese word segmentation
import numpy  # numerical arrays
import codecs  # codecs.open can open a file with an explicit text encoding
import re
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from urllib import request
from bs4 import BeautifulSoup as bs
from wordcloud import WordCloud, ImageColorGenerator  # word cloud package
import matplotlib
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)


def getNowPlayingMovie_list():
    """Scrape the Douban "now playing" page for Hangzhou.

    Returns:
        A list of dicts, one per ``<li class="list-item">`` entry, each with
        an ``'id'`` key (the entry's ``data-subject`` attribute) and, when
        the entry contains an ``<img>`` tag, a ``'name'`` key (its ``alt``
        attribute).

    Raises:
        IndexError: if the page has no ``<div id="nowplaying">`` element.
    """
    # The context manager closes the HTTP response after the body is read.
    with request.urlopen('https://movie.douban.com/nowplaying/hangzhou/') as resp:
        html_data = resp.read().decode('utf-8')
    soup = bs(html_data, 'html.parser')
    nowplaying_movie = soup.find_all('div', id='nowplaying')
    nowplaying_movie_list = nowplaying_movie[0].find_all('li', class_='list-item')
    nowplaying_list = []
    for item in nowplaying_movie_list:
        nowplaying_dict = {'id': item['data-subject']}
        # If an entry has several <img> tags, the last alt text wins.
        for tag_img_item in item.find_all('img'):
            nowplaying_dict['name'] = tag_img_item['alt']
        nowplaying_list.append(nowplaying_dict)
    return nowplaying_list


def getCommentsById(movieId, pageNum):
    """Fetch one page (20 entries) of short comments for a movie.

    Args:
        movieId: Douban subject id, as a string.
        pageNum: 1-based page number.

    Returns:
        A list with the text of every comment on that page whose first
        ``<p>`` has a single string child. An empty list is returned when
        ``pageNum`` is not positive.
    """
    if pageNum <= 0:
        return []
    start = (pageNum - 1) * 20
    requrl = ('https://movie.douban.com/subject/' + movieId + '/comments'
              + '?' + 'start=' + str(start) + '&limit=20')
    print(requrl)
    with request.urlopen(requrl) as resp:
        html_data = resp.read().decode('utf-8')
    soup = bs(html_data, 'html.parser')
    eachCommentList = []
    for item in soup.find_all('div', class_='comment'):
        # .string is None when the <p> has nested tags; skip those comments.
        text = item.find_all('p')[0].string
        if text is not None:
            eachCommentList.append(text)
    return eachCommentList


def main():
    """Download comments, count word frequencies and render the word cloud.

    Reads ``stopwords.txt``, ``alice_mask.png`` and ``simhei.ttf`` from the
    working directory, shows the cloud with matplotlib and saves it to
    ``show_Chinese.png``.
    """
    # Collect the first 10 pages of comments for the first listed movie.
    commentList = []
    NowPlayingMovie_list = getNowPlayingMovie_list()
    for page in range(1, 11):
        commentList.extend(getCommentsById(NowPlayingMovie_list[0]['id'], page))

    # Merge all comments into a single string.
    comments = ''.join(str(comment).strip() for comment in commentList)

    # Keep only runs of Chinese characters; this drops punctuation,
    # digits and Latin text.
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    cleaned_comments = ''.join(pattern.findall(comments))

    # Segment the text into words with jieba.
    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})

    # Remove stop words; quoting=3 (QUOTE_NONE) disables quote handling.
    stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3,
                            sep="\t", names=['stopword'], encoding='utf-8')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    # Count occurrences per word. size() + reset_index(name=...) replaces the
    # dict-renaming form of agg(), which recent pandas versions reject.
    words_stat = words_df.groupby('segment').size().reset_index(name="计数")
    words_stat = words_stat.sort_values(by=["计数"], ascending=False)

    # Load the mask image that shapes the cloud; close the file afterwards.
    with Image.open("alice_mask.png") as mask_image:
        bg_pic = numpy.array(mask_image)

    # Render the word cloud from the 1000 most frequent words.
    wordcloud = WordCloud(font_path="simhei.ttf", background_color="white",
                          max_font_size=80, width=2000, height=1800,
                          mask=bg_pic, mode="RGBA")
    word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
    wordcloud = wordcloud.fit_words(word_frequence)

    plt.imshow(wordcloud)  # display the word cloud
    plt.axis("off")
    plt.show()
    wordcloud.to_file('show_Chinese.png')  # save the word cloud image


if __name__ == "__main__":
    main()
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· AI与.NET技术实操系列:基于图像分类模型对图像进行分类
· go语言实现终端里的倒计时
· 如何编写易于单元测试的代码
· 10年+ .NET Coder 心语,封装的思维:从隐藏、稳定开始理解其本质意义
· .NET Core 中如何实现缓存的预热?
· 25岁的心里话
· 闲置电脑爆改个人服务器(超详细) #公网映射 #Vmware虚拟网络编辑器
· 零经验选手,Compose 一天开发一款小游戏!
· 通过 API 将Deepseek响应流式内容输出到前端
· AI Agent开发,如何调用三方的API Function,是通过提示词来发起调用的吗