结巴分词与词云,简单爬虫——(python)
bilibili弹幕词云
美国历史词云
JIEBA分词
# Demonstrates the three jieba segmentation modes on the same text.
import jieba

txt = " **** "  # placeholder: substitute the text to segment

# Precise mode (the default): jieba.cut yields the tokens one by one.
res = jieba.cut(txt)
for i in res:
    print(i)

# Full mode: cut_all=True.
res = jieba.cut(txt, cut_all=True)
for i in res:
    print(i)

# Search-engine mode.
res = jieba.cut_for_search(txt)
for i in res:
    print(i)

# Alternatively, the l-prefixed variants return a list that can be
# printed directly instead of being iterated.
res = jieba.lcut(txt)
print(res)

res = jieba.lcut(txt, cut_all=True)
print(res)

res = jieba.lcut_for_search(txt)
print(res)
词云
1.小段文字的词云
# Renders a word cloud from a short in-memory string, shaped by a mask image.
from wordcloud import WordCloud
import matplotlib.pyplot as plt  # was misspelled "pylot"
from imageio import imread

txt = "小段文字"  # placeholder: the text to visualise
color_mask = imread('图片路径')  # placeholder: path of the mask image

wc = WordCloud(
    # Placeholder values (the library defaults); substitute your own.
    # NOTE(review): per the wordcloud docs, width/height are ignored when a
    # mask is supplied and the mask's shape is used instead.
    width=400,
    height=200,
    background_color='black',
    # Placeholder: point this at a font file that contains CJK glyphs,
    # otherwise Chinese words cannot be drawn.
    font_path=r'c:\windows\Fonts\***',
    mask=color_mask,
)
wc.generate(txt)
# The original name 'cy.pg' has no recognised image extension; use .png.
wc.to_file('cy.png')

plt.imshow(wc)
plt.show()
2.文本文件的词云
# Renders a word cloud from a UTF-8 text file, shaped by a mask image.
import jieba  # used below but missing from the original import list
from wordcloud import WordCloud
import matplotlib.pyplot as plt  # was misspelled "pylot"
from imageio import imread

# Read the whole file; the context manager closes it even on error.
with open('文本文件名.txt', encoding='utf8') as f:
    data = f.read()

# WordCloud splits its input on non-word characters, so the jieba tokens
# must be separated by spaces. Joining with "" would simply rebuild the
# unsegmented text and discard the segmentation.
result = " ".join(jieba.lcut(data))

color_mask = imread('图片名.jpg')  # placeholder: path of the mask image

wc = WordCloud(
    # Placeholder values (the library defaults); substitute your own.
    # NOTE(review): per the wordcloud docs, width/height are ignored when a
    # mask is supplied and the mask's shape is used instead.
    height=200,
    width=400,
    # Placeholder: point this at a font file that contains CJK glyphs.
    font_path=r'c:\windows\Fonts\***',
    mask=color_mask,
)
wc.generate(result)
wc.to_file('图片名.png')

plt.imshow(wc)
plt.show()
案例()
1.爬取bilibili弹幕
# Downloads a bilibili comment (danmaku) XML file, extracts the text of every
# <d> element, and writes only the Chinese characters to sign.txt.
import datetime
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

header = {
    'User-Agent': '*********',  # placeholder: a real browser User-Agent
}
# "codecid" is a placeholder for the video's cid. The original used
# backslashes after "http:", which does not form a valid URL.
url = 'http://comment.bilibili.com/codecid.xml'

# Send the request to the remote server.
response = requests.get(url=url, headers=header)
# Decode the body using the encoding detected from its content.
response.encoding = response.apparent_encoding
# The decoded body is exposed as .text (the original read ".txt").
data = response.text

# Parse the document and collect every <d> tag.
soup = BeautifulSoup(data, 'lxml')
d_list = soup.find_all('d')

dlist = []
for d in d_list:
    danmu = {}
    danmu['弹幕'] = d.text  # text content of the tag
    danmu['时间'] = datetime.datetime.now()
    danmu['地址'] = url
    dlist.append(danmu)

# Convert to a two-dimensional table, similar to a spreadsheet. Naming the
# columns explicitly keeps df['弹幕'] valid even when nothing was fetched.
df = pd.DataFrame(dlist, columns=['弹幕', '时间', '地址'])

# Filter rule: runs of Chinese characters. Compiled once, outside the loop.
pat = re.compile(r'[一-龥]+')

# The context manager closes the file even if a write fails.
with open('sign.txt', 'w', encoding='utf8') as f:
    for i in df['弹幕'].values:
        filter_data = pat.findall(i)  # keep only the Chinese runs
        f.write("".join(filter_data))
2.bilibili弹幕分析
# Builds a word cloud from the danmaku text previously saved in sign.txt.
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt  # was misspelled "pylot"
from imageio import imread

# The original wrote "f.close" without parentheses, so the file was never
# closed; the context manager handles that.
with open('sign.txt', 'r', encoding='utf8') as f:
    data = f.read()

# WordCloud splits its input on non-word characters, so the jieba tokens
# must be separated by spaces. Joining with "" would rebuild one unbroken
# string and the cloud would treat it as a single word.
result = " ".join(jieba.lcut(data))

color_mask = imread('图片名称.jpg')  # placeholder: path of the mask image

wc = WordCloud(
    # Placeholder values (the library defaults); substitute your own.
    # NOTE(review): per the wordcloud docs, width/height are ignored when a
    # mask is supplied and the mask's shape is used instead.
    height=200,
    width=400,
    background_color='black',
    # Placeholder: point this at a font file that contains CJK glyphs.
    font_path=r'c:\windows\Fonts\***',
    mask=color_mask,
)
wc.generate(result)
wc.to_file('bilibili.png')

plt.imshow(wc)
plt.show()
————————godlover