有关爬虫的应用,哔哩哔哩弹幕

爬虫的一个小实例

使用工具:

jieba分词;

Wordcloud词云

requests

  • 源程序

    import requests#爬虫的请求包
    import pandas as pd#pandas数据分析包
    from bs4 import BeautifulSoup
    import datetime
    import re
    import jieba
    from wordcloud import WordCloud
    from imageio import imread
    import matplotlib.pyplot as plt
    
    # Download the danmaku (bullet-comment) XML of one Bilibili video and save
    # the Chinese characters of every comment into ``sign.txt``.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    url = 'http://comment.bilibili.com/125511694.xml'
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url=url, headers=header, timeout=10)
    # Fail loudly on an HTTP error instead of parsing an error page.
    response.raise_for_status()
    # Let requests guess the charset from the body so the text decodes correctly.
    response.encoding = response.apparent_encoding
    data = response.text
    soup = BeautifulSoup(data, 'lxml')
    # The text of each <d> tag is treated as one danmaku.
    d_list = soup.find_all('d')
    dlst = [{'弹幕': d.text} for d in d_list]
    # Passing ``columns`` keeps the '弹幕' column present even when no <d> tag
    # was found, so the loop below simply writes nothing instead of raising.
    df = pd.DataFrame(dlst, columns=['弹幕'])
    # Runs of CJK unified ideographs (U+4E00 '一' .. U+9FA5 '龥'). The previous
    # range started at the ASCII digit '1', which also let through digits,
    # Latin letters and most punctuation. Compiled once, outside the loop.
    pat = re.compile(r'[\u4e00-\u9fa5]+')
    # ``with`` guarantees the file is closed even if a write fails.
    with open('sign.txt', 'w', encoding='utf-8') as f:
        for i in df['弹幕'].values:
            filter_data = pat.findall(i)
            # Fragments are written back to back, without any separator.
            f.write("".join(filter_data))
    

    改进版的弹幕源代码:读取上一步生成的 sign.txt,用 jieba 分词后生成词云

    import requests#爬虫的请求包
    import pandas as pd#pandas数据分析包
    from bs4 import BeautifulSoup
    import datetime
    import re
    import jieba
    from wordcloud import WordCloud
    from imageio import imread
    import matplotlib.pyplot as plt
    
    # Read the text saved in ``sign.txt``, segment it with jieba and render a
    # word cloud shaped by a mask image.
    # ``with`` closes the file even if reading raises.
    with open('sign.txt', 'r', encoding='utf-8') as f:
        data = f.read()
    # Join jieba's tokens with spaces so they reach WordCloud as separate words.
    result = " ".join(jieba.lcut(data))
    color_mask = imread('小猪佩奇.jpg')
    wc = WordCloud(
        # Windows-specific path to a font file; presumably chosen because it
        # contains Chinese glyphs -- adjust on other systems.
        font_path=r'C:\Windows\Fonts\simkai.ttf',
        # NOTE: per the wordcloud documentation, width/height are ignored when
        # ``mask`` is given; the mask image's shape is used instead.
        width=1000,
        height=800,
        mask=color_mask,
        background_color='pink',
    )
    wc.generate(result)
    wc.to_file('bili.jpg')
    plt.imshow(wc)
    # Hide the axes so tick marks are not drawn around the image.
    plt.axis('off')
    plt.show()
    

posted @ 2019-10-28 16:34  alen_zhan  阅读(174)  评论(0)  编辑  收藏  举报
返回顶部