爬虫综合大作业

爬取哔哩哔哩弹幕

import re
import sqlite3

import requests
from lxml import etree
import threading
from queue import Queue
import pandas as pd
import pymysql
from sqlalchemy import create_engine
import jieba
import random
import time
from wordcloud import WordCloud
import matplotlib.pyplot as plt

class BiliSpider:
    '''Bilibili danmaku (bullet-comment) spider.'''
    # Class attribute; not referenced anywhere else in the visible code.
    tindex=0
    # NOTE: a `global` statement inside a class body makes the assignments
    # below bind module-level names instead of class attributes. The methods
    # of this class share state through these module globals.
    global g_wordlist
    global episodes
    global allBarrage
    global ct_episodes
    # Word records accumulated by save_barrage() across episodes; run() passes
    # the final list to save_assql().
    allBarrage = []
    # Placeholder; run() rebinds it to the episode-title list taken from
    # ep_list_q.
    episodes={}
    # Index into `episodes`. save_barrage() increments it before use, so the
    # first episode processed reads episodes[2].
    ct_episodes=1
    # Every jieba token from every processed episode; run() uses it for the
    # series-wide word count and word cloud.
    g_wordlist=[]
    def __init__(self):
        url_st = self.get_url()
        self.start_url = url_st
        self.headers = {
            'Referer': 'https://www.bilibili.com/bangumi/play/ep7821',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            # 'Cookie': 'finger=846f9182; LIVE_BUVID=AUTO7515275889865517; fts=1527589020; BANGUMI_SS_413_REC=7823; sid=bywgf18g; buvid3=89102350-5F5E-4056-A926-16EEC8780EE8140233infoc; rpdid=oqllxwklspdosimsqlwiw; bg_view_413=7820%7C7819%7C7823%7C7822',
            'Host': 'm.bilibili.com',
        }

        self.barrage_url = 'https://comment.bilibili.com/{}.xml'

        # self.proxies = {'https': 'https://115.223.209.238:9000'}
        # 要请求的url队列
        self.url_queue = Queue()
        # 解析出的html字符串队列
        self.html_str_q = Queue()
        #获取集数队列
        self.ep_list_q = Queue()
        # 获取到的弹幕队列
        self.barrage_list_q = Queue()
        #保存至数据库
        #print("在self前")
        print(self.barrage_list_q)
    def get_url(self):
        """Prompt on stdin for an episode id and return the mobile play-page URL.

        The prompt asks for an id such as ``ep63725``; whatever is typed is
        appended to the play-page prefix without validation.
        """
        ep_id = input("请输入移动版的bilibili番剧ep号:\n(如ep63725)")
        return 'https://m.bilibili.com/bangumi/play/' + ep_id

    def parse_url(self, url=None, headers=None):
        """Download one URL and hand back its body decoded as UTF-8.

        Args:
            url: Address to fetch. When ``None``, the next URL is taken from
                ``self.url_queue`` (blocking until one is available).
            headers: Optional request headers passed to ``requests.get``.
                Defaults to ``None`` rather than a shared mutable ``{}``.

        Returns:
            The response text when ``url`` was supplied. In queue mode the
            text is put on ``self.html_str_q`` instead and ``None`` is
            returned.
        """
        from_queue = url is None
        if from_queue:
            url = self.url_queue.get()

        print('\n弹幕xml为:')
        print(url)
        res = requests.get(url, headers=headers)
        # Force UTF-8 instead of relying on the encoding requests guesses.
        res.encoding = 'utf-8'

        if from_queue:
            self.html_str_q.put(res.text)
            return None
        return res.text

    def get_cid(self, html_str):
        """Return every cid found in the page's ``epList`` script, as strings.

        The page source is echoed to stdout. An IndexError is raised when no
        ``<script>`` whose text contains ``epList`` is present.
        """
        page = etree.HTML(html_str)
        print(html_str)
        ep_scripts = page.xpath('//script[contains(text(),"epList")]/text()')
        return re.findall(r'"cid":(\d+)', ep_scripts[0])

    #集标题及副标题
    def get_episodes(self, html_str):
        """Extract the ``share_copy`` strings from the page and queue them.

        The matches are put on ``self.ep_list_q`` as a single list; nothing is
        returned. The matched script text is echoed to stdout. An IndexError is
        raised when no ``<script>`` whose text contains ``epList`` is present.
        """
        ep_content = etree.HTML(html_str).xpath(
            '//script[contains(text(),"epList")]/text()'
        )[0]
        print(ep_content)

        # \S+ is greedy: each match runs up to the next whitespace character.
        titles = re.findall(r'"share_copy":"(\S+)', ep_content)
        self.ep_list_q.put(titles)

    #获取弹幕文件url
    def get_barrage_url(self, cid_list):

        for i in (cid_list[1:]):

            self.url_queue.put(self.barrage_url.format(i))
            # return url_list

    def get_barrage_list(self):
        """Parse the next downloaded XML body into a list of danmaku texts.

        Takes one string from ``self.html_str_q`` (blocking until available),
        collects the text of every ``<d>`` element, puts that list on
        ``self.barrage_list_q`` and also returns it.
        """
        # Parse from UTF-8 bytes rather than from the str itself.
        raw_xml = self.html_str_q.get().encode('utf-8')
        danmaku_texts = etree.HTML(raw_xml).xpath('//d/text()')

        self.barrage_list_q.put(danmaku_texts)
        return danmaku_texts

    def takeSecond(elem):
        return elem[1]

    def save_barrage(self):
        """Tokenise the next queued episode's danmaku and accumulate word counts.

        Takes one danmaku list from ``self.barrage_list_q``, writes it to
        ``barrage2.txt`` (one comment per line), removes every stop word listed
        in ``stop.txt`` from the text, segments the rest with jieba and builds
        one ``{'word', 'e_index', 'count'}`` record per distinct token whose
        length is not 1. The records are appended to the module-level
        ``allBarrage`` and the raw tokens to ``g_wordlist``.

        Returns:
            The module-level ``allBarrage`` list (records of all episodes
            processed so far).
        """
        global g_wordlist
        global ct_episodes
        global allBarrage
        # Advance before use: the counter starts at 1, so the first episode
        # processed is labelled with episodes[2].
        ct_episodes += 1

        # Stop-word list, one entry per line.
        with open("stop.txt", 'r', encoding='utf-8') as stop_file:
            stop = [line.strip() for line in stop_file]

        barrage_list = self.barrage_list_q.get()

        # Build the text once in memory and write it out in one go. Reading
        # barrage2.txt back while the write handle was still open could return
        # truncated or empty content because the writes were still buffered.
        tk = ''.join(barrage + '\n' for barrage in barrage_list)
        with open('barrage2.txt', 'w', encoding='utf-8') as f:
            f.write(tk)

        for s in stop:
            tk = tk.replace(s, "")
        wordlist = jieba.lcut(tk)

        # Count tokens in a single pass. pop-then-reinsert moves a word to the
        # end of the dict every time it is seen, so the records end up ordered
        # by last occurrence.
        counts = {}
        for word in wordlist:
            if len(word) == 1:
                continue
            counts[word] = counts.pop(word, 0) + 1

        title = episodes[ct_episodes]
        b_ls = [aBrrage(word, title, count) for word, count in counts.items()]

        print("\n******")
        print(title)
        print("字幕数量:",len(barrage_list))
        print("处理后弹幕数量:",len(b_ls))

        allBarrage.extend(b_ls)
        g_wordlist.extend(wordlist)
        print('获取成功')
        return allBarrage

    def create_dict(self):
        dict = {}
        wordlist = {}
        return dict

    def run(self):
        '''Main flow: fetch the play page, crawl each episode, then summarise.

        Steps:
          1. Download the start page and extract the cid list and the
             episode-title list from it.
          2. Queue one danmaku XML URL per cid (get_barrage_url skips the
             first cid).
          3. For ``len(episodes) - 2`` episodes: download the XML, extract the
             danmaku and accumulate word counts, pausing up to 3 seconds
             between episodes.
          4. If at least one episode was processed: store the records with
             save_assql(), print up to 25 of the most frequent words and show
             the word cloud.
        '''
        global episodes
        # Request the start page.
        html_str = self.parse_url(url=self.start_url, headers=self.headers)

        # Extract the cids.
        cid_list = self.get_cid(html_str)
        print(cid_list)

        # get_episodes() returns nothing; it hands the titles over through
        # ep_list_q, which is read just below.
        self.get_episodes(html_str)
        # Queue the danmaku XML URLs.
        self.get_barrage_url(cid_list)
        episodes = self.ep_list_q.get()

        ex_len=len(episodes)
        print('==========')

        # NOTE(review): parse_url() blocks on url_queue.get(); this assumes at
        # least ex_len - 2 URLs were queued above - confirm.
        res = None
        for _ in range(ex_len - 2):
            self.parse_url()
            self.get_barrage_list()
            res = self.save_barrage()
            # Random pause between requests.
            time.sleep(random.random() * 3)

        # Summary step, run once after the last episode (skipped when the loop
        # body never ran).
        if res is not None:
            save_assql(res)

            # Frequency table over all tokens whose length is not 1.
            wcdict = {}
            for word in g_wordlist:
                if len(word) != 1:
                    wcdict[word] = wcdict.get(word, 0) + 1
            wcls = sorted(wcdict.items(), key=lambda x: x[1], reverse=True)

            # Slicing instead of indexing avoids an IndexError when fewer
            # than 25 distinct words were collected.
            print('输出系列前二十五词:')
            for entry in wcls[:25]:
                print(entry)
            ciyun(g_wordlist)
            print("保存到数据库")

        print('==========')
        print('主线程结束')

    #保存至数据库
def save_assql(list):
    """Append the collected records to the ``bilibilitest`` MySQL table.

    Args:
        list: Sequence of record dicts; it is turned into a DataFrame whose
            rows are appended to the table. (The name shadows the builtin but
            is kept so existing callers are unaffected.)

    Returns:
        None. Empty input returns immediately without touching the database.
    """
    if not list:
        return

    # NOTE(review): credentials are hard-coded; consider reading them from
    # configuration or the environment.
    conInfo = "mysql+pymysql://root:123456@localhost:3306/bilibili?charset=utf8"
    # The `encoding` keyword is not passed: SQLAlchemy 2.0 no longer accepts
    # it, and UTF-8 was already its default in earlier versions.
    engine = create_engine(conInfo)
    try:
        df = pd.DataFrame(list)
        df.to_sql(name='bilibilitest', con=engine, if_exists='append', index=False)
    finally:
        # Release pooled connections. The extra pymysql.connect() call that
        # used to follow the insert opened a connection that was never used
        # or closed, so it has been removed.
        engine.dispose()

def aBrrage(str,episodes,count):
    """Bundle one word-count record into a dict.

    Returns a dict with keys ``'word'``, ``'e_index'`` and ``'count'`` (in
    that order) holding the three arguments unchanged.
    """
    return {'word': str, 'e_index': episodes, 'count': count}

def ciyun(wordlist):
    """Render the given tokens as a word cloud and display it.

    Args:
        wordlist: Iterable of string tokens (e.g. the output of jieba).
    """
    # WordCloud.generate() splits its input into words itself, so the tokens
    # must be separated. Joining with '' fused adjacent tokens into one long
    # "word" and discarded the segmentation.
    wl_split = ' '.join(wordlist)

    # NOTE(review): the default WordCloud font may lack CJK glyphs; a
    # font_path may be needed for Chinese text - confirm.
    mywc = WordCloud().generate(wl_split)
    plt.imshow(mywc)
    plt.axis("off")
    plt.show()
    
if __name__ == '__main__':
    # Constructing the spider prompts for the episode id; run() does the crawl.
    BiliSpider().run()
View Code

通过伪造headers访问

# Excerpt: request headers imitating a mobile Chrome browser. The 'Cookie'
# entry is commented out, so no cookie is sent.
self.headers = {
            'Referer': 'https://www.bilibili.com/bangumi/play/ep7821',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            # 'Cookie': 'finger=846f9182; LIVE_BUVID=AUTO7515275889865517; fts=1527589020; BANGUMI_SS_413_REC=7823; sid=bywgf18g; buvid3=89102350-5F5E-4056-A926-16EEC8780EE8140233infoc; rpdid=oqllxwklspdosimsqlwiw; bg_view_413=7820%7C7819%7C7823%7C7822',
            'Host': 'm.bilibili.com',
        }

 关键代码:

1.获取视频id:cid

def get_cid(self, html_str):
    """Return every cid found in the page's ``epList`` script, as strings.

    The page source is echoed to stdout. An IndexError is raised when no
    ``<script>`` whose text contains ``epList`` is present.
    """
    page = etree.HTML(html_str)
    print(html_str)
    ep_scripts = page.xpath('//script[contains(text(),"epList")]/text()')
    return re.findall(r'"cid":(\d+)', ep_scripts[0])
View Code

2.生成字典与词云

    def save_barrage(self):
        """Excerpt of a per-episode variant: count words, store them, show a cloud.

        For each danmaku list taken from ``self.barrage_list_q`` this writes
        the comments to ``barrage2.txt``, removes the stop words, segments the
        text with jieba, builds one record per distinct token whose length is
        not 1, then calls save_assql() and ciyun().

        NOTE(review): the ``while True`` loop contains no break or return, so
        this method never returns on its own.
        """
        global episodes
        global g_wordlist
        # NOTE(review): the globals excerpt in this file initialises
        # `episodes` to {}, for which `+ 1` raises TypeError; this variant
        # appears to expect an integer episode counter - confirm.
        episodes=episodes+1
        # Stop-word list.
        stop = [line.strip() for line in open("stop.txt", 'r', encoding='utf-8').readlines()]
        while True:
            barrage_list = self.barrage_list_q.get()
            # Write the danmaku out.
            g_wordlist=[]
            # print(barrage_list)
            with open('barrage2.txt', 'w', encoding='utf-8') as f:

                for barrage in barrage_list:

                    f.write(barrage)
                    f.write('\n')
                # NOTE(review): the file is reopened for reading while the
                # write handle `f` is still open, so buffered writes may not
                # be visible yet.
                fo = open('barrage2.txt','r',encoding='utf-8')
                tk = fo.read()
                for s in stop:
                    tk = tk.replace(s, "")
                fo.close()
                wordlist = jieba.lcut(tk)

                b_ls=[]
                # Seed record with placeholder values; it is appended before
                # any real word is counted.
                ab_dict = {'word':"test",'e_index':999,'count':1}
                b_ls.append(ab_dict)

                # Build the per-word count dictionary.

                temp ={}
                for word in wordlist:
                    duplicates=False
                    if len(word)==1:
                        continue
                    else:
                        temp[word]=temp.get(word,0)+1
                        count=temp[word]
                        new=aBrrage(word,episodes,count)
                        # ab_dict['word']= word
                        # ab_dict['e_index'] = episodes
                        # ab_dict['count']=ab_dict.get(word,0)+1
                        # Scan a copy of b_ls; an existing record for the same
                        # word with a lower count is replaced by the new one.
                        for n1 in b_ls[0:]:
                            if n1['word']==new['word']:
                                duplicates=True

                                if int(new['count'])>int(n1['count']):

                                    n_temp = new
                                    b_ls.remove(n1)
                                    b_ls.append(n_temp)
                                    break
                        # First time this word is seen: add its record.
                        if(duplicates==False):
                            b_ls.append(new)


            print("\n******")
            print("",episodes,"")
            print("字幕数量:",len(barrage_list))
            print("处理后弹幕数量:",len(b_ls))

            allBarrage.extend(b_ls)
            save_assql(allBarrage)
            ciyun(wordlist)
            print('保存成功')
View Code

3.全局变量保存弹幕等信息

global g_wordlist# jieba-segmented danmaku tokens
    global episodes# episode titles
    global allBarrage# list to be saved to the database, carrying extra info such as the episode
    global ct_episodes# episode ordinal
    allBarrage = []
    episodes={}
    ct_episodes=1# the first two title entries carry no valid information
    g_wordlist=[]
View Code

4.词云及数据库保存

def ciyun(wordlist):
    """Render the given tokens as a word cloud and display it.

    Args:
        wordlist: Iterable of string tokens (e.g. the output of jieba).
    """
    # WordCloud.generate() splits its input into words itself, so the tokens
    # must be separated. Joining with '' fused adjacent tokens into one long
    # "word" and discarded the segmentation.
    wl_split = ' '.join(wordlist)

    # NOTE(review): the default WordCloud font may lack CJK glyphs; a
    # font_path may be needed for Chinese text - confirm.
    mywc = WordCloud().generate(wl_split)
    plt.imshow(mywc)
    plt.axis("off")
    plt.show()

def save_assql(list):
    """Append the collected records to the ``bilibilitest`` MySQL table.

    Args:
        list: Sequence of record dicts; it is turned into a DataFrame whose
            rows are appended to the table. (The name shadows the builtin but
            is kept so existing callers are unaffected.)

    Returns:
        None. Empty input prints a failure message and returns without
        touching the database.
    """
    if not list:
        print('保存失败')
        return

    # NOTE(review): credentials are hard-coded; consider reading them from
    # configuration or the environment.
    conInfo = "mysql+pymysql://root:123456@localhost:3306/bilibili?charset=utf8"
    # The `encoding` keyword is not passed: SQLAlchemy 2.0 no longer accepts
    # it, and UTF-8 was already its default in earlier versions.
    engine = create_engine(conInfo)
    try:
        df = pd.DataFrame(list)
        df.to_sql(name='bilibilitest', con=engine, if_exists='append', index=False)
    finally:
        # Release pooled connections. The extra pymysql.connect() call that
        # used to follow the insert opened a connection that was never used
        # or closed, so it has been removed.
        engine.dispose()
View Code

5.主要逻辑

def run(self):
    '''Main flow: fetch the play page, crawl each episode, then summarise.

    Steps:
      1. Download the start page and extract the cid list and the
         episode-title list from it.
      2. Queue the danmaku XML URLs built from the cid list.
      3. For ``len(episodes) - 2`` episodes: download the XML, extract the
         danmaku and accumulate word counts, pausing up to 3 seconds between
         episodes.
      4. If at least one episode was processed: store the records with
         save_assql(), print up to 25 of the most frequent words and show the
         word cloud.
    '''
    global episodes
    # Request the start page.
    html_str = self.parse_url(url=self.start_url, headers=self.headers)

    # Extract the cids.
    cid_list = self.get_cid(html_str)
    print(cid_list)

    # The return value of get_episodes() is not needed; the titles are read
    # from ep_list_q just below.
    self.get_episodes(html_str)
    # Queue the danmaku XML URLs.
    self.get_barrage_url(cid_list)
    episodes = self.ep_list_q.get()

    ex_len=len(episodes)
    print('==========')

    res = None
    for _ in range(ex_len - 2):
        self.parse_url()
        self.get_barrage_list()
        res = self.save_barrage()
        # Random pause between requests.
        time.sleep(random.random() * 3)

    # Summary step, run once after the last episode (skipped when the loop
    # body never ran).
    if res is not None:
        save_assql(res)

        # Frequency table over all tokens whose length is not 1.
        wcdict = {}
        for word in g_wordlist:
            if len(word) != 1:
                wcdict[word] = wcdict.get(word, 0) + 1
        wcls = sorted(wcdict.items(), key=lambda x: x[1], reverse=True)

        # Slicing instead of indexing avoids an IndexError when fewer than 25
        # distinct words were collected.
        print('输出系列前二十五词:')
        for entry in wcls[:25]:
            print(entry)
        ciyun(g_wordlist)
        print("保存到数据库")

    print('==========')
    print('主线程结束')
View Code

 

4.输出词云能够分析该视频或者该系列视频的关键词

输出系列剧集的前二十五词:

拉取弹幕提示:

 

输出该系列剧集的弹幕词云如下图:

总结:

通过读取弹幕可以了解该视频的主要内容,能在观看之前较直观地了解视频的好评程度。若有铺天盖地的谩骂,那么它或许就不是一部适合大众观看的视频。

若看到的关键词是自己喜欢的,那么就能很快选出自己喜欢的视频。

 

posted @ 2019-04-28 21:52  FreyjaFs  阅读(337)  评论(0编辑  收藏  举报