抓取qq音乐评论 (林俊杰-雪落下的声音 13061条) 制作词云图，是否值得一听

使用抓包工具 charles 抓取qq音乐客户端

url = "https://c.y.qq.com/base/fcgi-bin/fcg_global_comment_h5.fcg?g_tk=798799166&loginUin=1152921504630904742&hostUin=0&format=json&inCharset=GB2312&outCharset=GB2312&notice=0&platform=jqspaframe.json&needNewCode=0&cid=205360772&reqtype=2&biztype=1&topid=219004455&cmd=8&needmusiccrit=0&pagenum=1&pagesize=25&lasthotcommentid=song_219004455_3394972532_1543030743&domain=qq.com&ct=6&cv=50600"

爬虫代码:

#!/usr/bin/env python
# -*- coding:utf-8 -*-

# Author : zhibo.wang
# E-mail : d_1206@qq.com
# Date   : 18/11/25 23:39:11
# Desc   : qq音乐 林俊杰-雪落下的声音 评论


import time
import json
import ranom
import pymongo
import requests


config = {
    'HOST': '127.0.0.1',
    'PORT': 27017,
    'DB': 'wangzhibo',
}

def mongo_con_keepalive(confing=config):
    conn = pymongo.MongoClient(confing['HOST'], confing['PORT'])
    conn = conn[confing['DB']]
    if confing.get('USER'):
        conn.authenticate(confing['USER'], confing['PASSWORD'])
    return conn


class Crawl():
    start_url = "https://c.y.qq.com/base/fcgi-bin/fcg_global_comment_h5.fcg?g_tk=798799166&loginUin=1152921504630904742&hostUin=0&format=json&inCharset=GB2312&outCharset=GB2312&notice=0&platform=jqspaframe.json&needNewCode=0&cid=205360772&reqtype=2&biztype=1&topid=219004455&cmd=8&needmusiccrit=0&pagenum=1&pagesize=25&domain=qq.com&ct=6&cv=50600"
    time_out = 10

    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/605.1.15 (KHTML, like Gecko) patch/0 QQMusic/5.6.0 Released[1]",
        "Referer": "https://y.qq.com/musicmac/v4/song/detail.html?songid=219004455&songtype=13",
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Host": "c.y.qq.com",
        "Origin": "https://y.qq.com",
    }
    insert_table = "qq_music_comment"
    proxyMeta = "http://xxxx:xxxxx@proxy.abuyun.com:9020" # 阿布云代理
    proxies = {
                "http": proxyMeta,
                "https": proxyMeta,
              }
    is_proxy = True
    if is_proxy:
        wait_time = [0.25, 0.26, 0.27]
    else:
        wait_time = [1, 1.1, 1.2, 1.3]     # 间隔时间

    def __init__(self):
        self.db = mongo_con_keepalive()

    def req(self, url):
        soup = None
        try:
            if self.is_proxy:
                r = requests.get(url, headers=self.headers, timeout=self.time_out, proxies=self.proxies)
            else:
                r = requests.get(url, headers=self.headers, timeout=self.time_out)
            if r.status_code == 200:
                soup = r.json()
        except Exception as e:
            print("req error: ", e)
        return soup

    def create_pages(self, soup):
        pages = None
        try:
            count = soup.get("comment").get("commenttotal")
            pages = list(range(2, len(list(range(0, count, 25))) +1 ))
        except:
            pass
        return pages

    def get_time_stamp(self):
        # 生成时间戳
        return str(int(time.time()))

    def create_lasthotcommentid(self):
        # return "&lasthotcommentid=song_219004455_3394972532_{0}".format(self.get_time_stamp())
        return ""

    def run(self):
        index_url = "{0}{1}".format(
            self.start_url,
            self.create_lasthotcommentid()
        )
        data_index = self.req(index_url)
        if data_index:
            if data_index.get("code") == 0:
                end_data_index = data_index.get("comment").get("commentlist")
                self.db.get_collection(self.insert_table).insert_many(end_data_index)
                pages = self.create_pages(data_index)
                if pages:
                    for page in pages:
                        url_ = "{0}&pagenum={1}".format(self.start_url.replace("&pagenum=1", ""), page)
                        url = "{0}{1}".format(url_, self.create_lasthotcommentid())

                        print(url)
                        data = self.req(url)
                        if data:
                            if data.get("code") == 0:
                                end_data = data.get("comment").get("commentlist")
                                self.db.get_collection(self.insert_table).insert_many(end_data)
                        time.sleep(random.choice(self.wait_time))


if __name__ == "__main__":
    C = Crawl()
    C.run()

数据样例

{
    "_id" : ObjectId("5bfad01b19dd9f457f126c7a"),
    "avatarurl" : "http://thirdqq.qlogo.cn/g?b=sdk&k=jsufRtCrVfrD4RSeXgAib6Q&s=140&t=1541948704",
    "commentid" : "song_219004455_1943375732_1541849025",
    "commit_state" : 2,
    "enable_delete" : 0,
    "identity_pic" : "",
    "identity_type" : 0,
    "is_hot" : 1,
    "is_hot_cmt" : 0,
    "is_medal" : 0,
    "is_stick" : 0,
    "ispraise" : 0,
    "middlecommentcontent" : null,
    "nick" : "聪哓",
    "permission" : 15,
    "praisenum" : 4,
    "root_enable_delete" : 0,
    "root_identity_pic" : "",
    "root_identity_type" : 0,
    "root_is_stick" : 0,
    "rootcommentcontent" : "喜欢JJ已经五年了，无论在哪儿听到他的歌，我即使我不会唱我也能听出来是JJ的声音，因为JJ的声音给我一种特别的感觉，，，，，，永远支持你哦，JJ",
    "rootcommentid" : "song_219004455_1943375732_1541849025",
    "rootcommentnick" : "@聪哓",
    "rootcommentuin" : "1943375732",
    "score" : 0,
    "taoge_topic" : "",
    "taoge_url" : "",
    "time" : 1541849025,
    "uin" : "1943375732",
    "user_type" : "",
    "vipicon" : ""
}

制作词云图

#!/usr/bin/env python
# -*- coding:utf-8 -*-

# Author : zhibo.wang
# E-mail : d_1206@qq.com
# Date   : 18/11/26 00:53:22
# Desc   :

import re
import os
import jieba
import fool
import codecs
import pymongo
from scipy.misc import imread
from wordcloud import WordCloud
from matplotlib import pyplot as plt

config = {
    'HOST': '127.0.0.1',
    'PORT': 27017,
    'DB': 'wangzhibo',
}

def mongo_con_keepalive(confing=config):
    conn = pymongo.MongoClient(confing['HOST'], confing['PORT'])
    conn = conn[confing['DB']]
    if confing.get('USER'):
        conn.authenticate(confing['USER'], confing['PASSWORD'])
    return conn

"""
emoji_pattern = re.compile(
    u"(\ud83d[\ude00-\ude4f])|"  # emoticons
    u"(\ud83c[\udf00-\uffff])|"  # symbols & pictographs (1 of 2)
    u"(\ud83d[\u0000-\uddff])|"  # symbols & pictographs (2 of 2)
    u"(\ud83d[\ude80-\udeff])|"  # transport & map symbols
    u"(\ud83c[\udde0-\uddff])"  # flags (iOS)
    "+", flags=re.UNICODE)
"""
emoji_pattern = re.compile(
    '[a-zA-Z0-9’!"#$%&\'()*+,-./:;<=>?@，。?★、…【】《》？“”‘’！[\\]^_`{|}~]+')

def remove_emoji(text):
    return emoji_pattern.sub(r'', text)

def draw_wordcloud(comment_text, fenci):
    comment_text = remove_emoji(comment_text)
    if fenci == "jieba":
        cut_text = " ".join(jieba.cut(comment_text))
    else:
        dd = fool.cut(comment_text)
        cut_text = " ".join(fool.cut(comment_text)[0])
    color_mask = imread('/Users/work/Downloads/e0f057b7a1a61de962d89347b6d7201f-d4o1tzm.jpg')
    font = r'/Users/work/Downloads/simfang.ttf'
    stopwords = open("stopworld.txt").read().split("\n")
    cloud = WordCloud(
        font_path=font,
        background_color='white',
        max_words=20000,
        max_font_size=400,
        min_font_size=10,
        mask=color_mask,
        stopwords=stopwords,
    )

    word_cloud = cloud.generate(cut_text)
    word_cloud.to_file('{0}.jpg'.format(fenci))

def run():
    fenci_list = ["jieba", "fool"]
    db = mongo_con_keepalive()
    datas = db.get_collection("qq_music_comment").find({})
    print("count: ", datas.count())
    comment_text = "".join([i.get("rootcommentcontent").strip() for i in datas if i.get("rootcommentcontent")])
    for fenci in fenci_list:
        print(fenci)
        draw_wordcloud(comment_text, fenci)


if __name__ == "__main__":
    run()

使用两种分词 jieba、fool(https://github.com/rockyzhengwu/FoolNLTK)

从最终的词云结果上来看，这首歌曲还是值得一听。

github: https://github.com/wang-zhibo/qq_music

posted @ 2018-11-25 23:56 🐳.城南阅读(864) 评论(0) 编辑收藏举报

指间灵动，快码加编

刷新页面返回顶部

🦉. 城南

抓取qq音乐评论 (林俊杰-雪落下的声音 13061条) 制作词云图，是否值得一听

公告