Python——爬虫（一定要看下）

#!/usr/bin/env python3.5
# -*- coding: utf-8 -*-
# @Time   : 2018/1/26
# @Author : Lyrichu
# @Email  : 919987476@qq.com
# @File   : NetCloudAnalyse.py
'''
@Description:
Simple analysis of NetCloud music data, including song comments, users info etc.
pyecharts is used to visualize the results.
'''
import json
import os
import re
import time
from collections import Counter
from operator import itemgetter
from threading import Lock, Thread

import jieba
import pandas as pd
import requests
from pyecharts import Bar,Geo
from scipy.misc import imread
from wordcloud import WordCloud

try:
    from NetCloudCrawler import NetCloudCrawl
except ImportError:
    from .NetCloudCrawler import NetCloudCrawl

class NetCloudAnalyse(NetCloudCrawl):
    """
    analyse for NetCloud comments of songs,user info etc. 
    """
    def __init__(self,song_name,singer_name,song_id = 1,singer_id = 1):
        '''
        initialise the NetCloudCrawl base class and the analyser's own state
        :param song_name: passed on to NetCloudCrawl.__init__
        :param singer_name: passed on to NetCloudCrawl.__init__
        :param song_id: passed on to NetCloudCrawl.__init__
        :param singer_id: passed on to NetCloudCrawl.__init__
        '''
        super().__init__(
            song_name=song_name,
            song_id=song_id,
            singer_name=singer_name,
            singer_id=singer_id,
        )
        # blank str stored for every piece of user info that could not be found
        self.unknown = ""
        # progress counter shared by the threads that run save_users_info
        self.threading_count = 0

    def load_comments_csv(self):
        '''
        load crawler comments csv file
        '''
        comments_df = pd.read_csv(self.comments_file_path,engine = 'python',encoding = 'utf-8') # read csv file as dataframe
        return comments_df

    # header line of the users info csv, written by both the sequential and the threaded crawler
    _USERS_INFO_HEADER = "用户ID,抓取时间,动态总数,关注人数,粉丝人数,用户所在地区,用户简介,年龄,累计听歌数量\n"
    # seconds to wait for one user page; without a timeout a stalled connection blocks its thread forever
    _USER_PAGE_TIMEOUT = 30
    # serialises updates of self.threading_count and appends to the users info file between threads
    _USERS_INFO_LOCK = Lock()
    # the patterns are compiled once for the class instead of once per crawled user
    _USER_ID_PATTERN = re.compile(r'.*id=(\d+)')
    _EVENT_COUNT_PATTERN = re.compile(r'<strong id="event_count">(\d+?)</strong>')
    _FOLLOW_COUNT_PATTERN = re.compile(r'<strong id="follow_count">(\d+?)</strong>')
    _FAN_COUNT_PATTERN = re.compile(r'<strong id="fan_count">(\d+?)</strong>')
    _LOCATION_PATTERN = re.compile(r'<span>所在地区：(.+?)</span>')
    _DESCRIPTION_PATTERN = re.compile(r'<div class="inf s-fc3 f-brk">个人介绍：(.*?)</div>')
    _AGE_PATTERN = re.compile(r'<span.*?data-age="(\d+)">')
    _LISTENING_SONGS_PATTERN = re.compile(r'<h4>累积听歌(\d+?)首</h4>')

    def _first_group(self,pattern,text):
        '''
        search text for a compiled pattern
        :param pattern: compiled regular expression with one capture group
        :param text: the text to search
        :return: the captured group of the first match,or self.unknown without a match
        '''
        match = pattern.search(text)
        return match.group(1) if match else self.unknown

    def _crawl_user_info_line(self,user_url):
        '''
        download one user's home page and build the csv line for this user
        :param user_url: the user's home page url,it has to contain "id=<digits>"
        :return: one csv line ending with a newline,columns ordered like _USERS_INFO_HEADER
        :raises ValueError: if no user id can be read from user_url
        errors raised by requests.get (timeouts included) are passed on to the caller
        '''
        id_match = self._USER_ID_PATTERN.search(user_url)
        if id_match is None:
            raise ValueError("no user id found in url {url}".format(url = user_url))
        user_id = id_match.group(1)
        # time to crawl this info
        crawler_time = self.from_timestamp_to_date(time_stamp = time.time())
        html = requests.get(user_url,headers = self.headers,timeout = self._USER_PAGE_TIMEOUT).text
        event_count = self._first_group(self._EVENT_COUNT_PATTERN,html) # personal events count
        follow_count = self._first_group(self._FOLLOW_COUNT_PATTERN,html) # how many people the user follows
        fan_count = self._first_group(self._FAN_COUNT_PATTERN,html) # how many fans the user has
        location = self._first_group(self._LOCATION_PATTERN,html) # the location the user is in
        # the file is comma separated,so commas inside the free text are replaced by blanks
        description = self._first_group(self._DESCRIPTION_PATTERN,html).replace(","," ")
        age_match = self._AGE_PATTERN.search(html)
        if age_match:
            # data-age is handled as a millisecond timestamp (presumably the birthday - TODO confirm):
            # its whole 365-day years since 1970 are subtracted from the years passed since 1970
            current_year = int(self.from_timestamp_to_date(time_stamp = time.time(),format = "%Y"))
            age = (current_year-1970) - int(age_match.group(1))//(1000*365*24*3600)
        else:
            age = self.unknown
        listening_songs_num = self._first_group(self._LISTENING_SONGS_PATTERN,html) # total listening songs count
        return "{user_id},{crawler_time},{event_count},{follow_count},{fan_count},{location},{description},{age},{listening_songs_num}\n".format(
                    user_id = user_id,crawler_time = crawler_time,event_count = event_count,
                    follow_count = follow_count,fan_count = fan_count,location = location,
                    description = description,age = age,listening_songs_num = listening_songs_num
                    )

    def save_users_info_to_file(self):
        '''
        crawl the home page of every comment user one after another and write
        the users info csv (self.users_info_file_path),an existing file is replaced
        a user whose page cannot be fetched or parsed is reported and skipped
        '''
        with open(self.users_info_file_path,"w",encoding = "utf-8") as fout:
            fout.write(self._USERS_INFO_HEADER)
            users_url = self.load_users_url()
            num = len(users_url)
            # iterate the users url list
            for index,user_url in enumerate(users_url,1):
                try:
                    fout.write(self._crawl_user_info_line(user_url))
                    print("Write {current}/{total} user info to file successfully!".format(current = index,total = num))
                except Exception as e:
                    # best effort: one broken user page must not stop the whole crawl
                    print("Fail to get No.{index} comment user's info:{error}"
                          .format(index = index,error = e))

    def threading_save_users_info_to_file(self,threads = 10):
        '''
        using multithreads to save users info to file,an existing file is replaced
        :param threads: the wanted threads count,the count really used is at least 1
                        and never bigger than the number of users urls
        '''
        start_time = time.time()
        with open(self.users_info_file_path,"w",encoding = "utf-8") as fout:
            fout.write(self._USERS_INFO_HEADER)
        users_url = self.load_users_url()
        num = len(users_url)
        # threads <= 0 used to end in a division by zero,threads > num only started idle threads
        threads = max(1,min(threads,num))
        # start the progress counter again,so that a second run does not go on counting from the first
        self.threading_count = 0
        # thread i processes the urls i,i+threads,i+2*threads...,so the loads differ by one url at most
        threads_list = [Thread(target = self.save_users_info,args = (users_url[i::threads],num))
                        for i in range(threads)]
        for t in threads_list:
            t.start()
        for t in threads_list:
            t.join()
        end_time = time.time()
        print("Using {threads} threads to save users info done,costs {cost_time} seconds"
                .format(threads = threads,cost_time = (end_time - start_time)))

    def save_users_info(self,users_url,total):
        '''
        add users info to file,this function will be called in threadings
        the lines are collected first and appended to the file in one go at the end
        :param users_url: the processing users url list
        :param total: total users url count,only used for the progress messages
        '''
        users_info_list = []
        for user_url in users_url:
            try:
                users_info_list.append(self._crawl_user_info_line(user_url))
                failure = None
            except Exception as e:
                # best effort: one broken user page must not stop this thread
                failure = e
            # the counter is shared by all threads,"+=" on it is not atomic
            with self._USERS_INFO_LOCK:
                self.threading_count += 1
                current = self.threading_count
            if failure is None:
                print("Get {current}/{total} user info to file successfully!".format(current = current,total = total))
            else:
                print("Fail to get No.{index} comment user's info:{error}"
                      .format(index = current,error = failure))
        # only one thread at a time appends,so lines of different threads cannot get mixed up
        with self._USERS_INFO_LOCK:
            # note that we use add mode
            with open(self.users_info_file_path,"a",encoding = "utf-8") as fout:
                fout.writelines(users_info_list)




    def count_comments_lines(self):
        '''
        count total comments lines
        '''
        with open(self.comments_file_path,"r",encoding = "utf-8") as fin:
            for total,_ in enumerate(fin,1):
                pass
        return total

    
    def from_timestamp_to_date(self,time_stamp,format = "%Y-%m-%d %H:%M:%S"):
        '''
        convert from timestamp to real date formatted in Year-Month-Day etc. 
        :param time_stamp: the time stamp
        :param format: the date format we want to convert
        '''
        real_date = time.strftime(format,time.localtime(time_stamp))
        return real_date

    def load_users_url(self):
        '''
        return all users domain page ulr list
        '''
        comments_df = self.load_comments_csv()
        users_id = comments_df['用户ID'].dropna() # user id
        ids_num = len(users_id) # all ids num
        # users id must be integers like string
        users_id = [users_id.iloc[i] for i in range(ids_num) if re.match(r'\d+',str(users_id.iloc[i]))]
        users_url = []
        for user_id in users_id:
            users_url.append('http://music.163.com/user/home?id={user_id}'.format(user_id = user_id))
        return list(set(users_url)) # remove the same user's ulr

                    
    def load_users_info_csv(self):
        '''
        load users info from file,
        return users info dataframe
        '''
        users_info_df = pd.read_csv(self.users_info_file_path,engine = 'python',encoding = 'utf-8')
        return users_info_df


    def draw_wordcloud(self,full_comments = True,background_path = "source/JayChou.jpg",font_path = "source/simsun.ttc"):
        '''
        draw wordcloud of full comments of one song or hot comments of a singer,
        the picture is saved as <song_name>.jpg in self.song_path (full comments)
        or as <singer_name>.jpg in self.singer_path (hot comments)
        :param full_comments: True means full comments,False means hot comments
        :param background_path: background image path,joined to the folder of this module
        :param font_path: font path,joined to the folder of this module
        '''
        abs_path = os.path.split(os.path.realpath(__file__))[0]
        background_path = os.path.join(abs_path,background_path)
        font_path = os.path.join(abs_path,font_path)
        if full_comments:
            file_path = self.comments_file_path
            save_path = os.path.join(self.song_path,self.song_name+".jpg")
        else:
            file_path = os.path.join(self.singer_path,"hot_comments.csv")
            save_path = os.path.join(self.singer_path,self.singer_name+".jpg")
        comments = pd.read_csv(file_path,engine = 'python',encoding = 'utf-8')["评论内容"]
        # missing comments are dropped,otherwise each of them enters the cloud as the word "nan";
        # join builds the text in one pass instead of growing a str comment by comment
        comments_text = "".join(str(comment) for comment in comments.dropna())
        cut_text = " ".join(jieba.cut(comments_text)) # use blank space to paste cut keywords to str
        color_mask = imread(background_path) # read the background image
        cloud = WordCloud(font_path=font_path,background_color='white',mask=color_mask,max_words=2000,max_font_size=40)
        word_cloud = cloud.generate(cut_text) # generate the word cloud
        word_cloud.to_file(save_path)
        print("Successfully generate {save_path}".format(save_path =save_path))

    def _render_count_bar(self,counts,title,series_name,save_path,by_count = False):
        '''
        render one pyecharts bar chart from a value -> count mapping
        :param counts: mapping (a Counter or dict) from x value to bar height
        :param title: the chart title
        :param series_name: the name of the data series
        :param save_path: the html file the chart is rendered to
        :param by_count: False orders the bars by x value ascending,True by count descending
        :return: True if the chart was rendered,False if counts was empty
        '''
        if not counts:
            # without items zip(*...) yields nothing to unpack into x and y
            print("No data for {save_path},skip this plot".format(save_path = save_path))
            return False
        if by_count:
            items = sorted(counts.items(),key = itemgetter(1),reverse = True)
        else:
            items = sorted(counts.items(),key = itemgetter(0))
        x_values,y_values = zip(*items)
        bar = Bar(title = title)
        bar.add(series_name,x_values,y_values)
        bar.render(save_path)
        return True

    def core_visual_analyse(self):
        '''
        core visual analyse for comments and users info,including:
        1. The distribution of comments time,both for months,days(bar to show)
        2. The distribution of comments agree count(bar to show)
        3. The distribution of comment keywords,excluded stopwords(bar to show)
        4. The distribution of users location,using geo to show(geo to show)
        5. The distribution of users location,using bar to show(bar to show)
        6. The distribution of events count(bar to show)
        7. The distribution of follow people count(bar to show)
        8. The distribution of fans count(bar to show)
        9. The distribution of description keywords(excluded stopwords)(bar to show)
        10. The distribution of users age(bar to show)
        11. The distribution of listening songs total count(bar to show)
        every plot is rendered to a html file in the folder "plots" of self.song_path,
        a plot without any data is skipped with a message
        '''
        plot_save_path = os.path.join(self.song_path,"plots")
        # makedirs creates missing parent folders too and accepts an existing folder
        os.makedirs(plot_save_path,exist_ok = True)
        comments_df = self.load_comments_csv()
        users_info_df = self.load_users_info_csv()
        # every chart title starts with the song name
        song = "歌曲<{song_name}>".format(song_name = self.song_name)

        def plot_path(file_name):
            # full path of one plot file
            return os.path.join(plot_save_path,file_name)

        def column_counts(df,column):
            # how often every non missing value of the column appears
            return Counter(list(df[column].dropna()))

        # 1. The distribution of comments time,both for months and for days(bar to show)
        year_month_counts = Counter()
        year_month_day_counts = Counter()
        for comment_time in list(comments_df['评论时间'].dropna()):
            # note that the timestamp should divide by 1000 first
            seconds = comment_time*0.001
            year_month_counts[self.from_timestamp_to_date(seconds,format = "%Y-%m")] += 1
            year_month_day_counts[self.from_timestamp_to_date(seconds,format = "%Y-%m-%d")] += 1
        self._render_count_bar(year_month_counts,song + "评论时间(年-月)数量分布","年-月",
                               plot_path("comments_year_month_bar.html"))
        self._render_count_bar(year_month_day_counts,song + "评论时间(年-月-日)数量分布","年-月-日",
                               plot_path("comments_year_month_day_bar.html"))
        # 2. The distribution of comments agree count(bar to show)
        self._render_count_bar(column_counts(comments_df,'点赞总数'),song + "评论点赞数量分布","点赞数量",
                               plot_path("agree_count_bar.html"))
        # 3. The distribution of comment keywords,excluded stopwords(bar to show)
        # a set makes the stopword test of every keyword O(1)
        stopwords = set(self.load_stopwords())

        def keyword_counts(texts):
            # count the jieba keywords of the joined texts,
            # stopwords and words that length less than 2 are removed
            text = "".join(str(t) for t in texts) # str() guards against non text cells
            return Counter(keyword for keyword in jieba.cut(text)
                           if len(keyword) > 1 and keyword not in stopwords)

        self._render_count_bar(keyword_counts(comments_df['评论内容'].dropna()),
                               song + "评论关键词数量分布(已去除停用词)","关键词",
                               plot_path("comments_keywords_bar.html"),by_count = True)
        # 4. The distribution of users location,using geo to show(geo to show)
        users_location = list(users_info_df['用户所在地区'].dropna())
        users_city = [] # city users in
        all_cities = self.load_all_cities()
        for location in users_location:
            for city in all_cities:
                if city in location:
                    users_city.append(city.replace("市",""))
        users_city_data = list(Counter(users_city).items())
        users_city_save_path = plot_path("users_city_geo.html")
        if users_city_data:
            users_city_geo = Geo("歌曲<{song_name}>评论用户所在地区分布".format(song_name = self.song_name),title_color="#fff", title_pos="left",
                                    width=1200, height=600, background_color='#404a59')
            attr, value = users_city_geo.cast(users_city_data)
            users_city_geo.add("", attr, value, visual_range=[0, 200], visual_text_color="#fff", symbol_size=15, is_visualmap=True)
            users_city_geo.render(users_city_save_path)
        else:
            print("No data for {save_path},skip this plot".format(save_path = users_city_save_path))
        # 5. The distribution of users location,using bar to show(bar to show)
        self._render_count_bar(Counter(users_location),song + "评论用户所在地区分布","用户所在地区",
                               plot_path("users_location_bar.html"),by_count = True)
        # 6. The distribution of events count(bar to show)
        self._render_count_bar(column_counts(users_info_df,'动态总数'),song + "评论用户动态总数分布","用户动态总数",
                               plot_path("events_count_bar.html"))
        # 7. The distribution of follow people count(bar to show)
        self._render_count_bar(column_counts(users_info_df,'关注人数'),song + "评论用户关注人数分布","用户关注人数",
                               plot_path("follow_count_bar.html"))
        # 8. The distribution of fans count(bar to show)
        self._render_count_bar(column_counts(users_info_df,'粉丝人数'),song + "评论用户粉丝人数分布","用户粉丝人数",
                               plot_path("fans_count_bar.html"))
        # 9. The distribution of description keywords(excluded stopwords)(bar to show)
        self._render_count_bar(keyword_counts(users_info_df['用户简介'].dropna()),
                               song + "评论用户简介关键词数量分布(已去除停用词)","用户简介关键词",
                               plot_path("description_keywords_bar.html"),by_count = True)
        # 10. The distribution of users age(bar to show)
        legal_ages = [age for age in list(users_info_df['年龄'].dropna()) if age >= 0] # filter legal age
        self._render_count_bar(Counter(legal_ages),song + "评论用户年龄分布","年龄",
                               plot_path("age_count_bar.html"))
        # 11. The distribution of listening songs total count(bar to show)
        listening_songs = {'0-100':0,'100-1000':0,'1000-10000':0,'>10000':0}
        for c in list(users_info_df['累计听歌数量'].dropna()):
            if c < 100:
                listening_songs['0-100'] += 1
            elif c < 1000:
                listening_songs['100-1000'] += 1
            elif c < 10000:
                listening_songs['1000-10000'] += 1
            else:
                listening_songs['>10000'] += 1
        self._render_count_bar(listening_songs,song + "评论用户听歌总数分布","听歌总数",
                               plot_path("listening_songs_count_bar.html"),by_count = True)




    def load_stopwords(self):
        '''
        load stopwords list
        '''
        abs_path = os.path.split(os.path.realpath(__file__))[0]
        stopwords_path = os.path.join(abs_path,"source","stopwords.txt")
        with open(stopwords_path,"r",encoding = "utf-8") as f:
            stopwords = f.readlines()
        stopwords = [word.strip() for word in stopwords]
        return list(set(stopwords))

    def load_all_cities(self):
        '''
        load all cities from province_cities.json file,
        to match city from location text
        '''
        abs_path = os.path.split(os.path.realpath(__file__))[0]
        province_cities_file = os.path.join(abs_path,"source","province_cities.json")
        all_cities = []
        with open(province_cities_file,"r",encoding = "utf-8") as fin:
            content = fin.read()
            d = json.loads(content)
            for province in d:
                for city in province['city']:
                    all_cities.append(city['name'])
        return all_cities

    def generate_all_analyse_files(self,threads = 10):
        '''
        run the whole analyse in this order:
        1. crawl the users info file with threads
        2. draw the wordcloud picture
        3. render the core analyse plots
        :param threads: the threads count handed to threading_save_users_info_to_file
        '''
        steps = (
            lambda: self.threading_save_users_info_to_file(threads),
            self.draw_wordcloud,
            self.core_visual_analyse,
        )
        for step in steps:
            step()

    def _test_load_all_cities(self):
        all_cities = self.load_all_cities()
        print("There are %d cities." % len(all_cities))
        print(all_cities)

    def _test_load_stopwords(self):
        stopwords = self.load_stopwords()
        print('There are %d stopwords.' % len(stopwords))
        # print first 100 stopwords
        print(stopwords[:100])
        

    def _test_load_comments_csv(self):
        df = self.load_comments_csv()
        print(df.head)

    def _test_count_comments_lines(self):
        total = self.count_comments_lines()
        print("{file} has {total} comments.".format(file = self.comments_file_path,total = total))

    def _test_from_timestamp_to_date(self):
        comments_df = self.load_comments_csv()
        comments_timestamp = comments_df['评论时间'].dropna() # drop na value
        show_num = 10 # lines to show
        print(self.song_name)
        print("timestamp           real_date")
        for i in range(show_num):
            time_stamp = comments_timestamp.iloc[i]
            if time_stamp:
                real_date = self.from_timestamp_to_date(time_stamp)
                print("%s       %s" %(time_stamp,real_date))

    def _test_load_users_url(self):
        users_url = self.load_users_url()
        print("There are %d users ulr." % len(users_url))
        num = 10
        print("Top %d users ulr are:" % num)
        for i in range(num):
            print("{index}:{url}".format(index = i+1,url = users_url[i]))

    def _test_load_users_info_csv(self):
        users_info_df = self.load_users_info_csv()
        print(users_info_df.head())

    def _test_save_users_info_to_file(self):
        '''
        run the sequential users info crawler save_users_info_to_file
        '''
        self.save_users_info_to_file()

    def _test_draw_wordcloud(self):
        '''
        draw the wordcloud of the hot comments (full_comments = False)
        '''
        self.draw_wordcloud(full_comments=False)

    def _test_core_visual_analyse(self):
        '''
        run core_visual_analyse to render all plots
        '''
        self.core_visual_analyse()

    def _test_threading_save_users_info_to_file(self,threads = 10):
        '''
        run the threaded users info crawler
        :param threads: the threads count handed to threading_save_users_info_to_file
        '''
        self.threading_save_users_info_to_file(threads=threads)

    def _test_netcloudanalyse_all(self):
        '''
        run all the _test_ functions of this class one after another,
        the threaded crawler is run with 20 threads
        '''
        checks = (
            self._test_save_users_info_to_file,
            lambda: self._test_threading_save_users_info_to_file(20),
            self._test_load_comments_csv,
            self._test_count_comments_lines,
            self._test_from_timestamp_to_date,
            self._test_load_users_url,
            self._test_load_users_info_csv,
            self._test_draw_wordcloud,
            self._test_core_visual_analyse,
            self._test_load_stopwords,
            self._test_load_all_cities,
        )
        for check in checks:
            check()
        

# if __name__ == '__main__':
#     song_name = '晴天'
#     song_id = 186016
#     singer_name = '周杰伦'
#     singer_id = 6452
#     netcloud_analyse = NetCloudAnalyse(song_name = song_name,song_id = song_id,singer_name = singer_name,
#                                         singer_id = singer_id)
#     #netcloud_analyse._test_netcloudanalyse_all()
#     netcloud_analyse.generate_all_analyse_files(100)
posted @ 2018-04-27 17:57 想54256 阅读(313) 评论(0) 收藏举报
刷新页面返回顶部
Python——爬虫（一定要看下）

公告