【Python Web Scraping Course Project】Douyin Winter Solstice — Data Analysis and Visualization
Data analysis and visualization of the comment section under Douyin's Winter Solstice videos
I. Topic Background
By scraping Douyin comment sections, we can analyze how users express sentiment and emotion toward specific topics, videos, or events, and so learn about their preferences, interests, and emotional leanings. Analyzing comment content also surfaces the discussion hotspots around a given topic, revealing which social topics are currently trending and drawing the most attention. The comments scraped from popular videos under Douyin's December 22 trending topic "Winter Solstice" (冬至) can support sentiment analysis, hot-topic mining, user-behavior analysis, and the collection of user feedback.
II. Themed Web Crawler Design Plan
Data source: douyin.com/video/7314984745174043958
1. Name: data analysis and visualization of the Douyin Winter Solstice comment section
2. Data to scrape: the text of each comment, the commenting user, the number of likes on the comment, and the number of replies to the comment
3. Design overview: parse the scraped data and save it as a spreadsheet (.xls file) with the xlwt and xlutils libraries; then load the local .xls file with pandas and numpy and visualize it with pyecharts, presenting the analysis results at the end.
Technical difficulty: the Douyin comment section holds a large amount of data, which has to be fetched page by page and processed accurately before it can be analyzed reliably.
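The crawler below drives this paging with the endpoint's `cursor`/`count` parameters. As a minimal sketch of the idea (assuming, as an unverified guess about the response shape, that the comment-list endpoint returns a `has_more` flag next to its `comments` array), the paging could also be driven by the server instead of a fixed page count:

import time
import requests

def fetch_all_comments(aweme_id, headers, page_size=20):
    # Sketch: page through the comment-list endpoint until the server
    # reports no more data. `has_more` is an assumed response field;
    # the project code below simply loops over a fixed 8 pages instead.
    url = 'https://www.douyin.com/aweme/v1/web/comment/list/'
    cursor = 0
    comments = []
    while True:
        params = {'aweme_id': aweme_id, 'cursor': cursor, 'count': page_size}
        resp = requests.get(url, headers=headers, params=params).json()
        comments.extend(resp.get('comments') or [])
        if not resp.get('has_more'):  # assumed flag: stop when exhausted
            break
        cursor += page_size
        time.sleep(1)  # pause between pages to stay polite
    return comments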
III. Themed Page Structure Analysis
Page structure
1. The search bar sits at the top of the page.
2. The comment section sits below the video.
Parsing the page structure
Node (tag) lookup and traversal: the comment data is returned by an API as JSON rather than as HTML tags, so fields are looked up by dictionary key.
The comment list is then traversed with a for loop.
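A minimal sketch of that traversal (field names taken from the crawler in the next section; the `.get()` fallbacks are a defensive addition of our own):

def walk_comments(response):
    # Sketch: iterate one page of the parsed comment-list JSON and pull
    # out the fields the crawler saves.
    for comment in response.get('comments', []):
        yield {
            'text': comment.get('text', ''),                      # comment body
            'user': comment.get('user', {}).get('nickname', ''),  # commenting user
            'likes': comment.get('digg_count', 0),                # like count
            'replies': comment.get('reply_comment_total', 0),     # reply count
        }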
IV. Web Crawler Design
The scraped and processed data
Code implementation
import os
import time
import requests
import xlrd
import xlwt
from threading import Thread
from xlutils.copy import copy


class DouYin(object):
    def __init__(self):
        self.num = 0

    # Start the crawler thread
    def running(self):
        thread = Thread(target=self.spider_one)
        thread.start()

    # Crawler: the comment-list page
    def spider_one(self):
        for page in range(0, 8):
            video_url = 'https://www.douyin.com/video/7314984745174043958?modeFrom='
            video_id = video_url.split('/')[-1].split('?')[0]
            start_url = 'https://www.douyin.com/aweme/v1/web/comment/list/?'
            params = {
                'aweme_id': video_id,
                'cursor': f'{page * 20}',
                'count': '20',
            }
            headers = {
                'authority': 'www.douyin.com',
                'accept': 'application/json, text/plain, */*',
                'accept-language': 'zh-CN,zh;q=0.9',
                'cache-control': 'no-cache',
                'cookie': 's_v_web_id=verify_lmbbbkv5_aCF9bcOU_iwX1_4cU5_87uR_xtq8jZjvueDj; ttwid=1%7Cfd0IdCTKJx7zxMF5AgA1sQd-Bj5hyb-6oHaAoX9qKsw%7C1694220811%7Cef08e075a9f6de70de95ef1a65cf6374ab666257462099cd72e1bbb32afc6fa6; passport_csrf_token=02e2c01f6bb465e74eaa8671b4ef8d0f; passport_csrf_token_default=02e2c01f6bb465e74eaa8671b4ef8d0f; store-region-src=uid; __security_server_data_status=1; d_ticket=e42339701e6a0fc30f4ff8ddde9322674f6f2; MONITOR_WEB_ID=722dfd91-c8e6-4771-b4b3-28a6ef6f385e; my_rd=2; download_guide=%223%2F20231021%2F1%22; n_mh=vUM_WnuOgtpRnm2JPxsFs5XiKEkAiKFn8p3YdU8tQG0; LOGIN_STATUS=1; store-region=cn-hi; _bd_ticket_crypt_doamin=2; sso_uid_tt=50ba2faf8e81d2f7bb0d959dbdd111ef; sso_uid_tt_ss=50ba2faf8e81d2f7bb0d959dbdd111ef; toutiao_sso_user=4b03848e217e961dc909414f1e91f730; toutiao_sso_user_ss=4b03848e217e961dc909414f1e91f730; passport_auth_status=29f59ae890c6f35c8cd181ba8957dd5c%2C918ba999d2841f1b47f867ef7dcdd497; passport_auth_status_ss=29f59ae890c6f35c8cd181ba8957dd5c%2C918ba999d2841f1b47f867ef7dcdd497; uid_tt=fee35ed7c59f6da6f4b8c34a9c5a6707; uid_tt_ss=fee35ed7c59f6da6f4b8c34a9c5a6707; sid_tt=7a29fd511d8764aeb4bd04e0a26b8070; sessionid=7a29fd511d8764aeb4bd04e0a26b8070; sessionid_ss=7a29fd511d8764aeb4bd04e0a26b8070; passport_assist_user=Cj0-T0ssCwbr5bIclpuA6tKVafWCz8at7vXKkSrT-IQaohdhmtwthMqEyQSxpOIYOhGCy_R2uGBUhNV_peT5GkoKPNi3REB7dFp_dOAeJfVq8In93-iPlZQujMqq1-UcXGtwCy7q-8dp2sXBitCT7qgN6uCsTYoSRZSE8lbFSBCtqb8NGImv1lQgASIBA2vyoYI%3D; sid_ucp_sso_v1=1.0.0-KDBiMDBlNWFkZTE0Njc2YzE3N2NhYTJkNzcwMjFkNWQ1NGE3MjYxNjgKHQjH7MSHjQMQuOXXqQYY7zEgDDDA6qnfBTgGQPQHGgJsZiIgNGIwMzg0OGUyMTdlOTYxZGM5MDk0MTRmMWU5MWY3MzA; ssid_ucp_sso_v1=1.0.0-KDBiMDBlNWFkZTE0Njc2YzE3N2NhYTJkNzcwMjFkNWQ1NGE3MjYxNjgKHQjH7MSHjQMQuOXXqQYY7zEgDDDA6qnfBTgGQPQHGgJsZiIgNGIwMzg0OGUyMTdlOTYxZGM5MDk0MTRmMWU5MWY3MzA; _bd_ticket_crypt_cookie=46313640559d8d76cc27ec25e77cd71f; sid_guard=7a29fd511d8764aeb4bd04e0a26b8070%7C1698034363%7C5183997%7CFri%2C+22-Dec-2023+04%3A12%3A40+GMT; sid_ucp_v1=1.0.0-KGYwZDkxYzA3MDQ1ODJhMWNlYTQ0YzcwYWQ0OWExYmM5YjdkN2QzNjAKGQjH7MSHjQMQu-XXqQYY7zEgDDgGQPQHSAQaAmxxIiA3YTI5ZmQ1MTFkODc2NGFlYjRiZDA0ZTBhMjZiODA3MA; ssid_ucp_v1=1.0.0-KGYwZDkxYzA3MDQ1ODJhMWNlYTQ0YzcwYWQ0OWExYmM5YjdkN2QzNjAKGQjH7MSHjQMQu-XXqQYY7zEgDDgGQPQHSAQaAmxxIiA3YTI5ZmQ1MTFkODc2NGFlYjRiZDA0ZTBhMjZiODA3MA; volume_info=%7B%22isUserMute%22%3Afalse%2C%22isMute%22%3Atrue%2C%22volume%22%3A0.379%7D; FOLLOW_LIVE_POINT_INFO=%22MS4wLjABAAAA-s6OwH-mXKZwK-AQ6aKzTrwPBuS8CmyFL0US5Z8DWAU%2F1698595200000%2F0%2F0%2F1698572706455%22; __live_version__=%221.1.1.4816%22; SEARCH_RESULT_LIST_TYPE=%22single%22; pwa2=%220%7C0%7C3%7C1%22; live_can_add_dy_2_desktop=%221%22; publish_badge_show_info=%220%2C0%2C0%2C1698898991955%22; strategyABtestKey=%221698898992.623%22; __ac_nonce=06543608c0006eb9b3d74; __ac_signature=_02B4Z6wo00f01Aa11OAAAIDAhrcuovJwdAAGldBAAGT0gbCTuwOzJhUbU-y.fRQMagomfmLvXFzB9BSdnWFddRtfumok-P1Fts98YxbavHK1mJUgwkjyy2qrkYDSLppVlonvlB.34AyvBvoV43; douyin.com; device_web_cpu_core=8; device_web_memory_size=8; architecture=amd64; webcast_local_quality=null; FOLLOW_NUMBER_YELLOW_POINT_INFO=%22MS4wLjABAAAA-s6OwH-mXKZwK-AQ6aKzTrwPBuS8CmyFL0US5Z8DWAU%2F1698940800000%2F0%2F0%2F1698915646477%22; VIDEO_FILTER_MEMO_SELECT=%7B%22expireTime%22%3A1699519247002%2C%22type%22%3A1%7D; passport_fe_beating_status=true; csrf_session_id=ac4ec49617c3497d5a5fc37160c5fe48; bd_ticket_guard_client_data=eyJiZC10aWNrZXQtZ3VhcmQtdmVyc2lvbiI6MiwiYmQtdGlja2V0LWd1YXJkLWl0ZXJhdGlvbi12ZXJzaW9uIjoxLCJiZC10aWNrZXQtZ3VhcmQtcmVlLXB1YmxpYy1rZXkiOiJCR1NKLytwVmdHV3JBOFdwY0FiYzZSRU5DTWJvMXQxV1NEclZLSk5Tb0txeHRXall0NGE1cFlUNFVMSnJIbU9NUmF5YXNpYmVxOVgxYmxlYWFNRzFhTE09IiwiYmQtdGlja2V0LWd1YXJkLXdlYi12ZXJzaW9uIjoxfQ%3D%3D; odin_tt=f503e68a7820af49500e65884a149e58ce7e089fac4bf93d2e22a98f6c644e0b51ce7c4f056abf4741d108c7acab8c95; msToken=TL8BIbJX9mdJxKSagF76sQHnUnFWKhc8eh0q-JiGSrYPSjbEvSUbKJL4b8Ag3bLLorrTdVvU0xfhCTqlpl6CgFm7-YwDyI0FKc8_CADEXJ9sRRIzPbw=; tt_scid=lHZTlWgOIl2fd1yQOu-x-DArWYh6oINefi6sco5lV8KfqOpa5KcFdor-wh1uPij48f10; IsDouyinActive=true; home_can_add_dy_2_desktop=%220%22; stream_recommend_feed_params=%22%7B%5C%22cookie_enabled%5C%22%3Atrue%2C%5C%22screen_width%5C%22%3A1536%2C%5C%22screen_height%5C%22%3A864%2C%5C%22browser_online%5C%22%3Atrue%2C%5C%22cpu_core_num%5C%22%3A8%2C%5C%22device_memory%5C%22%3A8%2C%5C%22downlink%5C%22%3A10%2C%5C%22effective_type%5C%22%3A%5C%224g%5C%22%2C%5C%22round_trip_time%5C%22%3A100%7D%22; msToken=x0GmjUPWueKkQYv0FC06-zeK8090j40OKh08BfvYDYXoFFA111R2fEjqwBy7BMbcj6OmLHUFhsSc4FjQuCjfFa2qaikmTNQNGLDuGtF_oI3gK6a1iQ8=',
                'pragma': 'no-cache',
                'referer': 'https://www.douyin.com/video/7291311609962614050?modeFrom=',
                'sec-ch-ua': '"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"',
                'sec-ch-ua-mobile': '?0',
                'sec-ch-ua-platform': '"Windows"',
                'sec-fetch-dest': 'empty',
                'sec-fetch-mode': 'cors',
                'sec-fetch-site': 'same-origin',
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
            }
            response = requests.get(start_url, headers=headers, params=params).json()
            for comment in response['comments']:
                self.num += 1
                # 1. Comment text
                text = comment['text']
                # 2. Comment time (a 10-digit value is a Unix timestamp in seconds)
                create_time = comment['create_time']
                create_time = time.localtime(create_time)
                create_time = time.strftime("%Y-%m-%d %H:%M:%S", create_time)
                # 3. Comment like count
                digg_count = comment['digg_count']
                # 4. Commenting user
                sec_name = comment['user']['nickname']
                # 5. Comment reply count
                reply_comment = comment['reply_comment_total']
                # Row to append: sheet name -> list of cell values
                data = {
                    '抖音冬至数据': [self.num, sec_name, text, create_time, digg_count, reply_comment]
                }
                self.save_data(data, text, sec_name, create_time)

    # Save the data to an Excel (.xls) file
    def save_data(self, data, text, sec_uid, create_time):
        if not os.path.exists('抖音冬至数据.xls'):
            # 1. Create the Excel workbook
            wb = xlwt.Workbook(encoding='utf-8')
            # 2. Create a new sheet
            sheet = wb.add_sheet('抖音冬至数据', cell_overwrite_ok=True)
            # 3. Set thin cell borders
            borders = xlwt.Borders()
            borders.left = xlwt.Borders.THIN
            borders.right = xlwt.Borders.THIN
            borders.top = xlwt.Borders.THIN
            borders.bottom = xlwt.Borders.THIN
            borders.left_colour = 0x40
            borders.right_colour = 0x40
            borders.top_colour = 0x40
            borders.bottom_colour = 0x40
            style = xlwt.XFStyle()   # create the cell style
            style.borders = borders  # attach the borders to the style
            # 4. Center cell contents when writing
            align = xlwt.Alignment()
            align.horz = 0x02  # horizontal center
            align.vert = 0x01  # vertical center
            style.alignment = align
            # 5. Write the header row and save
            header = ('序号', '评论用户', '评论', '评论时间', '评论点赞数', '评论回复数')
            for i in range(0, len(header)):
                sheet.col(i).width = 2560 * 3
                # row, column, content, style
                sheet.write(0, i, header[i], style)
            wb.save('抖音冬至数据.xls')
        # Append to the existing workbook
        if os.path.exists('抖音冬至数据.xls'):
            # Open the workbook
            wb = xlrd.open_workbook('抖音冬至数据.xls')
            # List all sheet names in the workbook
            sheets = wb.sheet_names()
            for i in range(len(sheets)):
                for name in data.keys():
                    worksheet = wb.sheet_by_name(sheets[i])
                    # Match each sheet name against the key in `data`
                    if worksheet.name == name:
                        # Number of rows already written
                        rows_old = worksheet.nrows
                        # Copy the read-only xlrd workbook into a writable xlwt workbook
                        new_workbook = copy(wb)
                        # Get the i-th sheet of the writable copy
                        new_worksheet = new_workbook.get_sheet(i)
                        for num in range(0, len(data[name])):
                            new_worksheet.write(rows_old, num, data[name][num])
                        new_workbook.save('抖音冬至数据.xls')
                        # print(f'========================已保存: 第{self.num}条抖音冬至数据评论数据:{sec_uid} - {text} -- {create_time}================')


if __name__ == '__main__':
    d = DouYin()
    # Run the crawler thread
    d.running()
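The loop above fires its eight page requests back to back and assumes every response parses as JSON. A hedged hardening sketch (our addition, not part of the original design) pauses between pages, retries failures, and skips responses that never decode; in spider_one, the bare requests.get(...).json() call could then be swapped for get_page_json(start_url, headers, params) with a None check before the comment loop:

import time
import requests

def get_page_json(url, headers, params, retries=3, delay=1.0):
    # Sketch: fetch one comment page with retries and a backoff delay.
    # Returns None if the page never yields valid JSON.
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, params=params, timeout=10)
            resp.raise_for_status()
            return resp.json()
        except (requests.RequestException, ValueError):
            time.sleep(delay * (attempt + 1))  # back off, then retry
    return None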
Data cleaning: the data is already fairly clean, so here we only drop the rows whose comment text is empty.

import pandas as pd

# Remove the rows whose comment text is empty
data = pd.read_excel("抖音冬至数据.xls")
# Count before cleaning
print("清除前评论为空的数据:{}条".format(data["评论"].isnull().sum()))
# Drop the rows with a missing comment
data = data.dropna(subset=["评论"])
# Count after cleaning
print("清除后评论为空的数据:{}条".format(data["评论"].isnull().sum()))
# Re-index the remaining rows
data = data.reset_index(drop=True)
Execution result
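Note that the snippet above cleans the DataFrame only in memory, while the visualization scripts that follow re-read the raw .xls file. A small follow-up (our suggestion; the output filename is hypothetical, and writing .xlsx requires the openpyxl package) would persist the cleaned rows so every downstream step reads the same data:

# Persist the cleaned rows; .xlsx avoids the legacy .xls writer.
# The filename is illustrative, not from the original project.
data.to_excel("抖音冬至数据_清洗后.xlsx", index=False)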
V. Data Visualization
1. Using pyecharts together with jieba word segmentation, the comment data is rendered as a word cloud. It shows that 爱心 ("heart") and 比心 ("finger heart") are the most frequent keywords in the comment sections of popular Winter Solstice videos.
Code implementation
from pyecharts.charts import WordCloud
from pyecharts import options as opts
from pyecharts.globals import SymbolType
import jieba
import pandas as pd
from collections import Counter

file_path = '抖音冬至数据.xls'
data = pd.read_excel(file_path)
# Drop empty comments and force everything to str so jieba never receives NaN
content = data['评论'].dropna().astype(str).tolist()
# Segment each comment with jieba and keep words longer than one character
seg_list = [jieba.lcut(text) for text in content]
words = [word for seg in seg_list for word in seg if len(word) > 1]
word_counts = Counter(words)
word_cloud_data = [(word, count) for word, count in word_counts.items()]
# Build the word cloud
wordcloud = (
    WordCloud(init_opts=opts.InitOpts(bg_color='#b9986d'))
    .add("", word_cloud_data, word_size_range=[20, 100], shape=SymbolType.DIAMOND,
         word_gap=5, rotate_step=45,
         textstyle_opts=opts.TextStyleOpts(font_family='cursive', font_size=15))
    .set_global_opts(title_opts=opts.TitleOpts(title="抖音冬至评论词云图", pos_top="5%", pos_left="center"),
                     toolbox_opts=opts.ToolboxOpts(
                         is_show=True,
                         feature={
                             "saveAsImage": {},
                             "dataView": {},
                             "restore": {},
                             "refresh": {}
                         }
                     ))
)
wordcloud.render("评论内容词云图.html")
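Filtering on word length alone still keeps frequent but uninformative tokens. A common refinement (sketched here with a tiny inline stopword set of our own; a real run would load a full Chinese stopword file) is to drop stopwords before counting:

# Sketch: drop stopwords before counting. The inline set is illustrative only.
stopwords = {'我们', '你们', '他们', '一个', '没有', '什么'}
words = [word for seg in seg_list for word in seg
         if len(word) > 1 and word not in stopwords]
word_counts = Counter(words)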
2. Compute the correlation between comment like counts and comment reply counts, plot the data as a scatter chart, overlay a line chart as the trend line to show the relationship, and render the combined result.
Code implementation
import pandas as pd
import numpy as np
from pyecharts import options as opts
from pyecharts.charts import Scatter, Line
from pyecharts.globals import ThemeType

# Read the data from the Excel file
data = pd.read_excel("抖音冬至数据.xls")
# Extract the comment like counts and reply counts
digg_data = data["评论点赞数"]
reply_data = data["评论回复数"]
# Compute the correlation between likes and replies
correlation_coefficient = np.corrcoef(digg_data, reply_data)[0, 1]
correlation_coefficient = round(correlation_coefficient, 2)
# Build the scatter chart
scatter = (
    Scatter(init_opts=opts.InitOpts(theme=ThemeType.DARK, bg_color='#94c66b'))
    .add_xaxis(digg_data.tolist())  # like counts on the X axis
    .add_yaxis(
        "评论回复数",
        reply_data.tolist(),  # reply counts on the Y axis
        label_opts=opts.LabelOpts(is_show=False),
        itemstyle_opts=opts.ItemStyleOpts(
            color="red", opacity=0.7  # point color and opacity
        ),
    )
    .set_series_opts(
        label_opts=opts.LabelOpts(
            formatter="{b}", position="right", is_show=True, font_size=16, color="black"  # label style
        ),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title=f"评论点赞数与评论回复数关联图 (相关性: {correlation_coefficient})",
            title_textstyle_opts=opts.TextStyleOpts(font_size=18, color="black"),  # title style
        ),
        xaxis_opts=opts.AxisOpts(
            type_="value",
            name="评论点赞数",
            min_=min(digg_data),
            max_=max(digg_data),
            axislabel_opts=opts.LabelOpts(font_size=16, color="black"),  # axis label style
        ),
        yaxis_opts=opts.AxisOpts(
            type_="value",
            name="评论回复数",
            min_=min(reply_data),
            max_=max(reply_data),
            axislabel_opts=opts.LabelOpts(font_size=16, color="black"),  # axis label style
        ),
        datazoom_opts=[
            opts.DataZoomOpts(
                type_="slider",
                is_show=True,
            ),
            opts.DataZoomOpts(
                type_="inside",
                is_show=True,
            ),
        ],
    )
)

# Add reference lines (average, max, min)
scatter.set_series_opts(
    markline_opts=opts.MarkLineOpts(
        data=[
            opts.MarkLineItem(type_="average", name="平均值"),
            opts.MarkLineItem(type_="max", name="最大值"),
            opts.MarkLineItem(type_="min", name="最小值"),
        ]
    )
)

# Build the linear trend line from a least-squares fit
slope, intercept = np.polyfit(digg_data, reply_data, 1)
x_ends = [float(min(digg_data)), float(max(digg_data))]
line = (
    Line()
    .add_xaxis(x_ends)
    .add_yaxis(
        "线性趋势线",
        [slope * x + intercept for x in x_ends],
        linestyle_opts=opts.LineStyleOpts(color="green", width=2),
        label_opts=opts.LabelOpts(is_show=False),
    )
)

# Overlay the trend line on the scatter chart
scatter.overlap(line)
html_content = scatter.render_embed()
# Analysis text for the report
analysis_text = f'''
相关性越接近1,说明越相关,从可视化图可看出,评论点赞数和评论回复数相关性为{correlation_coefficient}
'''

complete_html = f"""
<html>
<head>
    <title>评论点赞数与评论回复数相关性可视化</title>
</head>
<body style="background-color: #7ba1a8">
    {html_content}
    <div style="margin-top: 20px; background-color: #6493af">
        <h3>分析报告:</h3>
        <p>{analysis_text}</p>
    </div>
</body>
</html>
"""

# Save the combined HTML file
with open('评论点赞和回复相关性可视化.html', "w", encoding="utf-8") as file:
    file.write(complete_html)
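As a quick cross-check on the Pearson coefficient that np.corrcoef feeds into the chart title, pandas computes the same figure directly, and a rank-based Spearman coefficient is a useful companion given how skewed like counts usually are (this sketch is our addition, not part of the original script):

import pandas as pd

data = pd.read_excel("抖音冬至数据.xls")
# Pearson: should match the np.corrcoef value used in the chart title
pearson = data["评论点赞数"].corr(data["评论回复数"])
# Spearman: rank-based, less sensitive to the heavy skew of like counts
spearman = data["评论点赞数"].corr(data["评论回复数"], method="spearman")
print("Pearson: {:.2f}, Spearman: {:.2f}".format(pearson, spearman))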
3. Compute the regression equation between the number of likes and the number of replies.
Code implementation
import pandas as pd
from scipy import stats

data = pd.read_excel("抖音冬至数据.xls")
# Fit a simple linear regression: replies as a function of likes
slope, intercept, rvalue, pvalue, stderr = stats.linregress(data["评论点赞数"], data["评论回复数"])
print("回归方程:y = {}x + {}".format(slope, intercept))
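The fitted slope and intercept can then be used to predict the expected reply count for a given number of likes, and squaring rvalue gives the share of variance the line explains (a usage sketch of ours, following on from the variables above):

# Predict the expected reply count for a hypothetical comment with 100 likes
likes = 100
predicted_replies = slope * likes + intercept
print("Predicted replies at {} likes: {:.2f}".format(likes, predicted_replies))
# R^2: the fraction of the variance in replies explained by likes
print("R^2 = {:.3f}".format(rvalue ** 2))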
VI. Summary
User research: scraping the Douyin comment section and then analyzing and visualizing the data helps content creators and marketers understand users' feedback and emotional leanings toward their content, so they can refine their content and marketing strategies. It can also help researchers study user behavior and social trends in depth. Note, however, that any scraping of Douyin comments must comply with the applicable laws and with Douyin's platform rules, and must not violate user privacy or the platform's terms.
Technical summary: this course deepened my command of Python web scraping and of data analysis and visualization. On the scraping side, I learned to write crawler programs in Python that fetch data from web pages, to understand page structure and data-extraction techniques, and to respect the relevant laws and ethical norms so as not to infringe on others' privacy or break site rules. On the analysis side, I learned to process and analyze data with Python's analysis libraries (such as pandas and numpy) and to present the results as charts with a visualization library (pyecharts), which makes the trends and patterns in the data much easier to grasp. My thanks to Mr. Dawei for giving me this opportunity to improve my skills.