【爬虫】python+sqlite获取慕课首页下若干条的课程评论
爬虫简介和获取效果
获取慕课首页下若干条的课程评论
具体说明见:说明文档
# -*- coding: utf-8 -*-
import time
import random
import sqlite3
from selenium import webdriver
import re
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
'''
爬虫目的:获取慕课首页下若干条的课程评论。三步:
1.获取网页评论:get_comments()
2.初始化数据库+保存数据:init_db(),saver()
3.main():输入url,组合上述三组函数
'''
系统配置:具体可见说明文档
'''
系统配置
'''
# 测试webdriver是否安装好
# from selenium.webdriver import Chrome
# webdriver_obj = Chrome()
函数1:获取网页评论
#函数1:获取网页评论
def get_comments(url, driver):
    '''
    Scrape the review entries of one icourse163 (慕课) course page.

    Parameters:
        url    -- course home-page URL to load
        driver -- an already-started selenium Chrome webdriver

    Returns a 7-tuple of parallel lists (one entry per review):
        userid_list, names_list, comments_list, created_time_list,
        course_time_list, like_list, rating_list

    NOTE(review): every element id / CSS class below is tied to the site's
    current markup; if the page layout changes, matching silently stops and
    whatever was collected so far is returned.
    '''
    driver.get(url)
    content_source = driver.page_source
    soup = BeautifulSoup(content_source,'html.parser')
    find_comments = driver.find_element(By.ID,"review-tag-button")  # locate the "reviews" tab on the page
    find_comments.click()  # click the tab button to show reviews
    time.sleep(1)  # give the review panel time to render
    #
    # Accumulators for the parallel result lists
    userid_list = []        # learner ids
    names_list = []         # learner display names
    comments_list = []      # review text
    created_time_list = []  # time the review was posted
    course_time_list = []   # which course term/run the review belongs to
    like_list = []          # like ("vote up") counts
    rating_list = []        # star ratings (counted from filled star icons)
    # Total number of reviews, read from the tab badge (printed, informational only)
    num_src=str(soup.find("span",id='review-tag-num'))
    #print(num_src)
    findnum=re.compile(r'\d+\.?\d*')
    num=findnum.findall(num_src)
    print(num)
    while True:
        # Re-parse the page source after each pagination click
        content_source = driver.page_source
        soup = BeautifulSoup(content_source, 'html.parser')
        try:
            # One div per review entry on the current page
            content = soup.find_all('div', {
                'class':
                'ux-mooc-comment-course-comment_comment-list_item_body'
            })
            # Break each review div apart into its fields
            for perc in content:
                author_name = perc.find_all(
                    'a', {'class':
                    'primary-link ux-mooc-comment-course-comment_comment-list_item_body_user-info_name'})  # find elements by (tag, {attributes})
                comments = perc.find_all(
                    'div', {'class':
                    'ux-mooc-comment-course-comment_comment-list_item_body_content'})
                created_time = perc.find_all(
                    'div', {'class':
                    'ux-mooc-comment-course-comment_comment-list_item_body_comment-info_time'})
                course_times = perc.find_all(
                    'div', {'class':
                    'ux-mooc-comment-course-comment_comment-list_item_body_comment-info_term-sign'})
                like = perc.find_all('span', {'primary-link'})
                rating = perc.find_all('div', {"star-point"})
                # userid is the last query-string value of the profile link
                for userid in author_name:
                    userid_list.append(userid.get('href').split('=')[-1])
                for name in author_name:
                    names_list.append(name.text)
                for comment in comments:
                    comments_list.append(comment.text.strip('\n'))
                for ct in created_time:
                    created_time_list.append(ct.text)
                for corts in course_times:
                    course_time_list.append(corts.text.strip())
                for lk in like:
                    like_list.append(lk.text.strip('\n'))
                for r in rating:
                    # rating = count of filled ("favorite") star icons in the widget
                    count_re=re.compile('star ux-icon-custom-rating-favorite')
                    rstr=count_re.findall(str(r))
                    rating_list.append(str(len(rstr)))
            # Find the "next page" pager button
            next_page=driver.find_element(By.XPATH,'//li[@class="ux-pager_btn ux-pager_btn__next"]/a')
            if next_page.get_attribute("class") != "th-bk-disable-gh":
                next_page.click()
                time.sleep(random.randint(1, 3))  # random pause to look less bot-like
            else:
                break  # "next" is disabled -> last page reached
        except Exception:
            # Any scraping/navigation failure ends collection with partial results
            break
    return userid_list, names_list, comments_list, created_time_list, course_time_list, like_list, rating_list
函数2+3:创建数据库、数据表,保存数据
#函数2+3 创建数据库、数据表,保存数据
def init_db(dbpath):
    """
    (Re)create the `comments_game` table in the SQLite database at *dbpath*.

    Any existing `comments_game` table is dropped first, so calling this
    wipes previously saved comments.

    Parameters:
        dbpath -- path of the SQLite database file (created if missing)
    """
    conn = sqlite3.connect(dbpath)
    try:
        cursor = conn.cursor()
        cursor.execute("drop table if exists comments_game")
        # "like" is double-quoted: LIKE is an SQL keyword. SQLite happens to
        # accept it bare via its keyword-fallback parser, but quoting keeps
        # the schema portable to stricter engines (e.g. MySQL).
        sql = '''
            create table comments_game(
                id integer primary key autoincrement,
                userid varchar,
                user_name varchar,
                comments text,
                created_time varchar,
                course_time varchar,
                "like" integer,
                rating integer)
        '''
        cursor.execute(sql)  # create the table
        conn.commit()
    finally:
        # close even if table creation fails, so the file handle is released
        conn.close()
def saver(dbpath, userid_list, names_list,
          comments_list, created_time_list, course_time_list, like_list,
          rating_list):
    """
    Save scraped review records into the SQLite database at *dbpath*.

    The database is re-initialised via init_db() first, so every call
    replaces the previous contents of the `comments_game` table.

    Parameters:
        dbpath -- SQLite database file path
        userid_list .. rating_list -- the seven parallel lists produced by
            get_comments(); entry i of each list describes the same review.
    Returns: None.
    """
    init_db(dbpath)
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()
    # SQL hoisted out of the loop; "like" quoted because LIKE is an SQL keyword
    insert_sql = """
        INSERT INTO comments_game(userid, user_name, comments, created_time,
                                  course_time, "like", rating)
        values(?,?,?,?,?,?,?)
    """
    rows = zip(userid_list, names_list, comments_list, created_time_list,
               course_time_list, like_list, rating_list)
    for userid, author_name, comments, created_time, course_time, like, rating in rows:
        cur.execute(insert_sql,
                    (userid, author_name,
                     comments, created_time, course_time, like, rating))
        conn.commit()  # commit per row so progress survives a mid-run crash
        print("{}-{}: 保存成功".format(userid, author_name))
    cur.close()  # fixed: original had `cur.close` (attribute access, never called)
    conn.close()
main函数:参数输入
def main(url):
    """
    Scrape the reviews of one icourse163 course and store them in SQLite.

    Parameters:
        url -- course home-page URL to scrape
    """
    # 1. start the browser
    # NOTE(review): `executable_path` is deprecated and removed in
    # Selenium >= 4.10; with a modern Selenium, pass a Service object
    # instead -- confirm the installed version before changing.
    driver = webdriver.Chrome(executable_path=r"drivers/chromedriver.exe")
    try:
        driver.maximize_window()
        # 2. scrape the review data
        userid_list, names_list, comments_list, created_time_list, course_time_list, like_list, rating_list = get_comments(url, driver)
        # 3. persist it
        dbpath = "icourse_game_comment.db"
        saver(dbpath, userid_list, names_list, comments_list,
              created_time_list, course_time_list, like_list, rating_list)
    finally:
        # always shut the browser down, even if scraping/saving fails
        driver.quit()
    print("\n获取完毕,存储完毕")
if __name__ == "__main__":
    # The argument is a placeholder string ("replace with the MOOC course
    # home-page link") -- substitute a real course URL before running.
    main('替换成慕课网首页链接')
    # adjust the link as needed
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 周边上新:园子的第一款马克杯温暖上架
· Open-Sora 2.0 重磅开源!
· 分享 3 个 .NET 开源的文件压缩处理库,助力快速实现文件压缩解压功能!
· Ollama——大语言模型本地部署的极速利器
· DeepSeek如何颠覆传统软件测试?测试工程师会被淘汰吗?