import requests
import openpyxl
import re
import time
import pymysql
class DoubanSpider:
    """Scrape the Douban Top 250 movie list and export it to Excel or MySQL.

    Every successfully fetched page is stored in ``movie_data`` as one list of
    dicts keyed by '电影名称', '导演与主演', '电影评分' and '电影引言'.
    """

    # Default number of list pages and the ``start`` offset step between them.
    PAGE_COUNT = 10
    PAGE_SIZE = 25

    # Seconds to wait for a page before treating the request as failed.
    REQUEST_TIMEOUT = 10

    # One match per movie entry. Groups: first title span, the director/cast
    # line, the rating, and whatever markup sits between the rating block and
    # the next closing </div> (the optional quote paragraph).
    _ITEM_PATTERN = re.compile(
        r'<span class="title">(.*?)</span>.*?'
        r'<p class="">\n\s*(.*?)<br>.*?'
        r'<span class="rating_num" property="v:average">(.*?)</span>.*?'
        r'<span>\d*人评价</span>\s*</div>\s*(.*?)</div>',
        re.S,
    )

    # Markup wrapping the one-line quote; group 1 is the quote text itself.
    _QUOTE_PATTERN = re.compile(
        r'<p class="quote">\s*<span class="inq">(.*?)</span>\s*</p>\s*'
    )

    def __init__(self):
        # URL template; ``{}`` receives the ``start`` offset of a page.
        self.url_temp = "https://movie.douban.com/top250?start={}"
        # Browser-like User-Agent sent with every request.
        self.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0'}
        # One inner list of movie dicts per scraped page.
        self.movie_data = []

    def _parse_page(self, text):
        """Return the movies found in one page of HTML as a list of dicts."""
        return [
            {
                '电影名称': title,
                # Strip the spaces scattered through the director/cast line.
                '导演与主演': crew.replace(' ', ''),
                '电影评分': score,
                # Entries without a quote paragraph get the placeholder '无'.
                '电影引言': self._QUOTE_PATTERN.sub(r'\1', quote_html) if quote_html else '无',
            }
            for title, crew, score, quote_html in self._ITEM_PATTERN.findall(text)
        ]

    def get_movie_data(self, url_list=None):
        """Fetch each page, parse it and append the result to ``movie_data``.

        Args:
            url_list: Iterable of page URLs to fetch. When omitted, the
                ``PAGE_COUNT`` pages built from ``url_temp`` are used, so the
                method no longer depends on a module-level ``url_list``.

        A page whose request fails (network error, timeout or HTTP error
        status) is reported and skipped; the remaining pages are still fetched.
        """
        if url_list is None:
            url_list = [
                self.url_temp.format(page * self.PAGE_SIZE)
                for page in range(self.PAGE_COUNT)
            ]

        for page_no, url in enumerate(url_list, start=1):
            try:
                response = requests.get(
                    url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
                )
                response.raise_for_status()
            except requests.RequestException as exc:
                print('第{}页请求失败: {}'.format(page_no, exc))
            else:
                self.movie_data.append(self._parse_page(response.text))
                print('----第{}页----'.format(page_no))
            # Pause between requests so the site is not hit in a tight loop.
            time.sleep(1)
        print(self.movie_data)

    def save_to_excel(self, wb, filename='douban_top250.xlsx'):
        """Append a header row and all scraped movies to ``wb`` and save it.

        Args:
            wb: Workbook object; rows go to its active sheet.
            filename: Path the workbook is saved to.
        """
        ws = wb.active
        ws.append(['电影名称', '导演与主演', '电影评分', '电影引言'])
        for movielist in self.movie_data:
            for movie in movielist:
                ws.append([movie['电影名称'], movie['导演与主演'], movie['电影评分'], movie['电影引言']])
        wb.save(filename)
        wb.close()

    def save_to_sql(self):
        """Insert all scraped movies into the ``movies`` table of a MySQL database.

        The table is created if it does not exist. Connection failures and
        write failures are reported separately; a failed write is rolled back,
        and the connection is always closed.
        """
        # NOTE(review): connection settings and credentials are hard-coded.
        try:
            conn = pymysql.connect(host='localhost',port=3306, user='root', password='password', db='dbtest')
        except pymysql.Error:
            print('数据库无法连接')
            return
        print('数据库成功连接')

        create_table_sql = '''CREATE TABLE IF NOT EXISTS movies (
            id INT PRIMARY KEY AUTO_INCREMENT,
            title VARCHAR(20),
            director VARCHAR(100),
            score FLOAT,
            rate VARCHAR(100)
            );'''
        insert_sql = "INSERT INTO movies (title, director, score, rate) VALUES (%s, %s, %s, %s)"
        rows = [
            (movie['电影名称'], movie['导演与主演'], movie['电影评分'], movie['电影引言'])
            for movielist in self.movie_data
            for movie in movielist
        ]

        try:
            with conn.cursor() as cursor:
                cursor.execute(create_table_sql)
                cursor.executemany(insert_sql, rows)
            conn.commit()
        except pymysql.Error as exc:
            # Undo the partial insert so a failed run leaves no half-written batch.
            conn.rollback()
            print('数据写入失败: {}'.format(exc))
        finally:
            conn.close()
if __name__ == '__main__':
    # Script entry point: scrape the ten list pages and export them to Excel.
    dbspider = DoubanSpider()
    # Module-level list of the ten page URLs (start=0, 25, ..., 225).
    url_list = [dbspider.url_temp.format(i*25) for i in range(10)]
    dbspider.get_movie_data()
    try:
        wb = openpyxl.load_workbook('douban_top250.xlsx')
    except FileNotFoundError:
        # First run: the spreadsheet does not exist yet, so start a new one.
        wb = openpyxl.Workbook()
    dbspider.save_to_excel(wb)
存入 douban_top250.xlsx 表格的部分数据如下:

………………
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】博客园社区专享云产品让利特惠,阿里云新客6.5折上折
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 一个费力不讨好的项目,让我损失了近一半的绩效!
· 清华大学推出第四讲使用 DeepSeek + DeepResearch 让科研像聊天一样简单!
· 实操Deepseek接入个人知识库
· CSnakes vs Python.NET:高效嵌入与灵活互通的跨语言方案对比
· Plotly.NET 一个为 .NET 打造的强大开源交互式图表库