爬取 【豆瓣电影top250数据】 python代码

import requests
import openpyxl
import re
import time
import pymysql
class DoubanSpider:
    """Scrape the Douban movie Top 250 list and export it to Excel or MySQL.

    Scraped results accumulate in ``self.movie_data`` as a list with one
    entry per fetched page; each entry is a list of dicts with the keys
    '电影名称' (title), '导演与主演' (director/cast line), '电影评分' (rating)
    and '电影引言' (one-line quote, or '无' when the movie has none).
    """

    # The list is paginated through the ``start`` query parameter.
    PAGE_COUNT = 10
    PAGE_SIZE = 25
    # Seconds to wait for a response before giving up on a page.
    REQUEST_TIMEOUT = 10

    # One match per movie: title, director/cast line, rating, raw quote HTML.
    # Compiled once here instead of once per page.
    _ITEM_PATTERN = re.compile(
        r'<span class="title">(.*?)</span>.*?<p class="">\n\s*(.*?)<br>.*?<span class="rating_num" property="v:average">(.*?)</span>.*?<span>\d*人评价</span>\s*</div>\s*(.*?)</div>',
        re.S,
    )
    # Unwraps the quote text from its surrounding <p>/<span> markup.
    _QUOTE_PATTERN = re.compile(
        r'<p class="quote">\s*<span class="inq">(.*?)</span>\s*</p>\s*'
    )

    def __init__(self):
        self.url_temp = "https://movie.douban.com/top250?start={}"
        self.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0'}
        self.movie_data = []

    def _parse_page(self, text):
        """Extract the movie records from the HTML of one list page.

        Args:
            text: HTML source of a Top 250 list page.

        Returns:
            A list of dicts, one per movie matched on the page (possibly empty).
        """
        movies = []
        for title, people, score, quote_html in self._ITEM_PATTERN.findall(text):
            movies.append({
                '电影名称': title,
                '导演与主演': people.replace('&nbsp;', ''),
                '电影评分': score,
                # Movies without a quote yield an empty capture; store '无'.
                '电影引言': self._QUOTE_PATTERN.sub(r'\1', quote_html if quote_html else '无'),
            })
        return movies

    # Fetch movie data
    def get_movie_data(self, url_list=None):
        """Download each list page and append its parsed movies to ``movie_data``.

        Args:
            url_list: Iterable of page URLs to fetch. When omitted, the URLs
                are generated from ``self.url_temp`` (``PAGE_COUNT`` pages of
                ``PAGE_SIZE`` movies), so the method no longer depends on a
                module-level ``url_list`` global.

        A page whose request fails (network error, timeout, or HTTP error
        status) is reported and skipped instead of aborting the whole run.
        """
        if url_list is None:
            url_list = [self.url_temp.format(page * self.PAGE_SIZE)
                        for page in range(self.PAGE_COUNT)]

        for page_no, url in enumerate(url_list, start=1):
            try:
                response = requests.get(url, headers=self.headers,
                                        timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
            except requests.RequestException as exc:
                print('第{}页请求失败: {}'.format(page_no, exc))
            else:
                self.movie_data.append(self._parse_page(response.text))
                print('----第{}页----'.format(page_no))
            # Pause between requests to avoid hammering the site.
            time.sleep(1)
        print(self.movie_data)

    # Save to Excel
    def save_to_excel(self, wb):
        """Append a header row and every scraped movie to the active sheet.

        Args:
            wb: An openpyxl workbook. It is saved as 'douban_top250.xlsx'
                and then closed.
        """
        ws = wb.active
        ws.append(['电影名称', '导演与主演', '电影评分', '电影引言'])

        for movielist in self.movie_data:
            for movie in movielist:
                ws.append([movie['电影名称'], movie['导演与主演'], movie['电影评分'], movie['电影引言']])

        wb.save('douban_top250.xlsx')
        wb.close()

    # Save to database
    def save_to_sql(self):
        """Insert every scraped movie into the ``movies`` table in MySQL.

        The table is created when missing. Database errors are reported on
        stdout rather than raised; a failed write is rolled back and the
        connection is always closed.
        """
        # NOTE(review): connection settings and credentials are hard-coded.
        try:
            conn = pymysql.connect(host='localhost', port=3306, user='root', password='password', db='dbtest')
        except pymysql.Error:
            print('数据库无法连接')
            return
        print('数据库成功连接')

        # Create the movies table
        create_table_sql = '''CREATE TABLE IF NOT EXISTS movies (
                       id INT PRIMARY KEY AUTO_INCREMENT,   
                       title VARCHAR(20),
                       director VARCHAR(100),
                       score FLOAT,
                       rate VARCHAR(100)
                   );'''
        insert_sql = "INSERT INTO movies (title, director, score, rate) VALUES (%s, %s, %s, %s)"
        rows = [
            (movie['电影名称'], movie['导演与主演'], movie['电影评分'], movie['电影引言'])
            for movielist in self.movie_data
            for movie in movielist
        ]

        try:
            with conn.cursor() as cursor:
                cursor.execute(create_table_sql)
                cursor.executemany(insert_sql, rows)
            conn.commit()
        except pymysql.Error as exc:
            # Undo partial inserts and report the real cause instead of
            # mislabelling it as a connection failure.
            conn.rollback()
            print('数据库写入失败: {}'.format(exc))
        finally:
            conn.close()

if __name__ == '__main__':
    # Script entry point: scrape all ten pages, then export to Excel.
    dbspider = DoubanSpider()
    # Page URLs; the list is paginated 25 movies at a time via ``start``.
    url_list = [dbspider.url_temp.format(i*25) for i in range(10)]
    dbspider.get_movie_data()
    # Reuse the existing spreadsheet when there is one; on a first run the
    # file does not exist yet, so start from an empty workbook instead of
    # crashing after the whole scrape has already completed.
    try:
        wb = openpyxl.load_workbook('douban_top250.xlsx')
    except FileNotFoundError:
        wb = openpyxl.Workbook()
    dbspider.save_to_excel(wb)
    # Optional: also store the results in MySQL.
    #dbspider.save_to_sql()

存入douban_top250.xlsx表格的一部分数据……:

………………

posted @   yao-ziyan  阅读(118)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· 一个费力不讨好的项目,让我损失了近一半的绩效!
· 清华大学推出第四讲使用 DeepSeek + DeepResearch 让科研像聊天一样简单!
· 实操Deepseek接入个人知识库
· CSnakes vs Python.NET:高效嵌入与灵活互通的跨语言方案对比
· Plotly.NET 一个为 .NET 打造的强大开源交互式图表库
点击右上角即可分享
微信分享提示