python 爬取豆瓣电影Top250,保存到Excel
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2021/2/7 16:23
# @Author :
"""Scrape Douban's movie Top 250 chart and save the results to an Excel file."""

from bs4 import BeautifulSoup
import re
import xlwt
import urllib.request
import urllib.error

# Patterns for pulling fields out of one <div class="item"> block.
# Compiled once at module level instead of on every getData() call.
_FIND_URL = re.compile(r'<a href="(.*?)">')
_FIND_IMG = re.compile(r'<img.*src="(.*?)"')
_FIND_TITLE = re.compile(r'<span class="title">(.*)</span>')
_FIND_RATING = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
_FIND_JUDGE = re.compile(r'<span>(\d*)人评价</span>')
_FIND_INQ = re.compile(r'<span class="inq">(.*)</span>')
_FIND_BD = re.compile(r'<p class="">(.*?)</p>', re.S)  # re.S: '.' also matches newlines


def default():
    """Entry point: crawl all pages and write the result workbook to disk."""
    baseUrl = 'https://movie.douban.com/top250?start='
    dataList = getData(baseUrl)
    savePath = '豆瓣Top250.xls'
    saveDataExcel(savePath, dataList)


def getData(baseUrl):
    """Fetch all 10 listing pages (25 movies each) and parse every entry.

    Returns a list of rows; each row is
    [link, image URL, Chinese title, foreign title, rating,
     rating count, one-line summary, related info].
    """
    dataList = []
    for page in range(10):
        html = getOneUrl(baseUrl + str(page * 25))
        if not html:
            # Page failed to download -- skip it instead of crashing on parse.
            continue
        soup = BeautifulSoup(html, 'html.parser')
        for item in soup.find_all('div', class_="item"):
            item = str(item)
            data = []
            data.append(_FIND_URL.findall(item)[0])  # detail-page link
            data.append(_FIND_IMG.findall(item)[0])  # poster image URL
            titles = _FIND_TITLE.findall(item)
            if len(titles) == 2:
                cTitle, oTitle = titles[0], titles[1].replace('/', '')
            else:
                cTitle, oTitle = titles[0], ' '
            data.append(cTitle.strip())
            data.append(oTitle.strip())
            data.append(_FIND_RATING.findall(item)[0])  # average rating
            data.append(_FIND_JUDGE.findall(item)[0])   # number of raters
            inq = _FIND_INQ.findall(item)
            data.append(inq[0].replace('。', '') if inq else ' ')  # tagline (may be absent)
            bd = _FIND_BD.findall(item)[0]
            # Raw strings fix the invalid-escape-sequence warning the original
            # non-raw '<br(\s+)?/>' pattern produces on modern Python.
            bd = re.sub(r'<br(\s+)?/>(\s+)?', ' ', bd)  # drop <br/> tags
            bd = re.sub(r'/', ' ', bd)                  # drop '/' separators
            data.append(bd.strip())
            dataList.append(data)
    return dataList


def getOneUrl(url):
    """Download one URL and return its body as text, or '' on failure."""
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36',
    }
    request = urllib.request.Request(url=url, headers=header)
    # BUG FIX: the original left `html` unbound when the request failed,
    # so `return html` raised UnboundLocalError on any HTTP error.
    html = ''
    try:
        # `with` guarantees the response is closed (the original leaked it).
        with urllib.request.urlopen(request) as resp:
            html = resp.read().decode('utf-8')
    except urllib.error.URLError as e:
        # URLError is the base class, so this also catches HTTPError;
        # the original missed plain connection/DNS failures entirely.
        if hasattr(e, 'code'):
            print('code', e.code)
        if hasattr(e, 'reason'):
            print('reason', e.reason)
        print('error', e)
    return html


def saveDataExcel(path, dataList):
    """Write the scraped rows to an .xls workbook at *path*."""
    print('save....')
    workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)  # workbook object
    # 'sheet1' fixes the 'shheet1' typo in the original sheet name.
    worksheet = workbook.add_sheet('sheet1', cell_overwrite_ok=True)
    col = ("链接", "图片", "中文名", "外国名", "评分", "评价数", "概况", "相关信息")
    for c, name in enumerate(col):
        worksheet.write(0, c, name)  # header row
    # Iterate the actual data instead of the original hard-coded range(250),
    # which raised IndexError whenever the crawl returned fewer rows.
    for row, data in enumerate(dataList, start=1):
        print("第%d条" % row)
        for c in range(len(col)):
            worksheet.write(row, c, data[c])
    workbook.save(path)
    print('over')


if __name__ == '__main__':
    default()
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 开发者必知的日志记录最佳实践
· SQL Server 2025 AI相关能力初探
· Linux系列:如何用 C#调用 C方法造成内存泄露
· AI与.NET技术实操系列(二):开始使用ML.NET
· 记一次.NET内存居高不下排查解决与启示
· 阿里最新开源QwQ-32B,效果媲美deepseek-r1满血版,部署成本又又又降低了!
· 开源Multi-agent AI智能体框架aevatar.ai,欢迎大家贡献代码
· Manus重磅发布:全球首款通用AI代理技术深度解析与实战指南
· 被坑几百块钱后,我竟然真的恢复了删除的微信聊天记录!
· 没有Manus邀请码?试试免邀请码的MGX或者开源的OpenManus吧