# Ref: https://fishc.com.cn/forum.php?mod=viewthread&tid=101887&extra=page%3D1%26filter%3Dtypeid%26typeid%3D722
import requests
from bs4 import BeautifulSoup
import openpyxl
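# Third-party dependencies: requests, beautifulsoup4 (imported as bs4) and openpyxl,
# e.g. installed with `pip install requests beautifulsoup4 openpyxl`.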


def open_url(url):
    """Fetch a page and return its decoded HTML text."""
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'}
    res = requests.get(url, headers=headers)
    res.raise_for_status()
    # Let requests pick the detected encoding so the Chinese text decodes correctly.
    res.encoding = res.apparent_encoding
    return res.text
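# Optional hardening (not in the original script): requests.get() accepts a
# timeout argument, e.g. requests.get(url, headers=headers, timeout=10), which
# avoids hanging indefinitely if the site is slow to respond.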


def parserHtml(html) -> list:
    """Parse one Top 250 page into a list of [title, rating, info, link] rows."""
    try:
        soup = BeautifulSoup(html, 'html.parser')
        titles = []
        hrefs = []
        messages_movie = []
        messages_star = []
        # Titles and detail-page links
        titles_targets = soup.find_all('div', class_='hd')
        for each in titles_targets:
            titles.append(each.a.span.text)
            hrefs.append(each.a['href'])
        # Movie information (director/cast line plus year/country/genre line)
        bd_targets = soup.find_all('div', class_='bd')
        for each in bd_targets:
            try:
                messages_movie.append(each.p.text.split('\n')[1].strip() +
                                      each.p.text.split('\n')[2].strip())
            except Exception:
                continue
        # Ratings
        star_targets = soup.find_all('span', class_='rating_num')
        for each in star_targets:
            messages_star.append(each.text)
        # Assemble one row per movie
        result = []
        for i in range(len(messages_star)):
            # result.append(titles[i] + messages_movie[i] + messages_star[i] + '\n')  # save to text
            result.append([titles[i], messages_star[i], messages_movie[i], hrefs[i]])
        return result
    except Exception:
        print('Parse error')
        return []  # keep main() working even if one page fails to parse


# Alternative kept from the original: save the results to a plain-text file
# instead of an Excel workbook.
# def sava_excel(result):
#     try:
#         with open(r'./Python_Excel_小甲鱼/Top_DouBan_250.txt', 'w', encoding='utf-8') as f:
#             for each in result:
#                 f.write(each)
#     except Exception:
#         print('Save error')


def save_excel(result):
    """Write the scraped rows to Top_DouBan_250.xlsx with a header row."""
    try:
        wb = openpyxl.Workbook()
        ws = wb.active
        # Header row: movie title, rating, movie info, movie link
        ws['A1'] = '电影名称'
        ws['B1'] = '评分'
        ws['C1'] = '电影信息'
        ws['D1'] = '电影链接'
        for each in result:
            ws.append(each)
        wb.save('Top_DouBan_250.xlsx')
    except Exception:
        print('Error saving the Excel file')
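# Note: Worksheet.append() writes each list as the next row, so the data rows
# start at row 2, directly below the header cells set above.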


# Work out how many pages the list spans.
def get_depth(html):
    """Return the number of the last page shown in the pagination bar."""
    try:
        soup = BeautifulSoup(html, 'html.parser')
        # The last page number is two siblings before the "next page" span
        # (the first previous_sibling is the whitespace between the tags).
        depth = soup.find('span', class_='next').previous_sibling.previous_sibling.text
        return depth
    except Exception:
        print('Error getting the page count')


def main():
    host = r'https://movie.douban.com/top250'
    html = open_url(host)
    depth = get_depth(html)
    result = []
    # Each page lists 25 movies, selected via the "start" query parameter.
    for i in range(int(depth)):
        url = host + '/?start=' + str(25 * i) + '&filter='
        html = open_url(url)
        result.extend(parserHtml(html))
    save_excel(result)


if __name__ == '__main__':
    main()
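# Running the script fetches every page reported by get_depth() (normally 10
# pages of 25 movies for the Top 250 list) and writes Top_DouBan_250.xlsx to
# the current working directory.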