Python 爬虫实战
基于 Python 的 requests、BeautifulSoup4 第三方库,对哔哩哔哩(B站)全站热门视频排行榜前一百名进行实时爬取。
import requests
from bs4 import BeautifulSoup
# Fetch the Bilibili site-wide popular ranking page and save every ranked
# video as a Markdown link ("[title](url)"), one per line, in hot100.txt.
# Each entry is also echoed to stdout.
url = "https://www.bilibili.com/v/popular/rank/all/"
# Browser-like User-Agent sent with the request.
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36 Edg/92.0.902.55'}
# A timeout keeps the script from hanging forever on a stalled connection.
res = requests.get(url, headers=header, timeout=10)
if res.status_code != 200:
    print(res.status_code)
    raise Exception("error")
html = res.text
soup = BeautifulSoup(html, "html.parser")
# Ranking entries: div#app -> ul.rank-list -> li.rank-item
links = soup.find("div", id="app")\
    .find("ul", class_="rank-list")\
    .find_all("li", class_="rank-item")

# The output file is opened only after the page was fetched and parsed, so a
# failed run does not truncate the results of a previous one. The explicit
# UTF-8 encoding makes non-ASCII titles writable regardless of the platform's
# default encoding, and the `with` block closes the file even if an entry
# fails to parse.
with open("hot100.txt", "w", encoding="utf-8") as file:
    for item in links:
        link = item.find("div", class_="info").find("a", class_="title")
        name = link.get_text()
        # NOTE(review): presumably the page uses protocol-relative hrefs
        # ("//www.bilibili.com/...") — confirm against the live markup.
        href = "https:" + link["href"]
        print("[%s](%s)" % (name, href))
        try:
            file.write("[%s](%s)\n" % (name, href))
        except UnicodeEncodeError:
            # Best-effort fallback: keep the link even if the title cannot
            # be encoded (e.g. it contains lone surrogates).
            file.write("[Error](%s)\n" % href)