爬取今日热榜的知乎热点前10
打开网页：https://tophub.today/
找到要爬取的信息（标题位于 class="t" 的元素中，热度位于 class="e" 的元素中）
编写代码
# Scrape the top-10 Zhihu trending topics from the tophub.today home page.
import requests
from bs4 import BeautifulSoup
import pandas as pd

URL = 'https://tophub.today/'  # tophub.today ("今日热榜") home page
# Browser-like User-Agent so the request is not served as an obvious bot.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
TOP_N = 10  # number of entries to report ("top 10")


def fetch_html(url=URL, timeout=30):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Page to download.
        timeout: Seconds to wait before giving up on the request.

    Returns:
        The page source as a string.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout, headers=HEADERS)
    # Fail loudly instead of parsing an error page as if it were the hot list.
    response.raise_for_status()
    # Force UTF-8 to avoid mojibake. The earlier apparent_encoding assignment
    # was always overwritten by this one, so it has been dropped.
    response.encoding = 'utf-8'
    return response.text


def parse_hot_list(html, limit=TOP_N):
    """Extract titles and heat values from *html* into a DataFrame.

    Titles are read from elements with class "t" and heat values from
    elements with class "e"; the two lists are paired purely by position.

    Args:
        html: Page source to parse.
        limit: Maximum number of entries to keep; ``None`` keeps them all.

    Returns:
        A DataFrame with the columns "标题" (title) and "热度" (heat).
    """
    soup = BeautifulSoup(html, 'html.parser')
    # NOTE(review): assumes the first `limit` class="t"/"e" elements on the
    # page belong to the Zhihu board — confirm against the live page layout.
    titles = [node.get_text().strip() for node in soup.find_all(class_="t")[:limit]]
    heats = [node.get_text().strip() for node in soup.find_all(class_="e")[:limit]]
    # Build row-wise and transpose so lists of unequal length are padded
    # rather than rejected.
    return pd.DataFrame([titles, heats], index=["标题", "热度"]).T


def main():
    """Fetch the home page and print the top entries as a table."""
    print(parse_hot_list(fetch_html()))


if __name__ == "__main__":
    main()
输出结果