爬top250


import re

import requests
from bs4 import BeautifulSoup
import xlwt

def getID():
    """Download the Baidu real-time hot-search board and parse it.

    Returns:
        BeautifulSoup: the response body parsed with ``html.parser``.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
        'Connection': 'keep-alive'
    }

    url = "https://top.baidu.com/board?tab=realtime"
    # Without a timeout a stalled connection would block forever.
    r = requests.get(url, headers=headers, timeout=10)
    # ``Response.encoding`` (not ``elapsed``) controls how ``r.text`` is
    # decoded. Keep a charset the server declared; otherwise detect it from
    # the body instead of forcing "gbk", which would garble a UTF-8 page.
    if "charset" not in r.headers.get("Content-Type", "").lower():
        r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, "html.parser")
    return soup
def getItem(soup):
    """Extract the hot-search entries from a parsed board page.

    Args:
        soup: parsed page as returned by ``getID``.

    Returns:
        list[dict]: one dict per entry, with the keys ``排名`` (rank),
        ``新闻头条`` (headline), ``简介`` (description) and ``热度`` (hot
        index). Returns an empty list when the board container is not
        present. A field whose element is missing is ``""`` (``None`` for
        ``热度``, which is also what ``.string`` gives when the element does
        not hold a single string).
    """
    container = soup.find("div", attrs={"class": "container-bg_lQ801"})
    if container is None:
        # Page layout differs from what this scraper expects; nothing to read.
        return []

    entries = container.find_all(
        "div", attrs={"class": "category-wrap_iQLoo horizontal_1eKyQ"}
    )
    dataList = []
    for entry in entries:
        index_nodes = entry.find_all("div", attrs={"class": "index_1Ew5p"})
        # Look for the rank in the text only: the markup itself contains
        # digits (e.g. in the class name "index_1Ew5p"), which would be
        # mistaken for a rank when an entry shows no number.
        index_text = " ".join(node.text for node in index_nodes)
        digits = re.findall(r'(\d+)', index_text)
        top = digits[-1] if digits else ""

        title_node = entry.find("a", attrs={"class": "title_dIF3B"})
        desc_node = entry.find("div", attrs={"class": "hot-desc_1m_jR"})
        hot_node = entry.find("div", attrs={"class": "hot-index_1Bl1a"})

        # A missing element yields an empty value instead of aborting the
        # whole scrape with AttributeError.
        name = title_node.text if title_node is not None else ""
        jj = desc_node.text if desc_node is not None else ""
        rr = hot_node.string if hot_node is not None else None

        dataList.append(dict(排名=top, 新闻头条=name, 简介=jj, 热度=rr))
    return dataList
posted @ 2021-06-29 18:08  /**serenity*/  阅读(24)  评论(0)  编辑  收藏  举报