爬top250

# Categories whose rows contribute: intro text, the text of the <div> that
# follows the intro, and the "c-single-text-ellipsis" text.
_INTRO_NEXT_BLURB = frozenset(("小说", "电影", "电视剧", "动漫", "明星"))
# Categories whose rows contribute: intro text and the
# "c-single-text-ellipsis" text only.
_INTRO_BLURB = frozenset(("综艺", "游戏", "纪录片"))


def _text_or_none(tag):
    """Return ``tag.text``, or None when the preceding lookup found no tag."""
    return None if tag is None else tag.text


def _string_or_none(tag):
    """Return ``tag.string``, or None when the preceding lookup found no tag."""
    return None if tag is None else tag.string


def getCount(ID, ww, timeout=10):
    """Fetch one top.baidu.com board page and extract one row per entry.

    Args:
        ID: URL suffix appended verbatim to ``https://top.baidu.com``.
        ww: Category label of the board; it selects which columns are
            collected for each entry (see below).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of rows, one per ``category-wrap_iQLoo`` element. Every row
        starts with the last run of digits found in the entry's
        ``index_1Ew5p`` markup (presumably the rank - confirm against the
        live page) followed by the ``title_dIF3B`` link text, and ends with
        the ``hot-index_1Bl1a`` string. The columns in between depend on
        ``ww``:

        * ``"热点"``: ``hot-desc_1m_jR`` text.
        * 小说 / 电影 / 电视剧 / 动漫 / 明星: intro text, text of the div
          following the intro, ``c-single-text-ellipsis`` text.
        * 综艺 / 游戏 / 纪录片: intro text, ``c-single-text-ellipsis`` text.
        * anything else: intro text, text of the div following the intro.

        A field whose element is missing is stored as None so the remaining
        columns stay aligned. An empty list is returned when the page has
        no ``container-bg_lQ801`` element.

    Raises:
        requests.RequestException: on timeout, connection failure, or a
            non-success HTTP status.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
        'Connection': 'keep-alive'
    }

    url = "https://top.baidu.com%s" % ID
    # Without a timeout a stalled connection would block the whole run.
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error page instead of parsing it as a board.
    r.raise_for_status()
    r.encoding = r.apparent_encoding  # set the character set
    soup = BeautifulSoup(r.text, "html.parser")

    mod = soup.find("div", attrs={"class": "container-bg_lQ801"})
    if mod is None:
        return []

    rows = []
    for item in mod.find_all("div", attrs={"class": "category-wrap_iQLoo"}):
        # The digits are searched in the serialized markup, so class names
        # contribute matches too; the last match is the one that is kept.
        rank_divs = item.find_all("div", attrs={"class": "index_1Ew5p"})
        numbers = re.findall(r'(\d+)', str(rank_divs))
        row = [numbers[-1] if numbers else None]

        row.append(_text_or_none(item.find("a", attrs={"class": "title_dIF3B"})))

        if ww == "热点":
            row.append(_text_or_none(item.find("div", attrs={"class": "hot-desc_1m_jR"})))
        else:
            intro = item.find("div", {"class": "intro_1l0wp"})
            row.append(_text_or_none(intro))
            if ww not in _INTRO_BLURB:
                # Applies to _INTRO_NEXT_BLURB and to every unlisted category.
                following = intro.find_next("div") if intro is not None else None
                row.append(_text_or_none(following))
            if ww in _INTRO_NEXT_BLURB or ww in _INTRO_BLURB:
                row.append(_text_or_none(item.find("div", attrs={"class": "c-single-text-ellipsis"})))

        # ``.string`` (not ``.text``) is kept from the original; it is None
        # when the element has more than one child node.
        row.append(_string_or_none(item.find("div", attrs={"class": "hot-index_1Bl1a"})))
        rows.append(row)
    return rows
def saveExcel(c,i):
    """Append the rows in ``c`` to a new worksheet and save the workbook.

    Reads two module-level names that are not parameters: ``workbook`` (the
    object the sheet is created on and that is saved) and ``ww`` (indexed
    with ``i`` to obtain the sheet title).

    Args:
        c: Iterable of rows; each row is passed to ``sheet.append``.
        i: Index into ``ww`` selecting the title of the new sheet.
    """
    sheet = workbook.create_sheet(ww[i])
    for x in c:
        sheet.append(x)
    # Saved once per call, after every row has been appended, so the file on
    # disk is current after each category without being rewritten per row.
    workbook.save('百度666.xlsx')
# Driver: scrape every board and store each one on its own worksheet.
# getID / getId are defined outside this section; ``ww`` and ``ID`` are
# presumably populated as globals by those calls - confirm against their
# definitions.
a = getID()
b = getId(a)
print(ww)
print(ID)

# Must exist as a module-level name before saveExcel runs, because saveExcel
# reads ``workbook`` as a global.
workbook = Workbook()

# Index 0 is skipped, as in the original.
# NOTE(review): presumably the first entry is not a category board - confirm.
for i in range(1, len(ID)):
    c = getCount(ID[i], ww[i])
    saveExcel(c, i)
posted @ 2021-06-29 18:06  /**serenity*/  阅读(332)  评论(0)  编辑  收藏  举报