百度热搜爬虫

爬取页面热搜榜单

image

爬取汽车榜单中的热度和价格

image

import requests
from bs4 import BeautifulSoup

# Request headers that make the request look like it comes from a desktop
# browser.
# NOTE(review): the Cookie value below contains session tokens (BDUSS etc.)
# hard-coded in source. It is kept byte-for-byte so behavior is unchanged, but
# it should be rotated and loaded from the environment or a config file rather
# than committed.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
    'Cookie': "BIDUPSID=533E2C368EEB36A3FD9D61D5E2ED701D; PSTM=1701426278; BAIDUID=4445FFBCC0EE2BDC834E4893CDAC797E:FG=1; MCITY=-%3A; BDUSS=0NYU2N1ckxyM090STRqRUlPfktkN0pJT3ZiWlo0Q2hTeXBTajVNWmtqTzRRMTltSVFBQUFBJCQAAAAAAAAAAAEAAAD9jfvxu6WyuTIzAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAALi2N2a4tjdmb; BDUSS_BFESS=0NYU2N1ckxyM090STRqRUlPfktkN0pJT3ZiWlo0Q2hTeXBTajVNWmtqTzRRMTltSVFBQUFBJCQAAAAAAAAAAAEAAAD9jfvxu6WyuTIzAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAALi2N2a4tjdmb; H_WISE_SIDS_BFESS=60274_60340_60346_60362_60360; H_WISE_SIDS=60274_60362_60360; H_PS_PSSID=60274_60470_60491_60500; BDORZ=FFFB88E999055A3F8A630C64834BD6D0; BAIDUID_BFESS=4445FFBCC0EE2BDC834E4893CDAC797E:FG=1; BA_HECTOR=a5al8k01210l0404058180a103gi9j1j9stlv1u; ZFY=egHzkrAoC4T1EKBSIVxmq:A9d2CRZeNCzn1fI8:B6JPLU:C; BDRCVFR[Ter2S3H5o_D]=mk3SLVN4HKm; delPer=0; PSINO=6",
}

# Page that hosts the hot-search board and the themed (e.g. car) boards.
url = "https://top.baidu.com/board?platform=pc&sa=pcindex_entry"

# `headers` must be passed by keyword: the second positional parameter of
# requests.get() is `params`, so passing the dict positionally would send the
# User-Agent and Cookie as URL query parameters instead of HTTP headers.
# A timeout is set because requests waits indefinitely by default.
res = requests.get(url, headers=headers, timeout=10)

# Decoded HTML of the page, consumed by the parsing functions below.
result = res.content.decode('utf-8')
def get_hots(result):
    """Extract the hot-search entries from the board page HTML.

    Args:
        result: HTML text of the board page.

    Returns:
        A list of single-entry dicts, each mapping an entry's title text to
        the ``href`` of its link, in page order. The list is empty when the
        expected list container is not present in the HTML.

    Side effects:
        Prints the resulting list to stdout.
    """
    soup = BeautifulSoup(result, 'html.parser')

    hots = []
    # Only the first matching list container is used. If it is absent, return
    # an empty list instead of raising IndexError.
    container = soup.find('div', class_="list_1EDla")
    if container is not None:
        for a in container.find_all('a', class_="item-wrap_2oCLZ"):
            title_div = a.find('div', class_="c-single-text-ellipsis")
            hot_href = a.get('href')
            # Skip entries missing the title element or the link target
            # rather than aborting the whole scrape.
            if title_div is None or hot_href is None:
                continue
            hots.append({title_div.text: hot_href})

    print(hots)
    return hots


def get_car(result):
    """Extract name, heat index and price for each entry of the car board.

    Args:
        result: HTML text of the board page.

    Returns:
        A list of dicts with the keys ``'car_name'``, ``'热搜指数'`` and
        ``'price'``, in page order. The list is empty when no
        ``<div theme="car">`` element is present in the HTML.

    Side effects:
        Prints the matched board elements, the matched item elements and the
        resulting list to stdout.
    """
    soup = BeautifulSoup(result, 'html.parser')
    divs = soup.find_all('div', theme='car')
    print(divs)

    # Only the first car board is used; tolerate its absence instead of
    # raising IndexError.
    car_divs = divs[0].find_all('div', class_="item-wrap_Z0BrP") if divs else []
    print(car_divs)

    car_infos = []
    for div in car_divs:
        info_div = div.find('div', class_="right_1PE2e")
        if info_div is None:
            continue
        car_name_tag = info_div.find('a')
        hot_price = info_div.find_all('div')
        # Skip items missing the name link or the value divs.
        if car_name_tag is None or not hot_price:
            continue
        car_infos.append({
            'car_name': car_name_tag.text,
            # The first nested div supplies the heat index, the last the price.
            '热搜指数': hot_price[0].text,
            'price': hot_price[-1].text,
        })

    print(car_infos)
    return car_infos
if __name__ == '__main__':
    # Parse both boards from the HTML fetched at module level.
    hots = get_hots(result)
    cars = get_car(result)

    # Print every field value of every car entry, one value per line.
    for car in cars:
        for field in car:
            print(car[field])

    print('--------------热点榜单--------------')

    # Flatten the list of dicts into (title, link) pairs and number them
    # consecutively starting from 1.
    pairs = (pair for hot in hots for pair in hot.items())
    for num, (title, link) in enumerate(pairs, start=1):
        print(num, title, link)
posted @ 2024-07-23 23:17  云岛夜川川  阅读(8)  评论(0)  编辑  收藏  举报