Lianjia second-hand housing

Scrape Shenzhen second-hand listings from Lianjia page by page with requests and BeautifulSoup, then export the rows to Excel with pandas.

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Fetch the raw HTML for one listings page
def get_data(page):
    url = f"https://sz.lianjia.com/ershoufang/pg{page}/"
    res = requests.get(url=url).text
    return res

# Parse one page of listings into a list of rows
def process_page(page):
    res = get_data(page)
    soup = BeautifulSoup(res, 'html.parser')
    sell_list = soup.find('ul', class_='sellListContent')
    sells = sell_list.find_all('li')
    page_data = []
    for sell in sells:
        title_tag = sell.find("div", class_="title")
        if title_tag is None:
            continue  # skip list items that are not actual listings (e.g. ads)
        title = title_tag.get_text()
        location = sell.find("div", class_="positionInfo").get_text().strip()
        house_info = sell.find("div", class_="houseInfo").get_text().strip()
        price = sell.find("div", class_="totalPrice").get_text()
        unitPrice = sell.find("div", class_="unitPrice").get_text()
        followInfo = sell.find("div", class_="followInfo").get_text()
        data = [title, location, house_info, price, unitPrice, followInfo]
        page_data.append(data)
        print(f"第{page}的数据{data}")
    return page_data

def main():
    data = []
    # Lianjia result pages are 1-indexed: pg1 .. pg30
    for i in range(1, 31):
        res = process_page(i)
        data.extend(res)

    # Convert the collected rows into a pandas DataFrame
    df = pd.DataFrame(data, columns=["Title", "Location", "House Info", "Total Price", "Unit Price", "Follow Info"])

    # Write the data to an Excel file
    df.to_excel("lianjia.xlsx", index=False)


if __name__ == '__main__':
    main()
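
The two price columns come back as display strings rather than numbers. Below is a minimal cleanup sketch for later analysis, assuming the total price looks like "500万" (units of 10,000 CNY) and the unit price like "51,234元/平米"; Lianjia's exact text formats may differ, so the patterns here are assumptions:

import pandas as pd

df = pd.read_excel("lianjia.xlsx")

# Assumed format "500万": strip the 万 suffix and convert to float
df["Total Price (10k CNY)"] = df["Total Price"].str.replace("万", "", regex=False).astype(float)

# Assumed format "51,234元/平米": keep the digits, drop the thousands separators
df["Unit Price (CNY/sqm)"] = (
    df["Unit Price"].str.extract(r"([\d,]+)")[0]
    .str.replace(",", "", regex=False)
    .astype(float)
)

print(df[["Title", "Total Price (10k CNY)", "Unit Price (CNY/sqm)"]].head())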

Asynchronous version

The same scrape rewritten with aiohttp: the page downloads run concurrently, and each finished download is parsed by a done-callback that appends rows to an openpyxl worksheet.

from bs4 import BeautifulSoup
import asyncio
import aiohttp
from openpyxl import Workbook


workbook = Workbook()
sheet = workbook.active
# Header row, matching the columns appended in parse_data below
sheet.append(("Title", "Location", "House Info", "Total Price", "Unit Price", "Follow Info"))

async def get_request(url):
    # A browser-like User-Agent helps avoid being served an anti-bot page
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url=url) as response:
            return await response.text()

# Done-callback: parse a finished download and append its rows to the sheet
def parse_data(t):
    res = t.result()
    soup = BeautifulSoup(res, 'html.parser')
    sell_list = soup.find('ul', class_='sellListContent')
    sells = sell_list.find_all('li')
    for sell in sells:
        title_tag = sell.find("div", class_="title")
        if title_tag is None:
            continue  # skip list items that are not actual listings (e.g. ads)
        title = title_tag.get_text()
        location = sell.find("div", class_="positionInfo").get_text().strip()
        house_info = sell.find("div", class_="houseInfo").get_text().strip()
        price = sell.find("div", class_="totalPrice").get_text()
        unitPrice = sell.find("div", class_="unitPrice").get_text()
        followInfo = sell.find("div", class_="followInfo").get_text()
        data = (title, location, house_info, price, unitPrice, followInfo)
        sheet.append(data)


async def main():
    urls = [f"https://sz.lianjia.com/ershoufang/pg{i}/" for i in range(1, 5)]
    tasks = []
    for url in urls:
        task = asyncio.create_task(get_request(url))
        task.add_done_callback(parse_data)
        tasks.append(task)
    # Wait for every download (and its parse callback) to finish
    await asyncio.wait(tasks)

    workbook.save('lianjia_data.xlsx')


if __name__ == '__main__':
    asyncio.run(main())
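
Opening a fresh ClientSession for every request works, but a single shared session is cheaper, and firing all pages at once can trip the site's rate limiting. Below is a hedged sketch of one way to throttle the downloads with asyncio.Semaphore; fetch_all, the shared session, and the limit of 3 are illustrative assumptions, not part of the original script:

import asyncio
import aiohttp

async def fetch_all(urls, limit=3):
    # Allow at most `limit` requests in flight at any moment
    sem = asyncio.Semaphore(limit)
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    async with aiohttp.ClientSession(headers=headers) as session:
        async def fetch(url):
            async with sem:  # wait here until a slot frees up
                async with session.get(url) as response:
                    return await response.text()
        # Schedule every page at once; the semaphore does the throttling
        return await asyncio.gather(*(fetch(u) for u in urls))

pages = asyncio.run(fetch_all([f"https://sz.lianjia.com/ershoufang/pg{i}/" for i in range(1, 5)]))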