# 链家二手房 — Lianjia second-hand housing scraper (synchronous version)
import pandas as pd
import requests
from bs4 import BeautifulSoup
# 获取数据的函数
def get_data(page):
    """Fetch the HTML of Lianjia Shenzhen second-hand listing page *page*.

    Args:
        page: 1-based result-page number interpolated into the URL.

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: on a non-2xx status (previously an error page
            would be returned and parsed silently).
        requests.Timeout: if the server does not answer within 10 seconds
            (previously a stalled request could hang forever).
    """
    url = f"https://sz.lianjia.com/ershoufang/pg{page}/"
    response = requests.get(url=url, timeout=10)
    response.raise_for_status()  # fail fast instead of parsing an error page
    return response.text
# 处理页面数据的函数
def process_page(page):
    """Download one listing page and extract each listing into a row.

    Returns a list of rows, one per listing; each row is
    [title, location, house info, total price, unit price, follow info].
    """
    html = get_data(page)
    parsed = BeautifulSoup(html, 'html.parser')
    listing_container = parsed.find('ul', class_='sellListContent')
    page_data = []
    for item in listing_container.find_all('li'):
        row = [
            item.find("div", class_="title").get_text(),
            item.find("div", class_="positionInfo").get_text().strip(),
            item.find("div", class_="houseInfo").get_text().strip(),
            item.find("div", class_="totalPrice").get_text(),
            item.find("div", class_="unitPrice").get_text(),
            item.find("div", class_="followInfo").get_text(),
        ]
        page_data.append(row)
        print(f"第{page}的数据{row}")
    return page_data
def main():
    """Scrape listing pages 1-30 and export every row to lianjia.xlsx."""
    data = []
    # Lianjia result pages are numbered from 1: the previous range(30)
    # requested the non-existent pg0 and never fetched page 30.
    for page in range(1, 31):
        data.extend(process_page(page))
    # Collect all rows into a DataFrame and write it out as Excel.
    df = pd.DataFrame(
        data,
        columns=["标题", "位置", "房屋信息", "价格", "单价", "关注度"],
    )
    df.to_excel("lianjia.xlsx", index=False)


if __name__ == '__main__':
    main()
# 异步处理 — asynchronous version of the scraper (aiohttp + openpyxl)
from bs4 import BeautifulSoup
import asyncio
import aiohttp
from openpyxl import Workbook
# Module-level workbook shared with parse_data(), which appends one row per
# listing; main() saves it to disk once all pages are processed.
workbook = Workbook()
sheet = workbook.active
async def get_request(url):
    """Asynchronously GET *url* and return the response body as text."""
    async with aiohttp.ClientSession() as session:
        # session.get() is itself an async context manager; no extra await needed.
        async with session.get(url=url) as response:
            return await response.text()
def parse_data(t):
    """Done-callback for a fetch task: parse its HTML and append rows to `sheet`.

    Args:
        t: a finished asyncio task whose result() is the page HTML.
    """
    soup = BeautifulSoup(t.result(), 'html.parser')
    listing_container = soup.find('ul', class_='sellListContent')
    for item in listing_container.find_all('li'):
        def field(cls):
            # Text of the first <div> with the given CSS class inside this listing.
            return item.find("div", class_=cls).get_text()

        row = (
            field("title"),
            field("positionInfo").strip(),
            field("houseInfo").strip(),
            field("totalPrice"),
            field("unitPrice"),
            field("followInfo"),
        )
        sheet.append(row)
def main():
    """Concurrently fetch pages 1-4, parse each as it completes, save the workbook."""
    async def _crawl():
        # Schedule one download task per page; parse_data runs via the
        # done-callback as soon as each download finishes.
        tasks = []
        for i in range(1, 5):
            url = f"https://sz.lianjia.com/ershoufang/pg{i}/"
            task = asyncio.ensure_future(get_request(url))
            task.add_done_callback(parse_data)
            tasks.append(task)
        await asyncio.wait(tasks)

    # asyncio.run replaces the deprecated
    # get_event_loop().run_until_complete pattern (DeprecationWarning
    # since Python 3.10) and guarantees the loop is closed afterwards.
    asyncio.run(_crawl())
    workbook.save('lianjia_data.xlsx')


if __name__ == '__main__':
    main()