1、采集贝壳网站房产信息,获取字段:项目名称、地址、均价、户型、建筑面积、总价等信息。
# -*- coding: utf-8 -*-
# Beike (ke.com) new-home listing scraper for Zhengzhou: collects project
# name, address, average price, floor plan, floor area and total price, and
# saves them to an Excel workbook.
import re
import time

import pandas as pd
import requests
from lxml import etree

# Accumulates one row dict per scraped listing (keys are the output columns).
datas_list = []

_PAGE_URL = 'https://zz.fang.ke.com/loupan/pg{page}/'
_OUTPUT_FILE = '郑州市楼盘信息-贝壳网.xlsx'
_COLUMNS = ['项目名称', '项目地址', '均价', '户型', '面积', '总价']
_LISTINGS_PER_PAGE = 10   # page size used to derive the page count
_REQUEST_DELAY = 1.5      # seconds between page requests, to mimic a human
_REQUEST_TIMEOUT = 10     # seconds; without it a stalled connection hangs forever
_TOTAL_COUNT_RE = re.compile(
    r'<div class="page-box" data-current="1" data-total-count="(.*?)">'
)


def _build_headers(cookie):
    """Return the browser-like request headers carrying *cookie*."""
    # Header names must not contain spaces ('User - Agent' is not the
    # User-Agent header), otherwise the server never sees the intended values.
    # Accept-Encoding is deliberately left to requests so that it only
    # advertises encodings it is able to decode.
    return {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Cookie': cookie,
        'Host': 'zz.fang.ke.com',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    }


def _fetch_page(page, headers):
    """Download listing page number *page* and return its HTML text."""
    response = requests.get(
        _PAGE_URL.format(page=page), headers=headers, timeout=_REQUEST_TIMEOUT
    )
    # Fail loudly on 4xx/5xx instead of parsing an error page as listings.
    response.raise_for_status()
    return response.text


def _parse_listings(page_html):
    """Extract one row dict per listing found in *page_html*."""
    html = etree.HTML(page_html)
    if html is None:
        # Nothing parsable came back for this page.
        return []

    # NOTE(review): the six result lists are matched up by position, which
    # assumes every listing on the page contains all six elements -- confirm
    # against the page markup.
    project_names = html.xpath("//div[@class='resblock-name']/a")
    addresses = html.xpath("//div[@class='resblock-desc-wrapper']/a[@class='resblock-location']")
    average_prices = html.xpath("//div[@class='resblock-price']/div[@class='main-price']/span[@class='number']")
    floor_plans = html.xpath("//div[@class='resblock-desc-wrapper']/a[@class='resblock-room']/span[2]")
    areas = html.xpath("//div[@class='resblock-desc-wrapper']/a[@class='resblock-room']/span[@class='area']")
    prices = html.xpath("//div[@class='resblock-price']/div[@class='second']")

    rows = []
    for position, name_node in enumerate(project_names):
        try:
            project_name = name_node.xpath("string(.)")
            address = addresses[position].xpath("string(.)").strip()
            average_price = average_prices[position].xpath("string(.)")
            floor_plan = floor_plans[position].xpath("string(.)")
            # Drop the label prefixes so only the values remain.
            area = areas[position].xpath("string(.)").replace('建面', '')
            price = prices[position].xpath("string(.)").replace('总价', '')
        except IndexError:
            # One of the parallel lists is shorter than the name list.
            print(f'异常: 第{position + 1}条楼盘缺少字段,已跳过')
            continue

        print('项目名称:' + project_name)
        print('项目地址:' + address)
        print('均价:' + average_price)
        print('户型:' + floor_plan)
        print('面积:' + area)
        print('总价:' + price)
        rows.append({
            '项目名称': project_name,
            '项目地址': address,
            '均价': average_price,
            '户型': floor_plan,
            '面积': area,
            '总价': price,
        })
    return rows


def _save(rows):
    """Write *rows* to the Excel workbook; do nothing when there are none."""
    if not rows:
        return
    pd.DataFrame(rows, columns=_COLUMNS).to_excel(_OUTPUT_FILE, index=False)


def data(Cookie):
    """Scrape every listing page and save the results to an Excel file.

    The total listing count is read from the first page, the page count is
    derived from it, and each page's listings are appended to the module-level
    ``datas_list``. The workbook is written once at the end; if a later page
    fails, the rows collected so far are still saved before the error
    propagates.

    Args:
        Cookie: Value of the ``Cookie`` request header sent with every request.

    Raises:
        RuntimeError: If the first page has no pagination marker (for example
            because the request was blocked or the cookie has expired).
        requests.RequestException: If a page cannot be downloaded.
    """
    headers = _build_headers(Cookie)
    page_html = _fetch_page(1, headers)

    match = _TOTAL_COUNT_RE.search(page_html)
    if match is None:
        raise RuntimeError(
            '未能在首页找到楼盘总数 (data-total-count),请求可能被拦截或 Cookie 已失效'
        )
    total_num = int(match.group(1))
    # Ceiling division: a partially filled last page still counts as a page.
    page_count = -(-total_num // _LISTINGS_PER_PAGE)

    try:
        for page in range(1, page_count + 1):
            if page > 1:
                # Page 1 is already downloaded; pause before each further request.
                time.sleep(_REQUEST_DELAY)
                page_html = _fetch_page(page, headers)
            datas_list.extend(_parse_listings(page_html))
    finally:
        # Persist whatever was collected, even if a later page failed.
        _save(datas_list)
    print('执行完毕')


# NOTE(review): this literal embeds live session credentials (lianjia_token,
# security_ticket, lj_newh_session). They should be revoked and loaded from
# the environment or an untracked config file instead of living in source.
Cookie = (
    'lianjia_uuid=c69a0257-ad07-4ee6-a77b-d0a8ae364f19; '
    'digv_extends=%7B%22utmTrackId%22%3A%22180514533%22%7D; '
    'select_city=410100; '
    'lianjia_ssid=cda478b9-a29c-482d-902a-fbbf36ef910a; '
    'crosSdkDT2019DeviceId=-9dgpud--e1brmv-ovzn0buj019hkq0-rj5nidxv8; '
    '_ga=GA1.2.1321224426.1713143752; '
    '_gid=GA1.2.68286756.1713143752; '
    '__xsptplusUT_788=1; '
    '_jzqa=1.2303208279780800800.1713143752.1713143752.1713143752.1; '
    '_jzqc=1; '
    '_jzqx=1.1713143752.1713143752.1.jzqsr=zz%2Eke%2Ecom|jzqct=/.-; '
    '_jzqckmp=1; '
    'ke_uuid=614b30d986fc259b87d777a3fa7da189; '
    'sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2218d67c654b638a-0470d0ab78c575-3e604809-1327104-18d67c654b7594%22%2C%22%24device_id%22%3A%2218d67c654b638a-0470d0ab78c575-3e604809-1327104-18d67c654b7594%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E4%BB%98%E8%B4%B9%E5%B9%BF%E5%91%8A%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_utm_source%22%3A%22360%22%2C%22%24latest_utm_medium%22%3A%22pinzhuan%22%2C%22%24latest_utm_campaign%22%3A%22biaoti%22%2C%22%24latest_utm_content%22%3A%22biaoti%22%2C%22%24latest_utm_term%22%3A%22biaoti%22%7D%7D; '
    '_qzjc=1; '
    'digData=%7B%22key%22%3A%22loupan_index%22%7D; '
    'login_ucid=2000000045530134; '
    'lianjia_token=2.00144024156b15f5ef05ed0d24a97241ed; '
    'lianjia_token_secure=2.00144024156b15f5ef05ed0d24a97241ed; '
    'security_ticket=JrDFpWg3TXSJOrEXPQhIX9clqR/Bk7W9RLW3NHYwOtDlbIkpZxkbYtPbhci4h8AaBw4JnRlPiW3Ey0RaYj9Cxop1kuKMh89I4YpGTQfed1heKxFmgDgkhZu3oI0VeSoNRHXYD/+IWbLvBrUmRqxS7a4mhEItL7/R0X7FByfiJEo=; '
    'ftkrc_=41accf7d-0651-4e65-8df1-ec15579ef6af; '
    'lfrc_=d8a9ea15-a6ca-4377-b9cb-c80611472e29; '
    'lj_newh_session=eyJpdiI6Ik54aVhwaUZHUHpNbjhTWU8yWVZHUlE9PSIsInZhbHVlIjoiUXR0NjBvZkZrbGxMM2NCaDQ0Y3ZtbVpIazhRXC9yWTZXRGtSOHpCNXZFTDFVZkdaZUVjUXNIaENZd0ZtbzhrK1o4ZVpTbnZ2SEpoR3FGMHBqSVZtTTNBPT0iLCJtYWMiOiIwNzU0NjZkMDFkYTBlMWJiNjVjNGFjNDBiZmZjMGE2ODcwNzdiZGY1MWJhZTc0YjU5ZDU1MzA3ZDVjOGVjYzgwIn0%3D; '
    '_qzja=1.1289423564.1713143751997.1713143751997.1713143751997.1713143831552.1713143839998.0.0.0.4.1; '
    '_qzjb=1.1713143751997.4.0.0.0; '
    '_qzjto=4.1.0; '
    '_jzqb=1.4.10.1713143752.1; '
    'srcid=eyJ0IjoiXCJ7XFxcImRhdGFcXFwiOlxcXCJhMzkxNjQ0YzkwMDMzNzA5Mjg3NTgyOWZmZDc4ZmNiMzZjYTg4ZTgwYWEwZGM5ZGI1YzMwMTA1ZTUzYzlkMzhhYzg4ODk2YWM1MWUxN2M1NWY3OWVjZWU0MTNiNTNkNjJlOGUzMjAwMWU1YzY4ZjQ1N2ZiYzJhZDU0ZGY2ZGJjY2MxMTkxMzRhOWZlMmM0NmIyOTVhMWM1MmEzNmI2N2FhNjdmNDJlNmZhZTdjYjRiYTQ2OGYwMzc5N2Q1ZDRkYzc3ZGMwNGY0ZjU1OWQzYWVlZDM1ZGY4OGUwYzBkYTc1YTY4Nzk3MTZhZDQyOTAxYzA2NDNkZGE3OTQ3NzIzZTZkXFxcIixcXFwia2V5X2lkXFxcIjpcXFwiMVxcXCIsXFxcInNpZ25cXFwiOlxcXCI5ZDk2YmY0OFxcXCJ9XCIiLCJyIjoiaHR0cHM6Ly96ei5mYW5nLmtlLmNvbS9sb3VwYW4vcGcxLyIsIm9zIjoid2ViIiwidiI6IjAuMSJ9; '
    '__xsptplus788=788.1.1713143793.1713143873.3%234%7C%7C%7C%7C%7C%23%23%23'
)
data(Cookie)
获取信息如下