五杀摇滚小拉夫

  博客园  :: 首页  :: 新随笔  :: 联系  :: 订阅  :: 管理

1、采集贝壳网站房产信息,获取字段:项目名称、地址、均价、户型、建筑面积、总价等信息。

# -*- coding: utf-8 -*-
# 贝壳找房信息采集
import requests
import re
import time
import pandas as pd
from lxml import etree
# Module-level accumulator: data() appends one single-row DataFrame per
# scraped listing here and concatenates the list when writing the Excel file.
datas_list = []
def data(Cookie, output_path='郑州市楼盘信息-贝壳网.xlsx', delay=1.5, timeout=10):
    """Scrape all new-home listing pages of zz.fang.ke.com and save them to Excel.

    The first listing page is fetched to read the total number of listings,
    the page count is derived from it (10 listings per page), and every page
    is then parsed for name, address, average price, floor plan, area and
    total price. Each listing is printed, appended to the module-level
    ``datas_list`` as a single-row DataFrame, and the concatenation of
    ``datas_list`` is written to ``output_path``.

    Args:
        Cookie: Raw ``Cookie`` header value sent with every request.
        output_path: Destination ``.xlsx`` file.
        delay: Seconds to wait before each request after the first one.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        The DataFrame that was written, or an empty DataFrame with the six
        expected columns when nothing has been collected.

    Raises:
        requests.RequestException: On network failure, timeout or a non-2xx
            response.
        ValueError: If the total listing count cannot be found on page 1.
    """
    per_page = 10
    # Header names must not contain spaces: the original 'User - Agent',
    # 'Accept - Encoding', ... were sent as unknown headers, so the real ones
    # never reached the server. Accept-Encoding is deliberately left to
    # requests so that only encodings it can actually decode are advertised.
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Cookie': Cookie,
        'Host': 'zz.fang.ke.com',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    }

    def fetch(page_no):
        """Download one listing page and return its HTML text."""
        resp = requests.get(
            f'https://zz.fang.ke.com/loupan/pg{page_no}/',
            headers=header,
            timeout=timeout,
        )
        resp.raise_for_status()
        return resp.text

    first_page = fetch(1)
    # Total number of listings, taken from the pager element of page 1.
    match = re.search(
        r'<div class="page-box"[^>]*\bdata-total-count="(\d+)"', first_page
    )
    if match is None:
        raise ValueError(
            'data-total-count not found on the first page; '
            'the cookie may have expired or the page layout changed'
        )
    total_num = int(match.group(1))
    # Ceiling division: a partially filled last page still counts as a page.
    page = (total_num + per_page - 1) // per_page

    final_df = pd.DataFrame(columns=list(_LISTING_XPATHS))
    try:
        for i in range(1, page + 1):
            if i == 1:
                # Reuse the page already downloaded for the total count.
                page_html = first_page
            else:
                # Pause between requests to mimic a human visitor.
                time.sleep(delay)
                page_html = fetch(i)
            for record in _parse_listings(page_html):
                for field, value in record.items():
                    print(f'{field}:{value}')
                datas_list.append(pd.DataFrame({k: [v] for k, v in record.items()}))
    finally:
        # Write once, after the loop, instead of rewriting the workbook for
        # every listing; the finally clause still preserves partial results
        # if a later page fails.
        if datas_list:
            final_df = pd.concat(datas_list, ignore_index=True)
            final_df.to_excel(output_path, index=False)
            print('执行完毕')
    return final_df


# Output column name -> XPath selecting that field for every listing on a page.
_LISTING_XPATHS = {
    '项目名称': "//div[@class='resblock-name']/a",
    '项目地址': "//div[@class='resblock-desc-wrapper']/a[@class='resblock-location']",
    '均价': "//div[@class='resblock-price']/div[@class='main-price']/span[@class='number']",
    '户型': "//div[@class='resblock-desc-wrapper']/a[@class='resblock-room']/span[2]",
    '面积': "//div[@class='resblock-desc-wrapper']/a[@class='resblock-room']/span[@class='area']",
    '总价': "//div[@class='resblock-price']/div[@class='second']",
}


def _parse_listings(page_html):
    """Return one ``{column: text}`` dict per listing found in ``page_html``.

    Fields are paired by position across the six XPath result lists, as in
    the original code. NOTE(review): if a listing in the middle of a page
    lacks a field, later values on that page may be attributed to the wrong
    listing; records whose index runs past a shorter list are skipped and
    reported instead of being silently swallowed.
    """
    tree = etree.HTML(page_html)
    nodes = {field: tree.xpath(xp) for field, xp in _LISTING_XPATHS.items()}
    records = []
    for idx in range(len(nodes['项目名称'])):
        missing = [field for field, found in nodes.items() if idx >= len(found)]
        if missing:
            print(f"异常: 第{idx + 1}条记录缺少字段 {', '.join(missing)}")
            continue
        record = {
            field: found[idx].xpath("string(.)") for field, found in nodes.items()
        }
        # Trim surrounding whitespace from the address.
        record['项目地址'] = record['项目地址'].strip()
        # Drop the fixed label text that is part of the node text.
        record['面积'] = record['面积'].replace('建面', '')
        record['总价'] = record['总价'].replace('总价', '')
        records.append(record)
    return records


# Cookie header captured from a logged-in browser session on zz.fang.ke.com.
# NOTE(security): this value contains session credentials (login token,
# security ticket). It expires and should not be committed or published;
# replace it with a fresh value from your own browser before running.
# The literal is split into adjacent string pieces because a single-quoted
# string cannot span physical lines; the concatenated value is unchanged.
Cookie = (
    'lianjia_uuid=c69a0257-ad07-4ee6-a77b-d0a8ae364f19; digv_extends=%7B%22utmTrackId%22%3A%22180514533%22%7D; select_city=410100; lianjia_ssid=cda478b9-a29c-482d-902a-fbbf36ef910a; crosSdkDT2019DeviceId=-9dgpud--e1brmv-ovzn0buj019hkq0-rj5nidxv8; _ga=GA1.2.1321224426.1713143752; _gid=GA1.2.68286756.1713143752; __xsptplusUT_788=1; _jzqa=1.2303208279780800800.1713143752.1713143752.1713143752.1; _jzqc=1; _jzqx=1.1713143752.1713143752.1.jzqsr=zz%2Eke%2Ecom|jzqct=/.-; _jzqckmp=1; ke_uuid=614b30d986fc259b87d777a3fa7da189; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2218d67c654b638a-0470d0ab78c575-3e604809-1327104-18d67c654b7594%22%2C%22%24device_id%22%3A%2218d67c654b638a-0470d0ab78c575-3e604809-1327104-18d67c654b7594%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E4%BB%98%E8%B4%B9%E5%B9%BF%E5%91%8A%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_utm_source%22%3A%22360%22%2C%22%24latest_utm_medium%22%3A%22pinzhuan%22%2C%22%24latest_utm_campaign%22%3A%22biaoti%22%2C%22%24latest_utm_content%22%3A%22biaoti%22%2C%22%24latest_utm_term%22%3A%22biaoti%22%7D%7D; _qzjc=1; digData=%7B%22key%22%3A%22loupan_index%22%7D; login_ucid=2000000045530134; lianjia_token=2.00144024156b15f5ef05ed0d24a97241ed; lianjia_token_secure=2.00144024156b15f5ef05ed0d24a97241ed; security_ticket=JrDFpWg3TXSJOrEXPQhIX9clqR/Bk7W9RLW3NHYwOtDlbIkpZxkbYtPbhci4h8AaBw4JnRlPiW3Ey0RaYj9Cxop1kuKMh89I4YpGTQfed1heKxFmgDgkhZu3oI0VeSoNRHXYD/+IWbLvBrUmRqxS7a4mhEItL7/R0X7FByfiJEo=; ftkrc_=41accf7d-0651-4e65-8df1-ec15579ef6af; lfrc_=d8a9ea15-a6ca-4377-b9cb-c80611472e29; '
    'lj_newh_session=eyJpdiI6Ik54aVhwaUZHUHpNbjhTWU8yWVZHUlE9PSIsInZhbHVlIjoiUXR0NjBvZkZrbGxMM2NCaDQ0Y3ZtbVpIazhRXC9yWTZXRGtSOHpCNXZFTDFVZkdaZUVjUXNIaENZd0ZtbzhrK1o4ZVpTbnZ2SEpoR3FGMHBqSVZtTTNBPT0iLCJtYWMiOiIwNzU0NjZkMDFkYTBlMWJiNjVjNGFjNDBiZmZjMGE2ODcwNzdiZGY1MWJhZTc0YjU5ZDU1MzA3ZDVjOGVjYzgwIn0%3D; _qzja=1.1289423564.1713143751997.1713143751997.1713143751997.1713143831552.1713143839998.0.0.0.4.1; _qzjb=1.1713143751997.4.0.0.0; _qzjto=4.1.0; _jzqb=1.4.10.1713143752.1; srcid=eyJ0IjoiXCJ7XFxcImRhdGFcXFwiOlxcXCJhMzkxNjQ0YzkwMDMzNzA5Mjg3NTgyOWZmZDc4ZmNiMzZjYTg4ZTgwYWEwZGM5ZGI1YzMwMTA1ZTUzYzlkMzhhYzg4ODk2YWM1MWUxN2M1NWY3OWVjZWU0MTNiNTNkNjJlOGUzMjAwMWU1YzY4ZjQ1N2ZiYzJhZDU0ZGY2ZGJjY2MxMTkxMzRhOWZlMmM0NmIyOTVhMWM1MmEzNmI2N2FhNjdmNDJlNmZhZTdjYjRiYTQ2OGYwMzc5N2Q1ZDRkYzc3ZGMwNGY0ZjU1OWQzYWVlZDM1ZGY4OGUwYzBkYTc1YTY4Nzk3MTZhZDQyOTAxYzA2NDNkZGE3OTQ3NzIzZTZkXFxcIixcXFwia2V5X2lkXFxcIjpcXFwiMVxcXCIsXFxcInNpZ25cXFwiOlxcXCI5ZDk2YmY0OFxcXCJ9XCIiLCJyIjoiaHR0cHM6Ly96ei5mYW5nLmtlLmNvbS9sb3VwYW4vcGcxLyIsIm9zIjoid2ViIiwidiI6IjAuMSJ9; __xsptplus788=788.1.1713143793.1713143873.3%234%7C%7C%7C%7C%7C%23%23%23'
)

# Start the scrape only when the file is run as a script, not on import.
if __name__ == '__main__':
    data(Cookie)

获取信息如下

 

posted on 2024-04-15 16:06  五杀摇滚小拉夫  阅读(15)  评论(0)  编辑  收藏  举报