import requests
import pymongo
from lxml import etree
class Houst(object):
    """Scrape the Lianjia second-hand housing index page into MongoDB.

    The spider downloads the page at ``self.url``, extracts three text fields
    from every listing element and inserts one document per listing into the
    ``house`` collection of the ``my_data`` database.
    """

    def __init__(self):
        """Set the target URL and request headers and open the MongoDB client."""
        self.url = "https://cs.lianjia.com/ershoufang/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.95 Safari/537.36"
        }
        # Connect to MongoDB
        self.client = pymongo.MongoClient(host="localhost", port=27017)
        self.db = self.client["my_data"]

    def get_data_index(self, timeout=10):
        """Download the listing index page.

        Args:
            timeout: Seconds to wait for the server. Without a timeout
                ``requests.get`` may block indefinitely on a stalled
                connection.

        Returns:
            The response body decoded as UTF-8 when the status code is 200,
            otherwise ``None``.

        Raises:
            requests.RequestException: Propagated unchanged on network
                failure or timeout.
        """
        response = requests.get(self.url, headers=self.headers, timeout=timeout)
        if response.status_code != 200:
            return None
        response.encoding = "utf-8"
        return response.text

    @staticmethod
    def _first_text(nodes):
        """Return the first XPath text result as a plain ``str``, or ``None`` if there is none."""
        # str() turns lxml's string result into an ordinary str before it is
        # handed to the database driver.
        return str(nodes[0]) if nodes else None

    def parse_data_index(self, response):
        """Yield one dict per listing found in the page HTML.

        Args:
            response: HTML text as returned by ``get_data_index``; may be
                ``None`` or empty when the download failed.

        Yields:
            Dicts with the keys ``'名称'``, ``'户型'`` and ``'关注人数'``. The last
            two values are ``None`` when the corresponding node is missing.
            Listings without a title are skipped instead of raising
            ``IndexError``.
        """
        # A failed download yields nothing instead of crashing inside lxml.
        if not response:
            return
        html = etree.HTML(response)
        # NOTE(review): lxml may hand back None for content it cannot parse
        # into a tree; treat that like an empty page.
        if html is None:
            return
        listings = html.xpath(
            '//ul[@class="sellListContent"]//li[@class="clear LOGVIEWDATA LOGCLICKDATA"]'
        )
        for node in listings:
            title = self._first_text(node.xpath('./div[1]/div[1]/a/text()'))
            if title is None:
                continue
            yield {
                '名称': title,
                "户型": self._first_text(node.xpath("./div[1]/div[3]/div[1]/text()")),
                "关注人数": self._first_text(node.xpath("./div[1]/div[4]/text()")),
            }

    def main(self):
        """Fetch the index page, print every parsed listing and store it in MongoDB."""
        response = self.get_data_index()
        for item in self.parse_data_index(response):
            print(item)
            self.db.house.insert_one(item)
# Script entry point: run one crawl, then release the MongoDB connection.
if __name__ == '__main__':
    spider = Houst()
    try:
        spider.main()
    finally:
        # The client is opened in Houst.__init__; close it even if the crawl fails.
        spider.client.close()
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· Manus的开源复刻OpenManus初探
· AI 智能体引爆开源社区「GitHub 热点速览」
· C#/.NET/.NET Core技术前沿周刊 | 第 29 期(2025年3.1-3.9)
· 从HTTP原因短语缺失研究HTTP/2和HTTP/3的设计差异