随笔 - 7  文章 - 0 评论 - 0 阅读 - 54
< 2025年3月 >
23 24 25 26 27 28 1
2 3 4 5 6 7 8
9 10 11 12 13 14 15
16 17 18 19 20 21 22
23 24 25 26 27 28 29
30 31 1 2 3 4 5

import requests
import pymongo
from lxml import etree

class Houst(object):
    """Scrape the Lianjia "ershoufang" listing index and store it in MongoDB.

    The spider downloads the page at ``self.url``, extracts three text fields
    from every listing element with XPath, prints each record and inserts it
    into the ``house`` collection of the ``my_data`` database.
    """

    # Seconds to wait for the HTTP server. Without a timeout ``requests.get``
    # may block indefinitely on an unresponsive host.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Set the target URL and request headers and open the MongoDB client."""
        self.url = "https://cs.lianjia.com/ershoufang/"
        # Browser-like User-Agent sent with every request.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.95 Safari/537.36"
        }
        # Connect to MongoDB
        self.client = pymongo.MongoClient(host="localhost", port=27017)
        self.db = self.client["my_data"]

    def get_data_index(self):
        """Download the listing index page.

        Returns:
            The response body decoded as UTF-8 when the server answers with
            status 200, otherwise ``None``.

        Raises:
            requests.RequestException: propagated from ``requests.get`` when
                the request fails or exceeds ``REQUEST_TIMEOUT``.
        """
        response = requests.get(
            self.url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        if response.status_code != 200:
            return None
        response.encoding = "utf-8"
        return response.text

    @staticmethod
    def _first_or_none(nodes):
        """Return the first XPath result in *nodes*, or ``None`` when empty."""
        return nodes[0] if nodes else None

    def parse_data_index(self, response):
        """Yield one record per listing element found in the page HTML.

        Args:
            response: HTML text of the index page. ``None`` or an empty
                string yields nothing.

        Yields:
            A dict with the keys '名称' (title link text), "户型" (text of the
            first block in the third info row) and "关注人数" (text of the
            fourth info row). A field whose XPath matches nothing is ``None``
            instead of aborting the whole crawl with ``IndexError``.
        """
        if not response:
            return
        html = etree.HTML(response)
        if html is None:
            return
        listings = html.xpath(
            '//ul[@class="sellListContent"]//li[@class="clear LOGVIEWDATA LOGCLICKDATA"]'
        )
        for listing in listings:
            title = self._first_or_none(listing.xpath('./div[1]/div[1]/a/text()'))
            info = self._first_or_none(listing.xpath("./div[1]/div[3]/div[1]/text()"))
            number = self._first_or_none(listing.xpath("./div[1]/div[4]/text()"))
            yield {
                '名称': title,
                "户型": info,
                "关注人数": number
            }

    def main(self):
        """Fetch the index page, then print and store every parsed listing."""
        response = self.get_data_index()
        if response is None:
            # Non-200 answer: there is nothing to parse or store.
            print("Failed to fetch listing page; nothing stored.")
            return
        for item in self.parse_data_index(response):
            print(item)
            # Insert inside the loop so that every listing is persisted, not
            # only the last one, and so an empty page never touches ``item``.
            self.db.house.insert_one(item)

if __name__ == "__main__":
    # Script entry point: build the spider and run a single crawl.
    Houst().main()
posted on   下雨天的眼睛  阅读(4)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· Manus的开源复刻OpenManus初探
· AI 智能体引爆开源社区「GitHub 热点速览」
· C#/.NET/.NET Core技术前沿周刊 | 第 29 期(2025年3.1-3.9)
· 从HTTP原因短语缺失研究HTTP/2和HTTP/3的设计差异
点击右上角即可分享
微信分享提示