import requests
from lxml import etree

class Houst(object):
    """Spider for second-hand housing listings on Lianjia (Yibin)."""

    def __init__(self):
        self.url = "https://yibin.lianjia.com/ershoufang/pg{}/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"
        }

    def get_url_list(self):
        # Build the list-page URLs for pages 1-10
        url_list = []
        for num in range(1, 11):
            url_list.append(self.url.format(num))
        return url_list

    def get_data_index(self, url):
        # Fetch a page and return its HTML text, or None on a non-200 response
        response = requests.get(url, headers=self.headers)
        response.encoding = "utf-8"
        if response.status_code == 200:
            return response.text
        return None

    def parse_data_index(self, response):
        html = etree.HTML(response)
        # Collect every listing <li> under the result <ul>
        data_list = html.xpath('//ul[@class="sellListContent"]//li[@class="clear LOGVIEWDATA LOGCLICKDATA"]')
        for data in data_list:
            title = data.xpath('./div[1]/div[1]/a/text()')[0]
            info = data.xpath("./div[1]/div[3]/div[1]/text()")[0]
            number = data.xpath("./div[1]/div[4]/text()")[0]
            # Grab the detail-page URL
            detail_url = data.xpath('./div[1]/div[1]/a/@href')[0]
            # Request the detail page; skip this listing if the request failed
            detail_resp = self.get_data_index(detail_url)
            if detail_resp is None:
                continue
            # Parse the detail page for the listing date
            detail_html = etree.HTML(detail_resp)
            data_time = detail_html.xpath('//*[@id="introduction"]/div/div/div[2]/div[2]/ul/li[1]/span[2]/text()')[0]
            print(title, info, number, data_time)
        print("*-*-*-*-"*20)

    def main(self):
        # Walk the list pages one by one; skip any page that failed to load
        for url in self.get_url_list():
            response = self.get_data_index(url)
            if response:
                self.parse_data_index(response)


if __name__ == '__main__':
    spider = Houst()
    spider.main()
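
For a quick check against a single list page, the same class can be driven directly. This is a minimal sketch using only the methods defined above, not part of the original script:

# Quick single-page check: fetch page 1 and parse it in isolation
spider = Houst()
first_page = spider.get_data_index(spider.url.format(1))
if first_page:
    spider.parse_data_index(first_page)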

posted on 2024-05-22 18:09  下雨天的眼睛