数据分析实战(8)-贝壳租房XPath爬虫+数据分析实战
sadsadsadsa
import requests
from lxml import etree

# Fetch the rental listing page at `url`, walk the listing <div> elements with
# XPath, and print the fields of the first listing that parses completely.
#
# NOTE(review): the XPath expressions and positional indexes below assume a
# specific page layout; confirm them against the live markup before relying on
# the extracted values.

basic_url = "https://xa.zu.ke.com"
url = "https://xa.zu.ke.com/zufang/"
header = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/78.0.3904.108 Safari/537.36"
    )
}

# Always pass a timeout: without one, requests waits indefinitely on an
# unresponsive server and the script never finishes.
html = requests.get(url=url, headers=header, timeout=10).text
tree = etree.HTML(html)

# Get the list of listing <div> elements.
div_list = tree.xpath('//div[@class="content__list"]/div')

for div in div_list:
    try:
        # Parse the fields. Every `[n]` lookup raises IndexError when the
        # expected node is missing; fields printed before that point remain
        # in the output for the skipped listing.
        name = div.xpath('.//p[1]/a/text()')[0]
        print(name)

        # The href is combined with the site root to form the detail URL.
        target_url = basic_url + div.xpath('.//p[1]/a/@href')[0]
        print(target_url)

        area = div.xpath('.//p[2]/a[1]/text()')[0]
        print(area)

        subdivide = div.xpath('.//p[2]/a[2]/text()')[0]
        print(subdivide)

        # The original note flags this lookup as problematic (example listing
        # given: "Chazhang Xinyuan"), so it is left disabled.
        # community_name = div.xpath('.//p[2]/a[2]/text()')
        # print(community_name)

        # Evaluate the text nodes of the second <p> once; the next three
        # fields are read from fixed positions in this list.
        p2_texts = div.xpath('.//p[2]/text()')

        space_size = p2_texts[4]
        print(space_size)

        towards = p2_texts[5]
        print(towards)

        room_type = p2_texts[6]
        print(room_type)

        # The original note flags this lookup as problematic (example listing
        # given: "Xi'an Wutong Apartment"), so it is left disabled.
        # apartment_name = div.xpath('.//p[2]/p/text()')[0]
        # print(apartment_name)

        floor = div.xpath('.//p[2]/span/text()')[1]
        print(floor)

        last_updated = div.xpath('.//p[3]/text()')[0]
        print(last_updated)

        is_new = div.xpath('.//p[4]/i[1]/text()')[0]
        print(is_new)

        # Disabled in the original.
        # rent_type = div.xpath('.//p[4]/i[3]/text()')[0]
        # print(rent_type)

        # Not indexed: this prints the (possibly empty) list of matches, so a
        # missing tag does not cause the listing to be skipped.
        decoration = div.xpath('div[1]/p[4]/i[4]/text()')
        print(decoration)

        price = div.xpath('.//span/em/text()')[0]
        print(price)

        data_unit = div.xpath('./div[1]/span/text()')[0]
        print(data_unit)

        # Stop after the first listing that parsed without error.
        break
    except IndexError:
        # Best-effort: a listing that lacks an expected node is skipped and
        # the next one is tried.
        continue