某居客页面爬虫
支持2023版本。网站每隔一段时间会要求进行验证码验证,此时脚本会暂停,并提示在浏览器中打开对应页面手动完成验证,完成后按回车键重试;爬虫本身不会自动处理验证码(可自行扩展)。
pip前置安装项:
pip install logzero
pip install bs4
pip install requests
pip install html5lib
pip install lxml
代码:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
from logzero import logger
import random
import time
def LOG_INFO(string):
    """Forward ``string`` to the module-level ``logger`` at INFO level."""
    logger.info(string)
def LOG_DEBUG(string):
    """Forward ``string`` to the module-level ``logger`` at DEBUG level."""
    logger.debug(string)
def LOG_WARN(string):
    """Forward ``string`` to the module-level ``logger`` at WARNING level."""
    logger.warning(string)
def LOG_ERR(string):
    """Forward ``string`` to the module-level ``logger`` at ERROR level."""
    logger.error(string)
class House:
    """One scraped listing.

    Every field is kept as a string that has been passed through
    ``NormalizeString``; a field that was never assigned stays empty.
    """

    def __init__(self):
        # Listing title
        self.title = ''
        # Community overview (wide area + narrow area); normally two entries,
        # but it may hold fewer when the page lacks one of them
        self.cmu_desc = []
        # Total price
        self.price = ''
        # Floor
        self.floor = ''
        # Property type
        self.type = ''
        # Layout (rooms / halls)
        self.unit_type = ''
        # Area
        self.area = ''
        # Price per unit of area
        self.price_per = ''
        # Orientation
        self.direct = ''
        # Year built
        self.year = ''
        # Furnishing / decoration
        self.furnish = ''
        # Budget
        self.budget = ''
        # Community name
        self.community = ''
        # Detail page URL (kept so missing data can be filled in by hand)
        self.detail_page_href = ''

    def NormalizeString(self, origin):
        """Return ``origin`` without newlines, carriage returns, tabs and outer whitespace."""
        return origin.replace('\n', '').replace('\r', '').replace('\t', '').strip()

    def assign_title(self, x):
        """Store the normalized title."""
        self.title = self.NormalizeString(x)

    def append_cmudesc(self, x):
        """Append one normalized community-overview entry."""
        self.cmu_desc.append(self.NormalizeString(x))

    def assign_price(self, x, xu):
        """Store the total price as normalized amount ``x`` followed by unit ``xu``."""
        self.price = self.NormalizeString(x) + self.NormalizeString(xu)

    def assign_price_per(self, x, xu):
        """Store the unit price as normalized amount ``x`` followed by unit ``xu``."""
        self.price_per = self.NormalizeString(x) + self.NormalizeString(xu)

    def assign_type(self, x):
        """Store the normalized property type."""
        self.type = self.NormalizeString(x)

    def assign_area(self, x):
        """Store the normalized area."""
        self.area = self.NormalizeString(x)

    def assign_direct(self, x):
        """Store the normalized orientation."""
        self.direct = self.NormalizeString(x)

    def assign_unittype(self, x):
        """Store the normalized layout."""
        self.unit_type = self.NormalizeString(x)

    def assign_year(self, x):
        """Store the normalized build year."""
        self.year = self.NormalizeString(x)

    def assign_floor(self, x):
        """Store the normalized floor."""
        self.floor = self.NormalizeString(x)

    def assign_furnish(self, x):
        """Store the normalized furnishing description."""
        self.furnish = self.NormalizeString(x)

    def assign_budget(self, x):
        """Store the normalized budget."""
        self.budget = self.NormalizeString(x)

    def assign_community(self, x):
        """Store the normalized community name."""
        self.community = self.NormalizeString(x)

    def assign_detail_page_href(self, x):
        """Store the normalized detail page URL."""
        self.detail_page_href = self.NormalizeString(x)

    def dump_string(self):
        """Return all fields joined by ``--`` in a fixed order.

        The first two community-overview entries are emitted; missing ones are
        rendered as empty strings so a sparsely filled listing can still be
        dumped instead of raising IndexError.
        """
        # Pad to exactly two entries; extra entries are ignored as before.
        wide_area, narrow_area = (self.cmu_desc + ['', ''])[:2]
        fields = [
            self.title,
            wide_area,
            narrow_area,
            self.price,
            self.floor,
            self.type,
            self.unit_type,
            self.area,
            self.price_per,
            self.direct,
            self.year,
            self.furnish,
            self.budget,
            self.community,
            self.detail_page_href,
        ]
        return "--".join(fields)
# Houses accepted so far; RunWalker appends each accepted House here.
g_house_list = []
# Titles consulted by RunWalker's duplicate check.
g_title_list = []
## Global Config
# Inclusive lower/upper bounds (seconds) of the random pause passed to time.sleep.
g_sleep_time_begin = 1
g_sleep_time_end = 3
# Last list-page index; main() walks pages 1..g_index_max.
g_index_max = 100
def _GenRandomInteger(begin, end):
return random.randint(begin, end)
def _GenURLContext(index):
format_pre = "https://m.anjuke.com/sale/ditiefang-l1037/p{}/?from=navigation"
return format_pre.format(index)
def _HandleHouseContext(hse):
# Do anything you want. (output, save to file, etc.)
## This is just a demo, output to console ~~
print(hse.dump_string())
def _HandleWebsiteVerifyRetry(url):
    """Block on console input until the user confirms ``url`` has been verified by hand."""
    prompt = "\nPlease open '{}' to verify first, then press any key to retry...".format(url)
    # The typed text is irrelevant; input() is only used to pause the crawl.
    input(prompt)
def _MappingTitleTextRule(hse, title, text):
if title == "户型":
hse.assign_unittype(text)
elif title == "建筑面积":
hse.assign_area(text)
elif title == "楼层":
hse.assign_floor(text)
elif title == "朝向":
hse.assign_direct(text)
elif title == "类型":
hse.assign_type(text)
elif title == "装修":
hse.assign_furnish(text)
elif title == "年代":
hse.assign_year(text)
elif title == "预算":
hse.assign_budget(text)
elif title == "小区":
hse.assign_community(text)
elif title == "单价":
hse.assign_price_per(text, "")
else:
return
def _AssignHouseDetailFields(hse, listwp_obj):
    """Copy every label/value pair found under ``listwp_obj`` into ``hse``.

    Pairs are looked up in all ``div.list-item`` nodes first, then in the
    first ``a`` node of class ``list-item with-arrow`` and the first of class
    ``list-item with-up-arrow``.
    """

    def _apply_pair(node):
        # A node contributes only when it has both a title span and a text span.
        label_span = node.find("span", class_='title')
        value_span = node.find("span", class_='text')
        if label_span is None or value_span is None:
            return
        _MappingTitleTextRule(hse, label_span.text, value_span.text)

    for div_item in listwp_obj.find_all("div", class_='list-item') or ():
        _apply_pair(div_item)

    for anchor_class in ('list-item with-arrow', 'list-item with-up-arrow'):
        anchor_item = listwp_obj.find("a", class_=anchor_class)
        if anchor_item is not None:
            _apply_pair(anchor_item)
def HandleDetailPage(hse, detail_page_url):
    """Fetch a listing's detail page and copy its fields into ``hse``.

    The page is re-requested (after prompting the user through
    ``_HandleWebsiteVerifyRetry``) for as long as the body is empty or lacks
    the ``baseinfo-title`` block.

    Args:
        hse: the House being filled in.
        detail_page_url: URL of the detail page.

    Returns:
        True when the ``houseinfo-list`` items were found and processed;
        False on a non-200 response or when that list (or its items) is missing.
    """
    header_self = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'
    }
    resp = requests.get(detail_page_url, headers=header_self)
    while True:
        if resp.status_code != 200:
            LOG_WARN("Status code[{}] failed".format(resp.status_code))
            return False
        resp_text = resp.text
        if resp_text:
            resp_obj = BeautifulSoup(resp_text, 'lxml')
            ## Title
            if resp_obj.find("div", class_='baseinfo-title') is not None:
                break
        # Empty body or no title block: ask for manual verification, then retry.
        _HandleWebsiteVerifyRetry(detail_page_url)
        resp = requests.get(detail_page_url, headers=header_self)

    ## Price (left box) and Per-Price (right box)
    # The right-hand box must go to assign_price_per; sending it to
    # assign_price would overwrite the total price with the unit price.
    price_boxes = (
        ('baseinfo-2l-data-left', hse.assign_price),
        ('baseinfo-2l-data-right', hse.assign_price_per),
    )
    for box_class, assign in price_boxes:
        price_box = resp_obj.find("div", class_=box_class)
        if price_box is None:
            continue
        price_num = price_box.find("span", class_='baseinfo-num')
        price_unit = price_box.find("span", class_='baseinfo-unit')
        if price_num is not None and price_unit is not None:
            assign(price_num.text, price_unit.text)

    ## Detail
    house_detail_list = resp_obj.find("ul", class_='houseinfo-list')
    if house_detail_list is None:
        LOG_WARN("Try detail page list failed")
        return False
    house_detail_items = house_detail_list.find_all("li")
    if not house_detail_items:
        LOG_WARN("Try detail page list item failed")
        return False
    for house_detail_item in house_detail_items:
        _AssignHouseDetailFields(hse, house_detail_item)
    return True
def _FetchListItems(origin_url):
    """Download one list page and return its ``li.item-wrap`` nodes, or None on a non-200 response."""
    header_self = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'
    }
    resp = requests.get(origin_url, headers=header_self)
    while True:
        if resp.status_code != 200:
            LOG_WARN("Status code[{}] failed".format(resp.status_code))
            return None
        resp_text = resp.text
        if resp_text:
            house_iter = BeautifulSoup(resp_text, 'lxml').find_all('li', class_='item-wrap')
            if house_iter:
                return house_iter
            LOG_WARN("Empty object of '{}'".format(origin_url))
        # Empty body or no listings: ask for manual verification, then retry.
        _HandleWebsiteVerifyRetry(origin_url)
        resp = requests.get(origin_url, headers=header_self)


def _FillHouseFromListItem(hse, house_a):
    """Copy title, area descriptions, price, layout and area from a list entry; False means no usable title."""
    house_div_cwrap = house_a.find('div', class_='content-wrap')
    if house_div_cwrap is None:
        return False
    house_div_twrapl2 = house_div_cwrap.find('div', class_='title-wrap lines2')
    if house_div_twrapl2 is None:
        return False
    # Title
    house_span_title = house_div_twrapl2.find('span', class_='content-title')
    if house_span_title is None or not house_span_title.text:
        return False
    hse.assign_title(house_span_title.text)

    # Community
    house_div_cmuwrap = house_div_cwrap.find('div', class_='desc-wrap desc-wrap-community')
    if house_div_cmuwrap is not None:
        for span_class in ('content-desc', 'content-desc content-desc-community'):
            cmu_span = house_div_cmuwrap.find('span', class_=span_class)
            if cmu_span is not None and cmu_span.text:
                hse.append_cmudesc(cmu_span.text)

    # Pick the first div whose class list is exactly ['desc-wrap'], which
    # excludes variants such as 'desc-wrap desc-wrap-community'.
    house_div_descwrap = next(
        (
            item
            for item in house_div_cwrap.find_all('div', class_='desc-wrap')
            if item.get('class') == ['desc-wrap']
        ),
        None,
    )
    if house_div_descwrap is None:
        return True

    house_price_div = house_div_descwrap.find("div", class_='price-wrap')
    if house_price_div is not None:
        house_price_ctx = house_price_div.find("span", class_='content-price')
        house_price_unit = house_price_div.find("span", class_='content-unit')
        if house_price_ctx is not None and house_price_unit is not None:
            hse.assign_price(house_price_ctx.text, house_price_unit.text)

    # Layout and area are taken from the list page only when still unset.
    for desc_span in house_div_descwrap.find_all("span", class_='content-desc'):
        desc_text = desc_span.text
        if ('室' in desc_text or '厅' in desc_text) and not hse.unit_type:
            hse.assign_unittype(desc_text)
        if "㎡" in desc_text and not hse.area:
            hse.assign_area(desc_text)
    return True


def RunWalker(origin_url):
    """Crawl one list page and hand every new listing to ``_HandleHouseContext``.

    Each entry's detail page is fetched first, then the entry's own list-page
    data is applied on top. A detail-page failure is logged and the listing is
    still built from the list-page data. Listings whose title is already in
    ``g_title_list`` are skipped.

    Args:
        origin_url: URL of the list page.

    Returns:
        False when the list page answers with a non-200 status, True otherwise.
    """
    house_iter = _FetchListItems(origin_url)
    if house_iter is None:
        return False

    for house_main in house_iter:
        time.sleep(_GenRandomInteger(g_sleep_time_begin, g_sleep_time_end))
        house_a = house_main.find('a', class_='cell-wrap MLIST_MAIN')
        if house_a is None:
            continue
        house_a_href = house_a.get('href')
        LOG_DEBUG("Handle defail page '{}'".format(house_a_href))
        # get() yields None when the attribute is absent, so test truthiness
        # rather than calling len() on it.
        if not house_a_href:
            LOG_WARN("Empty detail page url, skip.")
            continue
        hse = House()
        hse.assign_detail_page_href(house_a_href)
        if not HandleDetailPage(hse, house_a_href):
            time.sleep(_GenRandomInteger(g_sleep_time_begin, g_sleep_time_end))
            LOG_WARN("Handle detail page operation failed")
        if not _FillHouseFromListItem(hse, house_a):
            continue
        if hse.title in g_title_list:
            continue
        # Record the title so the duplicate check above can match later entries.
        g_title_list.append(hse.title)
        g_house_list.append(hse)
        _HandleHouseContext(hse)
    return True
def main():
    """Walk list pages 1..g_index_max, pausing a random interval after each page."""
    last_page = g_index_max
    LOG_DEBUG("Begin walker routine")
    for page_no in range(1, last_page + 1):
        page_url = _GenURLContext(page_no)
        LOG_DEBUG("Begin to get '{}' context".format(page_url))
        succeeded = RunWalker(page_url)
        if not succeeded:
            LOG_WARN("Get context '{}' failed".format(page_url))
        pause_seconds = _GenRandomInteger(g_sleep_time_begin, g_sleep_time_end)
        time.sleep(pause_seconds)
# Start the crawl only when this file is executed as a script.
if __name__ == '__main__':
    main()
本文为博主总结文章,欢迎转载,请注明出处。