某居客页面爬虫

支持 2023 版页面。站点每隔一段时间会要求进行验证码验证,此时脚本会暂停并提示在浏览器中打开对应页面手动完成验证,完成后按回车键重试;爬虫本身不会自动处理验证码(可自行扩展)。

pip前置安装项:

pip install logzero
pip install bs4
pip install requests
pip install html5lib
pip install lxml

代码:

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup
from logzero import logger
import random
import time

def LOG_INFO(string):
  """Forward *string* to the logzero logger at INFO level."""
  logger.info(string)
def LOG_DEBUG(string):
  """Forward *string* to the logzero logger at DEBUG level."""
  logger.debug(string)
def LOG_WARN(string):
  """Forward *string* to the logzero logger at WARNING level."""
  logger.warning(string)
def LOG_ERR(string):
  """Forward *string* to the logzero logger at ERROR level."""
  logger.error(string)

class House:
  """Plain record holding the fields scraped for one house listing.

  Every field is stored as a string (``cmu_desc`` is a list of strings).
  The ``assign_*`` / ``append_*`` setters normalize their input with
  ``NormalizeString`` before storing it.
  """

  def __init__(self):
    # Listing title
    self.title = ''
    # Community overview (wide area + narrow area); normally two elements,
    # e.g. '华强南', '南华花园', but may hold fewer when the page lacks them.
    self.cmu_desc = []
    # Total price
    self.price = ''
    # Floor
    self.floor = ''
    # Property type
    self.type = ''
    # Unit layout (rooms / halls)
    self.unit_type = ''
    # Area
    self.area = ''
    # Price per unit of area
    self.price_per = ''
    # Orientation
    self.direct = ''
    # Year
    self.year = ''
    # Furnishing / decoration
    self.furnish = ''
    # Budget
    self.budget = ''
    # Community name
    self.community = ''
    # Detail page URL (lets a human fill in fields the crawler missed)
    self.detail_page_href = ''

  def NormalizeString(self, origin):
    """Return *origin* without newlines, carriage returns, tabs or outer spaces."""
    return origin.replace('\n', '').replace('\r', '').replace('\t', '').strip()

  def assign_title(self, x):
    """Store the normalized listing title."""
    self.title = self.NormalizeString(x)

  def append_cmudesc(self, x):
    """Append one normalized community descriptor to ``cmu_desc``."""
    self.cmu_desc.append(self.NormalizeString(x))

  def assign_price(self, x, xu):
    """Store the total price as normalized amount *x* followed by unit *xu*."""
    self.price = self.NormalizeString(x) + self.NormalizeString(xu)

  def assign_price_per(self, x, xu):
    """Store the unit price as normalized amount *x* followed by unit *xu*."""
    self.price_per = self.NormalizeString(x) + self.NormalizeString(xu)

  def assign_type(self, x):
    """Store the normalized property type."""
    self.type = self.NormalizeString(x)

  def assign_area(self, x):
    """Store the normalized area text."""
    self.area = self.NormalizeString(x)

  def assign_direct(self, x):
    """Store the normalized orientation."""
    self.direct = self.NormalizeString(x)

  def assign_unittype(self, x):
    """Store the normalized unit layout."""
    self.unit_type = self.NormalizeString(x)

  def assign_year(self, x):
    """Store the normalized year."""
    self.year = self.NormalizeString(x)

  def assign_floor(self, x):
    """Store the normalized floor description."""
    self.floor = self.NormalizeString(x)

  def assign_furnish(self, x):
    """Store the normalized furnishing description."""
    self.furnish = self.NormalizeString(x)

  def assign_budget(self, x):
    """Store the normalized budget text."""
    self.budget = self.NormalizeString(x)

  def assign_community(self, x):
    """Store the normalized community name."""
    self.community = self.NormalizeString(x)

  def assign_detail_page_href(self, x):
    """Store the normalized detail page URL."""
    self.detail_page_href = self.NormalizeString(x)

  def dump_string(self):
    """Return all fields joined by ``--`` in a fixed column order.

    Exactly two community descriptor columns are always emitted: missing
    ones are rendered as empty strings and extras beyond the second are
    ignored, so a partially scraped listing no longer raises IndexError.
    """
    cmu_wide, cmu_narrow = (self.cmu_desc + ['', ''])[:2]
    columns = [
      self.title,
      cmu_wide,
      cmu_narrow,
      self.price,
      self.floor,
      self.type,
      self.unit_type,
      self.area,
      self.price_per,
      self.direct,
      self.year,
      self.furnish,
      self.budget,
      self.community,
      self.detail_page_href,
    ]
    return "--".join(columns)

# Houses collected so far; RunWalker appends each emitted House here.
g_house_list = []
# Titles consulted by RunWalker when checking for duplicate listings.
g_title_list = []

## Global Config
# Inclusive bounds, in seconds, of the random delay the crawler sleeps for.
g_sleep_time_begin = 1
g_sleep_time_end = 3
# Highest listing page index walked by main() (pages 1..g_index_max).
g_index_max = 100

def _GenRandomInteger(begin, end):
  """Return a random integer N with begin <= N <= end (both ends inclusive)."""
  picked = random.randint(begin, end)
  return picked

def _GenURLContext(index):
  """Return the listing page URL whose page number is *index*."""
  return "https://m.anjuke.com/sale/ditiefang-l1037/p{}/?from=navigation".format(index)

def _HandleHouseContext(hse):
  """Consume one finished house record.

  This is the extension point for output (console, file, ...); the demo
  implementation writes the record's dump_string() line to stdout.
  """
  record_line = hse.dump_string()
  print(record_line)

def _HandleWebsiteVerifyRetry(url):
  """Pause until the operator has verified *url* manually.

  Shows a prompt naming the URL and blocks on input() until a line is
  submitted on stdin; the entered text is discarded.
  """
  prompt = "\nPlease open '{}' to verify first, then press any key to retry...".format(url)
  input(prompt)

def _MappingTitleTextRule(hse, title, text):
  """Route one detail-page (title, text) pair to the matching House setter.

  Titles that are not recognized are ignored. Always returns None.
  """
  # The unit price setter takes a separate unit argument; the detail list
  # supplies the value as a single text, so the unit is passed empty.
  if title == "单价":
    hse.assign_price_per(text, "")
    return

  setter_by_title = {
    "户型": "assign_unittype",
    "建筑面积": "assign_area",
    "楼层": "assign_floor",
    "朝向": "assign_direct",
    "类型": "assign_type",
    "装修": "assign_furnish",
    "年代": "assign_year",
    "预算": "assign_budget",
    "小区": "assign_community",
  }
  setter_name = setter_by_title.get(title)
  if setter_name is not None:
    getattr(hse, setter_name)(text)


def _AssignHouseDetailFields(hse, listwp_obj):
  """Copy the title/text pairs found under *listwp_obj* onto *hse*.

  Processes, in order: every ``div.list-item`` descendant, then the first
  ``a`` whose class is ``list-item with-arrow``, then the first ``a`` whose
  class is ``list-item with-up-arrow``. Entries lacking either the title
  span or the text span are skipped.
  """
  def _apply(entry):
    # Forward one entry only when both of its spans are present.
    title_span = entry.find("span", class_='title')
    text_span = entry.find("span", class_='text')
    if title_span is None or text_span is None:
      return
    _MappingTitleTextRule(hse, title_span.text, text_span.text)

  for div_entry in listwp_obj.find_all("div", class_='list-item') or ():
    _apply(div_entry)

  for anchor_class in ('list-item with-arrow', 'list-item with-up-arrow'):
    anchor_entry = listwp_obj.find("a", class_=anchor_class)
    if anchor_entry is not None:
      _apply(anchor_entry)

def HandleDetailPage(hse, detail_page_url):
  """Fetch a listing's detail page and copy its fields onto *hse*.

  While the response body is empty or lacks the ``baseinfo-title`` block,
  the operator is asked to verify the page manually and the page is
  fetched again.

  Args:
    hse: House record to fill in.
    detail_page_url: URL of the listing's detail page.

  Returns:
    False when the HTTP status is not 200, or when the ``houseinfo-list``
    element or its ``li`` items are missing; True otherwise.
  """
  header_self = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'
  }

  resp = requests.get(detail_page_url, headers=header_self)
  while True:
    if resp.status_code != 200:
      LOG_WARN("Status code[{}] failed".format(resp.status_code))
      return False

    resp_text = resp.text
    # Parse only non-empty bodies; the title block doubles as the marker
    # that the real detail page (not a verification page) was served.
    resp_obj = BeautifulSoup(resp_text, 'lxml') if resp_text else None
    if resp_obj is not None and resp_obj.find("div", class_='baseinfo-title') is not None:
      break

    _HandleWebsiteVerifyRetry(detail_page_url)
    resp = requests.get(detail_page_url, headers=header_self)

  ## Price
  total_price_box = resp_obj.find("div", class_='baseinfo-2l-data-left')
  if total_price_box is not None:
    total_price_num = total_price_box.find("span", class_='baseinfo-num')
    total_price_unit = total_price_box.find("span", class_='baseinfo-unit')
    if total_price_num is not None and total_price_unit is not None:
      hse.assign_price(total_price_num.text, total_price_unit.text)

  ## Per-Price
  unit_price_box = resp_obj.find("div", class_='baseinfo-2l-data-right')
  if unit_price_box is not None:
    unit_price_num = unit_price_box.find("span", class_='baseinfo-num')
    unit_price_unit = unit_price_box.find("span", class_='baseinfo-unit')
    if unit_price_num is not None and unit_price_unit is not None:
      # Must go to price_per; writing it via assign_price would clobber
      # the total price stored just above.
      hse.assign_price_per(unit_price_num.text, unit_price_unit.text)

  ## Detail
  house_detail_list = resp_obj.find("ul", class_='houseinfo-list')
  if house_detail_list is None:
    LOG_WARN("Try detail page list failed")
    return False

  house_detail_items = house_detail_list.find_all("li")
  if not house_detail_items:
    LOG_WARN("Try detail page list item failed")
    return False

  for house_detail_item in house_detail_items:
    _AssignHouseDetailFields(hse, house_detail_item)

  return True

def RunWalker(origin_url):
  """Crawl one listing page and emit every new house found on it.

  While the response body is empty or contains no ``li.item-wrap``
  entries, the operator is asked to verify the page manually and the page
  is fetched again. For each entry the detail page is crawled first
  (best effort: a failure is logged and the entry is still processed),
  then the title, community descriptors, price, layout and area are read
  from the list entry itself. Entries without a detail link, content
  wrapper or title are skipped.

  A house whose title has not been seen before is recorded in
  g_title_list, appended to g_house_list and passed to
  _HandleHouseContext; houses with an already-seen title are dropped.

  Args:
    origin_url: URL of the listing page.

  Returns:
    False when the HTTP status is not 200, True otherwise.
  """
  header_self = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'
  }

  resp = requests.get(origin_url, headers=header_self)
  while True:
    if resp.status_code != 200:
      LOG_WARN("Status code[{}] failed".format(resp.status_code))
      return False

    resp_text = resp.text
    if len(resp_text) == 0:
      _HandleWebsiteVerifyRetry(origin_url)
      resp = requests.get(origin_url, headers=header_self)
      continue

    resp_obj = BeautifulSoup(resp_text, 'lxml')

    house_iter = resp_obj.find_all('li', class_='item-wrap')
    if len(house_iter) == 0:
      LOG_WARN("Empty object of '{}'".format(origin_url))
      _HandleWebsiteVerifyRetry(origin_url)
      resp = requests.get(origin_url, headers=header_self)
      continue

    break

  for house_main in house_iter:
    time.sleep(_GenRandomInteger(g_sleep_time_begin, g_sleep_time_end))
    house_a = house_main.find('a', class_='cell-wrap MLIST_MAIN')
    if house_a is None:
      continue

    house_a_href = house_a.get('href')
    LOG_DEBUG("Handle defail page '{}'".format(house_a_href))
    # get() yields None when the anchor has no href attribute, so test
    # truthiness instead of calling len() on it.
    if not house_a_href:
      LOG_WARN("Empty detail page url, skip.")
      continue

    hse = House()
    hse.assign_detail_page_href(house_a_href)
    if HandleDetailPage(hse, house_a_href) == False:
      time.sleep(_GenRandomInteger(g_sleep_time_begin, g_sleep_time_end))
      LOG_WARN("Handle detail page operation failed")

    house_div_cwrap = house_a.find('div', class_='content-wrap')
    if house_div_cwrap is None:
      continue

    house_div_twrapl2 = house_div_cwrap.find('div', class_='title-wrap lines2')
    if house_div_twrapl2 is None:
      continue
    # Title
    house_span_title = house_div_twrapl2.find('span', class_='content-title')
    if house_span_title is None or len(house_span_title.text) == 0:
      continue
    hse.assign_title(house_span_title.text)

    house_div_cmuwrap = house_div_cwrap.find('div', class_='desc-wrap desc-wrap-community')
    if house_div_cmuwrap is not None:
      # Community
      for cmu_class in ('content-desc', 'content-desc content-desc-community'):
        house_span_cmu = house_div_cmuwrap.find('span', class_=cmu_class)
        if house_span_cmu is not None and len(house_span_cmu.text) > 0:
          hse.append_cmudesc(house_span_cmu.text)

    # Pick the first div whose class list is exactly ['desc-wrap'], i.e.
    # not the community variant that also carries that class.
    house_div_descwrap = None
    for house_div_descwrap_item in house_div_cwrap.find_all('div', class_='desc-wrap'):
      if house_div_descwrap_item.get('class') == ['desc-wrap']:
        house_div_descwrap = house_div_descwrap_item
        break

    if house_div_descwrap is not None:
      house_price_div = house_div_descwrap.find("div", class_='price-wrap')
      if house_price_div is not None:
        house_price_ctx = house_price_div.find("span", class_='content-price')
        house_price_unit = house_price_div.find("span", class_='content-unit')
        if house_price_ctx is not None and house_price_unit is not None:
          hse.assign_price(house_price_ctx.text, house_price_unit.text)

      # Fill layout and area from the list entry only when they are still
      # empty after the detail page pass.
      for house_content_desc_item in house_div_descwrap.find_all("span", class_='content-desc'):
        desc_text = house_content_desc_item.text
        if ('室' in desc_text or '厅' in desc_text) and len(hse.unit_type) == 0:
          hse.assign_unittype(desc_text)
        if "㎡" in desc_text and len(hse.area) == 0:
          hse.assign_area(desc_text)

    # Record the title so later duplicates are actually detected; without
    # this the title list would stay empty and every house would pass.
    if hse.title not in g_title_list:
      g_title_list.append(hse.title)
      g_house_list.append(hse)
      _HandleHouseContext(hse)

  return True

def main():
  """Walk listing pages 1..g_index_max in order, sleeping between pages.

  A page whose walk fails is logged and the loop moves on to the next one.
  """
  last_page = g_index_max
  LOG_DEBUG("Begin walker routine")
  page_urls = (_GenURLContext(page_no) for page_no in range(1, last_page + 1))
  for page_url in page_urls:
    LOG_DEBUG("Begin to get '{}' context".format(page_url))
    walked = RunWalker(page_url)
    if walked == False:
      LOG_WARN("Get context '{}' failed".format(page_url))

    # Random pause between pages.
    pause_seconds = _GenRandomInteger(g_sleep_time_begin, g_sleep_time_end)
    time.sleep(pause_seconds)

# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
  main()
posted @ 2023-12-26 11:06  倚剑问天  阅读(19)  评论(0编辑  收藏  举报