Crawling the Entire Lianjia Website

I. Getting Lianjia's city URL naming (full pinyin + abbreviations)

[Image: city homepages at URLs of the form city-abbreviation + lianjia.com]

1. As the screenshot above shows, Lianjia builds each city's page URL from a city-name abbreviation plus the common domain,
i.e. city-abbreviation + lianjia.com.
There is a catch, though: some cities' abbreviations would collide with those of other cities, so the abbreviation is not always the obvious one. That means we need to fetch the authoritative list of city abbreviations from Lianjia itself.
[Image: method 1, the city list page]

[Image: method 2, the cityConfig ajax endpoint response]

2. The code is below; here I grab the links directly via the second method (the cityConfig API):

import requests
import json
import pprint
# Fetch each city name and its corresponding base URL
def get_url():
    get_city_url='https://ajax.api.lianjia.com/config/cityConfig/getConfig?type=province&category=1'
    json_data=json.loads(requests.get(url=get_city_url).text)['data']
    url_list=[]
    for key in json_data:
        for id in json_data[key]:
            ls={}
            ls[json_data[key][id]['name']]=json_data[key][id]['url']
            url_list.append(ls)
    return url_list
get_url()
[{'济南': 'https://jn.lianjia.com/'},
 {'泰安': 'https://ta.lianjia.com/'},
 {'临沂': 'https://linyi.lianjia.com/'},
 ...
 ...
 {'天津': 'https://tj.lianjia.com/'}]
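Since get_url() returns a list of single-entry dicts, it can be handy to merge them into one name-to-URL mapping (my addition, not part of the original flow):

# Merge the single-entry dicts into one {city_name: url} dict
city_map = {name: url for d in get_url() for name, url in d.items()}
city_map['天津']   # 'https://tj.lianjia.com/'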
# Each element is a single-entry dict; after this loop, s holds the last city's values view
for i in get_url():
    s=i.values()
# Step 1: the raw dict_values view
print('step1:')
print(s)
# Step 2: strip the dict_values wrapper from dict_values(['https://tj.lianjia.com/'])
print('step2:')
print(list(s))
# Step 3: get the URL as a plain string
print('step3:')
print(list(s)[0])
step1:
dict_values(['https://tj.lianjia.com/'])
step2:
['https://tj.lianjia.com/']
step3:
https://tj.lianjia.com/
# A list comprehension that keeps only the URLs and appends the ershoufang path
url_list=[list(i.values())[0]+'ershoufang/' for i in get_url()]
len(url_list)
url_list   
['https://weihai.lianjia.com/ershoufang/',
 'https://zjk.lianjia.com/ershoufang/',
 'https://yibin.lianjia.com/ershoufang/',
 ...
 ...
 'https://hrb.lianjia.com/ershoufang/',
 'https://tj.lianjia.com/ershoufang/']

II. Getting the maximum page count of each city's second-hand listings

[Image: listing page showing the total house count at the top, with dynamically loaded page numbers at the bottom]

There are two cases here. In the first, the total number of listings is shown at the top of the page while the page numbers at the bottom are loaded dynamically (so they cannot be extracted directly), as in the screenshot above. In the second, the total count is not shown, but the pagination data at the bottom can be read directly, as in the screenshot below. The first case covers most cities; the second applies to only a few, but during a full-site crawl those few are a real thorn in the side.
[Image: listing page whose bottom pagination data is directly readable]

# Case 1: works for most cities
import requests
from lxml import etree
import math
def get_max_page(ershou_url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
    }
    ershou_req=requests.get(url=ershou_url,headers=headers)
    web_html = etree.HTML(ershou_req.text)
    # Total listing count shown at the top of the page
    number_of_houses=web_html.xpath('//*[@id="content"]/div[1]/div[2]/h2/span/text()')[0]
    # 30 listings per page; Lianjia serves at most 100 pages
    max_page=math.ceil(int(number_of_houses)/30)
    return 100 if max_page>100 else max_page
get_max_page('https://zz.lianjia.com/ershoufang/')

# Case 2: needed for only a few cities
import json
def get_max_page2(ershou_url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
    }
    ershou_req=requests.get(url=ershou_url,headers=headers)
    web_html = etree.HTML(ershou_req.text)
    # The bottom pagination widget stores its state as JSON in a page-data attribute
    max_page=int(json.loads(web_html.xpath('//*[@id="content"]/div[1]/div[7]/div[2]/div/@page-data')[0])['totalPage'])
    return max_page
get_max_page2('https://zz.lianjia.com/ershoufang/')
100
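If you don't want to decide by hand which case a city falls into, the two functions can be chained with a simple fallback. A minimal sketch (my addition; it assumes case 1 raises an IndexError when the house-count element is absent):

def get_max_page_any(ershou_url):
    # Try the visible house count first, then fall back to the page-data JSON
    try:
        return get_max_page(ershou_url)
    except IndexError:
        return get_max_page2(ershou_url)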

I haven't dug into the page structure in detail, and Lianjia's layout does change from time to time...
Why cap the value at 100 above?
Lianjia displays at most 100 pages of second-hand listings per city, at 30 per page, i.e. 3,000 listings, but by slicing the query by district plus price range we can recover far more of the inventory. A sliced URL looks like https://zz.lianjia.com/ershoufang/jinshui/pg2p3/ (district jinshui, price bucket p3, page 2; slug and bucket shown here for illustration).
In the code below I fetch the district and price filter values dynamically and iterate over those slices to pull as much data as possible.

III. Collecting each listing's URL


Why collect the links first?
The information available on a city's listing pages themselves (e.g. https://zz.lianjia.com/ershoufang/) is limited, and we want the fullest possible set of fields, so we have to visit each listing's own detail page. Also, as explained above, each query only exposes 3,000 listings, so we gather links through the district + price slices, and some of those links come back duplicated (the duplicates are most likely other listings recommended alongside the filtered results; I'm not sure of the exact cause, and most accounts online even report that filtered searches return more listings than the site's stated total). To avoid wasting requests, I first dump all scraped links to a local file and then deduplicate them into a clean set of unique listing URLs.

headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
    }
# Fetch the listing links from a single results page
def get_houses_url(url):
    houses_res=requests.get(url=url,headers=headers)
    if houses_res.status_code==200:
        web_html = etree.HTML(houses_res.text)
        houses_url_list=web_html.xpath('//*[@id="content"]/div[1]/ul/li[*]/a/@href')
        return houses_url_list
    else:
        input('3. Captcha triggered; solve it in the browser, then press Enter to continue...')
        return get_houses_url(url)   # retry after manual verification

# Walk every district + price slice up to its max page, collecting all listing links and caching them locally
def get_all_url(url_l):
    city_url=list(url_l.values())[0]
    city_name=list(url_l.keys())[0]
    regions,prices=get_regions_prices(city_url)
    with open('./data/省会/{}.txt'.format(city_name), 'a') as f:
        for region in regions:
            for price in prices:
                page = 1
                max_page=get_max_page2(city_url + region + '/' + price + '/')
                while True:
                    url = city_url + region + '/pg' + str(page) + price + '/'
                    houses_url_list=get_houses_url(url)
                    for houses_url in houses_url_list:
                        f.write(houses_url+',')
                    page = page + 1
                    if page > max_page:
                        print('{} downloaded'.format(url))
                        break
    print('{} all links fetched!'.format(city_url))
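get_all_url relies on a get_regions_prices helper that this post never defines. Below is a minimal sketch of what it has to return: district URL segments (with the ershoufang prefix included, so they concatenate directly onto the city URL) and price-bucket codes. The XPath for the district filter bar and the p1..p8 price codes are my assumptions; verify both against the live page.

def get_regions_prices(city_url):
    # Read the filter bar from the city's ershoufang landing page
    res = requests.get(url=city_url + 'ershoufang/', headers=headers)
    web_html = etree.HTML(res.text)
    # District links look like /ershoufang/jinshui/ ; strip the slashes so that
    # city_url + region + '/pg1' + price + '/' yields a valid sliced URL
    hrefs = web_html.xpath('//div[@data-role="ershoufang"]/div[1]/a/@href')
    regions = [h.strip('/') for h in hrefs]
    # Price buckets appear in the URL as p1..p8 (assumed; check the site's filter links)
    prices = ['p' + str(i) for i in range(1, 9)]
    return regions, prices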
city_list=[{'济南': 'https://jn.lianjia.com/'},
 {'泰安': 'https://ta.lianjia.com/'},
 {'临沂': 'https://linyi.lianjia.com/'},
 {'菏泽': 'https://heze.lianjia.com/'},
 {'济宁': 'https://jining.lianjia.com/'},
 {'淄博': 'https://zb.lianjia.com/'},
 {'潍坊': 'https://wf.lianjia.com/'},
 {'青岛': 'https://qd.lianjia.com/'},
 {'烟台': 'https://yt.lianjia.com/'},
 {'威海': 'https://weihai.lianjia.com/'},
 {'廊坊': 'https://lf.lianjia.com/'},
 {'唐山': 'https://ts.lianjia.com/'},
 {'保定': 'https://bd.lianjia.com/'},
 {'石家庄': 'https://sjz.lianjia.com/'},
 {'邯郸': 'https://hd.lianjia.com/'},
 {'承德': 'https://chengde.lianjia.com/'},
 {'秦皇岛': 'https://qhd.fang.lianjia.com/'},
 {'张家口': 'https://zjk.lianjia.com/'},
 {'南充': 'https://nanchong.lianjia.com/'},
 {'德阳': 'https://dy.lianjia.com/'},
 {'雅安': 'https://yaan.lianjia.com/'},
 {'遂宁': 'https://sn.lianjia.com/'},
 {'资阳': 'https://ziyang.lianjia.com/'},
 {'成都': 'https://cd.lianjia.com/'},
 {'达州': 'https://dazhou.lianjia.com/'},
 {'乐山': 'https://leshan.fang.lianjia.com/'},
 {'凉山': 'https://liangshan.lianjia.com/'},
 {'攀枝花': 'https://pzh.lianjia.com/'},
 {'广元': 'https://guangyuan.lianjia.com/'},
 {'眉山': 'https://ms.fang.lianjia.com/'},
 {'绵阳': 'https://mianyang.lianjia.com/'},
 {'宜宾': 'https://yibin.lianjia.com/'},
 {'东莞': 'https://dg.lianjia.com/'},
 {'湛江': 'https://zhanjiang.lianjia.com/'},
 {'珠海': 'https://zh.lianjia.com/'},
 {'清远': 'https://qy.lianjia.com/'},
 {'江门': 'https://jiangmen.lianjia.com/'},
 {'深圳': 'https://sz.lianjia.com/'},
 {'惠州': 'https://hui.lianjia.com/'},
 {'中山': 'https://zs.lianjia.com/'},
 {'佛山': 'https://fs.lianjia.com/'},
 {'广州': 'https://gz.lianjia.com/'},
 {'昆明': 'https://km.lianjia.com/'},
 {'西双版纳': 'https://xsbn.fang.lianjia.com/'},
 {'大理': 'https://dali.lianjia.com/'},
 {'绍兴': 'https://sx.lianjia.com/'},
 {'宁波': 'https://nb.lianjia.com/'},
 {'金华': 'https://jh.lianjia.com/'},
 {'义乌': 'https://yw.lianjia.com/'},
 {'温州': 'https://wz.lianjia.com/'},
 {'台州': 'https://taizhou.lianjia.com/'},
 {'衢州': 'https://quzhou.lianjia.com/'},
 {'嘉兴': 'https://jx.lianjia.com/'},
 {'湖州': 'https://huzhou.lianjia.com/'},
 {'杭州': 'https://hz.lianjia.com/'},
 {'万宁': 'https://wn.fang.lianjia.com/'},
 {'文昌': 'https://wc.fang.lianjia.com/'},
 {'琼海': 'https://qh.fang.lianjia.com/'},
 {'五指山': 'https://wzs.fang.lianjia.com/'},
 {'儋州': 'https://dz.fang.lianjia.com/'},
 {'海口': 'https://hk.lianjia.com/'},
 {'三亚': 'https://san.lianjia.com/'},
 {'陵水': 'https://ls.fang.lianjia.com/'},
 {'乐东': 'https://ld.fang.lianjia.com/'},
 {'临高': 'https://lg.fang.lianjia.com/'},
 {'澄迈': 'https://cm.lianjia.com/'},
 {'保亭': 'https://bt.fang.lianjia.com/'},
 {'西安': 'https://xa.lianjia.com/'},
 {'汉中': 'https://hanzhong.lianjia.com/'},
 {'宝鸡': 'https://baoji.lianjia.com/'},
 {'咸阳': 'https://xianyang.lianjia.com/'},
 {'安庆': 'https://aq.lianjia.com/'},
 {'阜阳': 'https://fy.lianjia.com/'},
 {'合肥': 'https://hf.lianjia.com/'},
 {'马鞍山': 'https://mas.lianjia.com/'},
 {'芜湖': 'https://wuhu.lianjia.com/'},
 {'滁州': 'https://cz.fang.lianjia.com/'},
 {'上饶': 'https://sr.lianjia.com/'},
 {'赣州': 'https://ganzhou.lianjia.com/'},
 {'南昌': 'https://nc.lianjia.com/'},
 {'九江': 'https://jiujiang.lianjia.com/'},
 {'吉安': 'https://jian.lianjia.com/'},
 {'株洲': 'https://zhuzhou.lianjia.com/'},
 {'岳阳': 'https://yy.lianjia.com/'},
 {'长沙': 'https://cs.lianjia.com/'},
 {'湘西': 'https://xx.lianjia.com/'},
 {'衡阳': 'https://hy.lianjia.com/'},
 {'常德': 'https://changde.lianjia.com/'},
 {'镇江': 'https://zj.lianjia.com/'},
 {'句容': 'https://jr.lianjia.com/'},
 {'丹阳': 'https://danyang.lianjia.com/'},
 {'淮安': 'https://ha.lianjia.com/'},
 {'常州': 'https://changzhou.lianjia.com/'},
 {'昆山': 'https://ks.lianjia.com/'},
 {'常熟': 'https://changshu.lianjia.com/'},
 {'盐城': 'https://yc.lianjia.com/'},
 {'苏州': 'https://su.lianjia.com/'},
 {'南京': 'https://nj.lianjia.com/'},
 {'太仓': 'https://taicang.lianjia.com/'},
 {'江阴': 'https://jy.lianjia.com/'},
 {'南通': 'https://nt.lianjia.com/'},
 {'无锡': 'https://wx.lianjia.com/'},
 {'海门': 'https://haimen.lianjia.com/'},
 {'徐州': 'https://xz.lianjia.com/'},
 {'驻马店': 'https://zmd.lianjia.com/'},
 {'郑州': 'https://zz.lianjia.com/'},
 {'濮阳': 'https://py.lianjia.com/'},
 {'三门峡': 'https://smx.fang.lianjia.com/'},
 {'周口': 'https://zk.lianjia.com/'},
 {'平顶山': 'https://pds.lianjia.com/'},
 {'新乡': 'https://xinxiang.lianjia.com/'},
 {'洛阳': 'https://luoyang.lianjia.com/'},
 {'许昌': 'https://xc.lianjia.com/'},
 {'济源': 'https://jiyuan.fang.lianjia.com/'},
 {'开封': 'https://kf.lianjia.com/'},
 {'乌鲁木齐': 'https://wlmq.lianjia.com/'},
 {'长春': 'https://cc.lianjia.com/'},
 {'吉林': 'https://jl.lianjia.com/'},
 {'福州': 'https://fz.lianjia.com/'},
 {'泉州': 'https://quanzhou.lianjia.com/'},
 {'漳州': 'https://zhangzhou.lianjia.com/'},
 {'厦门': 'https://xm.lianjia.com/'},
 {'重庆': 'https://cq.lianjia.com/'},
 {'包头': 'https://baotou.lianjia.com/'},
 {'通辽': 'https://tongliao.lianjia.com/'},
 {'呼和浩特': 'https://hhht.lianjia.com/'},
 {'巴彦淖尔': 'https://byne.fang.lianjia.com/'},
 {'赤峰': 'https://cf.lianjia.com/'},
 {'防城港': 'https://fcg.lianjia.com/'},
 {'柳州': 'https://liuzhou.lianjia.com/'},
 {'北海': 'https://bh.lianjia.com/'},
 {'南宁': 'https://nn.lianjia.com/'},
 {'桂林': 'https://gl.lianjia.com/'},
 {'运城': 'https://yuncheng.lianjia.com/'},
 {'晋中': 'https://jz.lianjia.com/'},
 {'太原': 'https://ty.lianjia.com/'},
 {'北京': 'https://bj.lianjia.com/'},
 {'鄂州': 'https://ez.lianjia.com/'},
 {'黄石': 'https://huangshi.lianjia.com/'},
 {'襄阳': 'https://xy.lianjia.com/'},
 {'武汉': 'https://wh.lianjia.com/'},
 {'宜昌': 'https://yichang.lianjia.com/'},
 {'黔西南': 'https://qxn.fang.lianjia.com/'},
 {'贵阳': 'https://gy.lianjia.com/'},
 {'大连': 'https://dl.lianjia.com/'},
 {'丹东': 'https://dd.lianjia.com/'},
 {'沈阳': 'https://sy.lianjia.com/'},
 {'抚顺': 'https://fushun.lianjia.com/'},
 {'银川': 'https://yinchuan.lianjia.com/'},
 {'天水': 'https://tianshui.lianjia.com/'},
 {'兰州': 'https://lz.lianjia.com/'},
 {'上海': 'https://sh.lianjia.com/'},
 {'哈尔滨': 'https://hrb.lianjia.com/'},
 {'天津': 'https://tj.lianjia.com/'}]
# Iterate over every city and collect all of its listing links
for city in city_list:
    print(city)
    get_all_url(city)
# Multithreading: not recommended. Big sites are very wary of crawlers; even with a proxy pool and random
# request headers I kept hitting captchas I couldn't get past. Of course, some anti-scraping is only normal for a site this size.
# If you must use it, add a try mechanism and clear the captcha manually when one appears (see the sketch after the pool call below). The speed is certainly attractive.
from multiprocessing.dummy import Pool
pool = Pool(8)
pool.map(get_all_url, city_list)
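As noted above, a thin try wrapper keeps one failing city from killing a worker thread; a minimal sketch (my addition):

# Wrap get_all_url so an exception in one city doesn't take down the pool
def safe_get_all_url(url_l):
    try:
        get_all_url(url_l)
    except Exception as e:
        print('{} failed: {}'.format(url_l, e))

pool = Pool(8)
pool.map(safe_get_all_url, city_list)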

IV. Scraping each listing's detailed information

Create the database

create database lianjia DEFAULT CHARSET utf8 COLLATE utf8_general_ci;

Create the table

create table tianjin(
total varchar(16),
unitPrice varchar(16),
communityName varchar(32),
areaName varchar(32),
base_list longtext,
transaction_list longtext,
tags_clear varchar(50),
baseattribute_clear varchar(100),
url varchar(255)
) default charset=utf8;

# Putting it together
def geturl(city):
    # Note: get_all_url above saved its files under ./data/省会/ ; adjust this path to match
    with open('./data/{}.txt'.format(city), 'r') as f:
        content = f.read()
        url_lists=content.split(',')[:-1]
        url_list=set(url_lists)
        print(f'{len(url_lists)} links before dedup ------- {len(url_list)} unique links after')
    return list(url_list)



import requests
from lxml import etree
from fake_useragent import UserAgent
count=0   # progress counter shared across calls
def get_detailed_house(url):
    global count
    count+=1
    print(f'Listing #{count}')
    headers= {'User-Agent':str(UserAgent().random)}
    print(f'{url}   starting')
    r = requests.get(url, headers=headers)
    if r.status_code==200:
        web_html = etree.HTML(r.text)
        try:
            # Total price
            total=web_html.xpath('/html/body/div[5]/div[2]/div[3]/div/span[1]/text()')[0]
            # Price per square meter
            unitPrice=web_html.xpath('/html/body/div[5]/div[2]/div[3]/div/div[1]/div[1]/span/text()')[0]
        except:
            # Alternate page layout: total price
            total=web_html.xpath('/html/body/div[5]/div[2]/div[4]/p/span/text()')[0]
            # Alternate page layout: price per square meter
            unitPrice=web_html.xpath('/html/body/div[5]/div[2]/div[4]/p/span/text()')[1]
        # Community (residential compound) name
        communityName=web_html.xpath('/html/body/div[5]/div[2]/div/div[1]/a[1]/text()')[0]
        # Location
        areaName=''
        for i in web_html.xpath('/html/body/div[5]/div[2]/div/div[2]/span[2]/a/text()'):
            areaName+=i
        ################################ Basic info #################################
        base_list=web_html.xpath('//*[@id="introduction"]/div/div/div[1]/div[2]/ul/li/text()')
        # Unit layout
        # Floor
        # Gross floor area
        # Layout structure
        # Net (interior) floor area
        # Building type
        # Orientation
        # Building structure
        # Decoration
        # Elevator-to-unit ratio
        # Elevator available
        ################################ Transaction attributes #################################
        transaction_list=[t.strip() for t in web_html.xpath('//*[@id="introduction"]/div/div/div[2]/div[2]/ul/li/span[2]/text()') ]
        # Listing date
        # Ownership type
        # Last transaction
        # Intended use
        # Years of ownership
        # Property rights
        # Mortgage info
        # Property certificate on file

        ################################ Listing highlights #################################

        try:
            # Listing tags
            tags_clear=[t.strip() for t in web_html.xpath('/html/body/div[7]/div/div[2]/div/div[1]/div[2]/a/text()')]
            # Key selling points
            baseattribute_clear=web_html.xpath('/html/body/div[7]/div/div[2]/div/div[2]/div[2]/text()')[0].strip()
        except:
            tags_clear='暂无数据'   # sentinel meaning "no data yet"
            baseattribute_clear='暂无数据'
        # Listing URL goes into the last column
        #print(total,unitPrice,communityName,areaName,base_list,transaction_list,tags_clear,baseattribute_clear,url)
        sql='insert into tianjin(total,unitPrice,communityName,areaName,base_list,transaction_list,tags_clear,baseattribute_clear,url) values (%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        cursor.execute(sql, [total,unitPrice,communityName,areaName,str(base_list),str(transaction_list),str(tags_clear),baseattribute_clear,url,])
        conn.commit()
        print(f'++++++++++++ Listing #{count}  {url} done! +++++++++++++')
    elif r.status_code==404:
        pass
    else:
        input('Captcha check; solve it, then press Enter to continue...:')
#main
import pymysql
from multiprocessing.dummy import Pool

# 1. Connect to MySQL
conn = pymysql.connect(host="127.0.0.1", port=3306, user='root', passwd="******", charset='utf8', db='lianjia')
cursor = conn.cursor(cursor=pymysql.cursors.DictCursor)


# Test on a single listing
# url='https://zz.lianjia.com/ershoufang/104107897508.html'
# get_detailed_house(url)

# Single-threaded
# url_list=geturl('郑州')
# for url in url_list:
#     try:
#         get_detailed_house(url)
#     except:
#         pass

# Multithreaded

url_list=geturl('天津')
pool = Pool(10)
pool.map(get_detailed_house,url_list)
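One caveat about this multithreaded version: all ten workers share a single pymysql connection (and the global count), and pymysql connections are not thread-safe. A minimal sketch that serializes the writes with a lock (my addition; giving each worker its own connection works too):

import threading
db_lock = threading.Lock()

def save_row(sql, params):
    # Serialize access to the shared connection across worker threads
    with db_lock:
        cursor.execute(sql, params)
        conn.commit()

Inside get_detailed_house, call save_row(sql, [...]) in place of the bare cursor.execute / conn.commit pair.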
    
    
    

cursor.close()
conn.close()