Python爬取小猪短租全网数据
爬取时需要进行的操作:
1:输入你是要爬取国内的还是海外的,1表示国内,2表示海外;
2:然后输入你要爬取的城市名称,就可以了;
每个函数的功能:
choose_area函数根据你输入的是国内还是海外,输出不同的区域名称
url_list函数根据你输入的城市名称,查询该城市有多少房源,从而判断有几页数据;因为小猪短租网最多只显示13页数据,超过13页的也只显示前13页,所以这里做一个判断即可
get_url函数根据你输入的城市名称和页码,来构建你输入的城市每页的url
get_html函数就是获取每页的html数据
get_zf_url函数根据每页的html数据来爬取每个租房的url链接
get_zf_message函数,通过传入的租房url链接来获取每个租房的信息
"""Get the search-result page URLs for each city."""
import math
import re

import requests

# Script that carries the per-city data (name, pinyin slug, listing count).
url = 'http://jci.xiaozhustatic1.com/e17061601/xzjs?k=Front_Search&httphost=bj.xiaozhu.com'
# Region selector typed by the user: '1' = domestic, '2' = overseas.
ser = input('输入你要查找的地区(1:国内;2:海外):')
# Raw text of the city script; parsed by choose_area().
html = requests.get(url).text

# Listings per result page (the divisor used to derive the page count).
_PAGE_SIZE = 24
# The site only exposes the first 13 result pages of a city.
_MAX_PAGES = 13
# The first 29 matched arrays are skipped.
# NOTE(review): presumably they are not city entries - confirm against the
# live script.
_SKIPPED_ENTRIES = 29

# Brackets and parentheses are escaped so they match literally; the group
# captures the argument list of each ``citys[N]=new Array(...);`` statement.
_CITY_ENTRY_RE = re.compile(r'citys\[[0-9]\d*\]=new Array\((.*?)\);')
# A ``digits:digits`` token; its presence marks an entry as overseas.
_TIME_RE = re.compile(r'[0-9]\d*:[0-9]\d*')
_NAME_RE = re.compile('[\u4E00-\u9FA5]+')
_SLUG_RE = re.compile(r'[a-z]\w*')
_COUNT_RE = re.compile(r'[0-9]\d*')


def choose_area():
    """Yield ``{city_name: [slug, listing_count]}`` for the chosen region.

    Reads the module-level ``ser`` and ``html``. An entry counts as overseas
    when it contains a ``digits:digits`` token and as domestic otherwise.
    Nothing is yielded when ``ser`` is neither '1' nor '2'. Entries missing
    the Chinese name, the second lowercase token (used as the URL slug) or a
    number are skipped instead of aborting the whole iteration.
    ``listing_count`` is kept as a string.
    """
    if ser not in ('1', '2'):
        return
    want_overseas = ser == '2'
    for entry in _CITY_ENTRY_RE.findall(html)[_SKIPPED_ENTRIES:]:
        is_overseas = _TIME_RE.search(entry) is not None
        if is_overseas != want_overseas:
            continue
        names = _NAME_RE.findall(entry)
        slugs = _SLUG_RE.findall(entry)
        counts = _COUNT_RE.findall(entry)
        if not names or len(slugs) < 2 or not counts:
            continue
        # The second lowercase token is the one used as the sub-domain slug.
        yield {names[0]: [slugs[1], counts[0]]}


def get_url(city_jc, page):
    """Return the search-result URL for city slug ``city_jc`` and ``page``."""
    return 'http://{}.xiaozhu.com/search-duanzufang-p{}-0/'.format(city_jc, page)


def url_list(city_name):
    """Yield every result-page URL available for ``city_name``.

    The page count is the listing count divided by the page size, rounded up
    so a partially filled last page is included, and capped at the 13 pages
    the site displays. Nothing is yielded when the city is not found in the
    selected region.
    """
    for city in choose_area():
        if city_name not in city:
            continue
        slug, listing_count = city[city_name]
        page_total = min(_MAX_PAGES, math.ceil(int(listing_count) / _PAGE_SIZE))
        for page in range(1, page_total + 1):
            yield get_url(slug, page)
import re

import requests
from lxml import etree

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36'
}

# Seconds to wait for the server before giving up on a request.
_REQUEST_TIMEOUT = 10

# Captures the landlord's profile link (group 1) and name (group 2).
# The original author could not extract the name through lxml and used a
# regex instead. NOTE(review): the earlier xpath attempt was written as
# a[class="lorder_name"] without the "@", which probably explains that.
_LANDLORD_RE = re.compile(
    '<a class="lorder_name" href="(.*?)" title="(.*?)" target="_blank">.*?</a>'
)


def get_html(url):
    """Return the decoded HTML of ``url``, or ``None`` when it is unavailable.

    ``None`` is returned (after printing a notice) when the request fails or
    the response status is not 200.
    """
    try:
        response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print('没有获取到HTML', exc)
        return None
    if response.status_code != 200:
        print('没有获取到HTML')
        return None
    # Let requests guess the encoding from the body before decoding it.
    response.encoding = response.apparent_encoding
    return response.text


def get_zf_url(url):
    """Return the list of listing URLs found on the result page ``url``.

    An empty list is returned when the page could not be fetched, so callers
    can iterate the result unconditionally.
    """
    html = get_html(url)
    if not html:
        return []
    return etree.HTML(html).xpath('//*[@id="page_list"]/ul/li/a/@href')


def get_zf_message(zf_url):
    """Return a dict describing the listing at ``zf_url``.

    The dict holds the title, price, address, main image URL, landlord name
    and landlord link.

    Raises:
        ValueError: the page could not be fetched.
        IndexError: an expected element is missing from the page.
    """
    html = get_html(zf_url)
    if not html:
        raise ValueError('no HTML fetched for {}'.format(zf_url))
    tree = etree.HTML(html)  # parse once and reuse for every xpath query
    area = tree.xpath('//div[@class="pho_info"]/p/@title')[0]
    h_image = tree.xpath('//*[@id="curBigImage"]/@src')[0]
    fd_link, fd_name = _LANDLORD_RE.findall(html)[0]
    zf_price = tree.xpath('//*[@id="pricePart"]/div[1]/span/text()')[0]
    zf_title = tree.xpath('//div[@class="pho_info"]/h4/em/text()')[0]
    return {
        '标题': zf_title,
        '价格': zf_price,
        '地址': area,
        '图片': h_image,
        '房东': fd_name,
        '房东链接': fd_link,
    }
from spider_ziaozu import *
from get_url import *

if __name__ == '__main__':
    # Ask which city to crawl, then walk every result page of that city.
    target_city = input('输入你想爬取的城市名称:')
    for page_url in url_list(target_city):
        print(page_url)
        # Each result page yields a list of individual listing URLs.
        for listing_url in get_zf_url(page_url):
            print(listing_url)
            # A failure on one listing is printed and must not stop the crawl.
            try:
                print(get_zf_message(listing_url))
            except Exception as err:
                print(err)
下面是城市的数据:
domestic_list = [ {'北京': ['beijing', '8221']}, {'上海': ['shanghai', '6996']}, {'广州': ['guangzhou', '2727']}, {'成都': ['chengdu', '5369']}, {'深圳': ['shenzhen', '2522']}, {'西安': ['xian', '2562']}, {'南京': ['nanjing', '1675']}, {'杭州': ['hangzhou', '2455']}, {'重庆': ['chongqing', '3171']}, {'武汉': ['wuhan', '1901']}, {'苏州': ['suzhou', '1603']}, {'无锡': ['wuxi', '240']}, {'青岛': ['qingdao', '3712']}, {'厦门': ['xiamen', '1548']}, {'三亚': ['sanya', '2384']}, {'大连': ['dalian', '1034']}, {'哈尔滨': ['haerbin', '790']}, {'秦皇岛': ['qinhuangdao', '1924']}, {'天津': ['tianjin', '485']}, {'昆明': ['kunming', '819']}, {'香港': ['xianggang', '164']}, {'长春': ['changchun', '350']}, {'沈阳': ['shenyang', '562']}, {'合肥': ['hefei', '533']}, {'郑州': ['zhengzhou', '370']}, {'太原': ['taiyuan', '470']}, {'威海': ['weihai', '821']}, {'丽江': ['lijiang', '632']}, {'大理': ['dali', '576']}, {'桂林': ['guilin', '518']}, {'澳门': ['aomen', '33']}, {'福州': ['fuzhou', '410']}, {'宁波': ['ningbo', '233']}, {'珠海': ['zhuhai', '595']}, {'长沙': ['changsha', '727']}, {'石家庄': ['shijiazhuang', '288']}, {'拉萨': ['lasa', '47']}, {'常州': ['changzhou', '137']}, {'扬州': ['yangzhou', '224']}, {'东莞': ['dongguan', '70']}, {'海口': ['haikou', '518']}, {'兰州': ['lanzhou', '198']}, {'洛阳': ['luoyang', '195']}, {'乌鲁木齐': ['wulumuqi', '213']}, {'徐州': ['xuzhou', '51']}, {'贵阳': ['guiyang', '503']}, {'呼和浩特': ['huhehaote', '82']}, {'济南': ['jinan', '375']}, {'唐山': ['tangshan', '102']}, {'保定': ['baoding', '83']}, {'南昌': ['nanchang', '206']}, {'邯郸': ['handan', '12']}, {'南宁': ['nanning', '168']}, {'潍坊': ['weifang', '65']}, {'锦州': ['jinzhou', '54']}, {'日照': ['rizhao', '508']}, {'临沂': ['linyi', '41']}, {'鞍山': ['anshan', '23']}, {'廊坊': ['langfang', '101']}, {'大庆': ['daqing', '29']}, {'北海': ['beihai', '436']}, {'中山': ['zhongshan', '70']}, {'西宁': ['xining', '362']}, {'金华': ['jinhua', '71']}, {'丹东': ['dandong', '181']}, {'承德': ['chengde', '437']}, {'盘锦': ['panjin', '35']}, {'淄博': ['zibo', '30']}, {'株洲': ['zhuzhou', '17']}, {'佛山': ['foshan', '127']}, {'吉林': ['jilinshi', 
'50']}, {'邢台': ['xingtai', '9']}, {'齐齐哈尔': ['qiqihaer', '8']}, {'宜昌': ['yichang', '42']}, {'大同': ['datong', '83']}, {'烟台': ['yantai', '803']}, {'银川': ['yinchuan', '76']}, {'温州': ['wenzhou', '52']}, {'淮安': ['huaian', '37']}, {'绵阳': ['mianyang', '121']}, {'包头': ['baotou', '40']}, {'抚顺': ['fushun', '5']}, {'泰安': ['taian', '103']}, {'济宁': ['jining', '11']}, {'连云港': ['lianyungang', '33']}, {'泉州': ['quanzhou', '95']}, {'安阳': ['anyang', '24']}, {'惠州': ['huizhou', '537']}, {'葫芦岛': ['huludao', '595']}, {'嘉兴': ['jiaxing', '405']}, {'南通': ['nantong', '143']}, {'攀枝花': ['panzhihua', '15']}, {'柳州': ['liuzhou', '19']}, {'东营': ['dongying', '1']}, {'佳木斯': ['jiamusi', '5']}, {'通辽': ['tongliao', '5']}, {'德州': ['dezhou', '22']}, {'赣州': ['ganzhou', '6']}, {'滨州': ['binzhou', '3']}, {'咸阳': ['xianyang', '23']}, {'江门': ['jiangmen', '17']}, {'漳州': ['zhangzhou', '84']}, {'新乡': ['xinxiang', '8']}, {'襄樊': ['xiangfan', '4']}, {'南充': ['nanchong', '29']}, {'聊城': ['liaocheng', '17']}, {'张家口': ['zhangjiakou', '196']}, {'沧州': ['cangzhou', '22']}, {'石河子': ['shihezi', '4']}, {'宝鸡': ['baoji', '5']}, {'赤峰': ['chifeng', '22']}, {'湛江': ['zhanjiang', '41']}, {'商丘': ['shangqiu', '5']}, {'平顶山': ['pingdingshan', '4']}, {'信阳': ['xinyang', '13']}, {'九江': ['jiujiang', '29']}, {'营口': ['yingkou', '500']}, {'本溪': ['benxi', '6']}, {'钦州': ['qinzhou', '2']}, {'衡阳': ['hengyang', '19']}, {'汕头': ['shantou', '63']}, {'芜湖': ['wuhu', '18']}, {'呼伦贝尔': ['hulunbeier', '124']}, {'湘潭': ['xiangtan', '11']}, {'朝阳市': ['chaoyang', '2']}, {'清远': ['qingyuan', '137']}, {'遂宁': ['suining', '6']}, {'泰州': ['jstaizhou', '6']}, {'莆田': ['putian', '3']}, {'枣庄': ['zaozhuang', '14']}, {'泸州': ['luzhou', '52']}, {'舟山': ['zhoushan', '304']}, {'镇江': ['zhenjiang', '19']}, {'开封': ['kaifeng', '130']}, {'鄂尔多斯': ['eerduosi', '3']}, {'十堰': ['shiyan', '14']}, {'延边': ['yanbian', '75']}, {'淮北': ['huaibei', '5']}, {'临汾': ['linfen', '21']}, {'常德': ['changde', '5']}, {'荆州': ['jingzhou', '3']}, {'郴州': ['chenzhou', '46']}, {'德阳': ['deyang', '13']}, {'绍兴': 
['shaoxing', '33']}, {'南阳': ['nanyang', '7']}, {'菏泽': ['heze', '1']}, {'台州': ['zjtaizhou', '36']}, {'遵义': ['zunyi', '7']}, {'阜新': ['fuxin', '2']}, {'盐城': ['yancheng', '6']}, {'宿迁': ['suqian', '2']}, {'焦作': ['jiaozuo', '16']}, {'长治': ['changzhi', '26']}, {'吉安': ['jian', '14']}, {'驻马店': ['zhumadian', '1']}, {'汉中': ['hanzhong', '28']}, {'河源': ['heyuan', '11']}, {'铁岭': ['tieling', '2']}, {'晋中': ['jinzhong', '63']}, {'安康': ['ankang', '4']}, {'岳阳': ['yueyang', '13']}, {'肇庆': ['zhaoqing', '15']}, {'衡水': ['hengshui', '21']}, {'牡丹江': ['mudanjiang', '24']}, {'安庆': ['anqing', '11']}, {'黄冈': ['huanggang', '2']}, {'娄底': ['loudi', '3']}, {'乐山': ['leshan', '187']}, {'蚌埠': ['bengbu', '14']}, {'昌吉': ['changji', '1']}, {'韶关': ['shaoguan', '28']}, {'阳江': ['yangjiang', '87']}, {'潮州': ['chaozhou', '15']}, {'张家界': ['zhangjiajie', '171']}, {'怀化': ['huaihua', '2']}, {'西双版纳': ['xishuangbanna', '141']}, {'三明': ['sanming', '9']}, {'运城': ['yuncheng', '15']}, {'眉山': ['meishan', '17']}, {'许昌': ['xuchang', '11']}, {'防城港': ['fangchenggang', '16']}, {'永州': ['yongzhou', '1']}, {'益阳': ['yiyang', '5']}, {'上饶': ['shangrao', '45']}, {'衢州': ['quzhou', '1']}, {'六盘水': ['liupanshui', '10']}, {'白山': ['baishan', '37']}, {'六安': ['luan', '1']}, {'铜陵': ['tongling', '1']}, {'池州': ['chizhou', '5']}, {'晋城': ['jincheng', '3']}, {'黄石': ['huangshi', '10']}, {'湘西': ['xiangxi', '24']}, {'宜春': ['jxyichun', '18']}, {'茂名': ['maoming', '1']}, {'梅州': ['meizhou', '2']}, {'凉山': ['liangshan', '330']}, {'宜宾': ['yibin', '22']}, {'湖州': ['huzhou', '296']}, {'海拉尔': ['hailaer', '2']}, {'延安': ['yanan', '5']}, {'内江': ['neijiang', '9']}, {'南平': ['nanping', '11']}, {'三门峡': ['sanmenxia', '2']}, {'松原': ['songyuan', '5']}, {'阜阳': ['fuyang', '3']}, {'黄山': ['huangshan', '105']}, {'巴彦淖尔': ['bayannaoer', '1']}, {'渭南': ['weinan', '8']}, {'咸宁': ['xianning', '12']}, {'恩施': ['enshi', '29']}, {'抚州': ['jxfuzhou', '5']}, {'龙岩': ['longyan', '13']}, {'通化': ['tonghua', '18']}, {'莱芜': ['laiwu', '1']}, {'宣城': ['xuancheng', '8']}, {'锡林郭勒': ['xilinguole', 
'18']}, {'景德镇': ['jingdezhen', '21']}, {'曲靖': ['qujing', '3']}, {'广元': ['guangyuan', '9']}, {'巴中': ['bazhong', '5']}, {'济源': ['jiyuan', '3']}, {'鹤岗': ['hegang', '2']}, {'黑河': ['heihe', '6']}, {'吕梁': ['lvliang', '3']}, {'天水': ['tianshui', '11']}, {'榆林': ['sxyulin', '4']}, {'萍乡': ['pingxiang', '4']}, {'哈密': ['hami', '7']}, {'自贡': ['zigong', '23']}, {'阿坝': ['aba', '260']}, {'宁德': ['ningde', '20']}, {'马鞍山': ['maanshan', '1']}, {'阿拉善': ['alashan', '12']}, {'阳泉': ['yangquan', '2']}, {'新余': ['xinyu', '1']}, {'喀什': ['kashi', '2']}, {'黔西南': ['qianxinan', '14']}, {'鸡西': ['jixi', '1']}, {'伊春': ['hljyichun', '24']}, {'大兴安岭': ['daxinganling', '5']}, {'宿州': ['ahsuzhou', '2']}, {'梧州': ['wuzhou', '2']}, {'阿克苏': ['akesu', '1']}, {'汕尾': ['shanwei', '12']}, {'广安': ['guangan', '4']}, {'资阳': ['ziyang', '3']}, {'安顺': ['anshun', '7']}, {'黔东南': ['qiandongnan', '15']}, {'七台河': ['qitaihe', '1']}, {'河池': ['hechi', '4']}, {'张掖': ['zhangye', '27']}, {'酒泉': ['jiuquan', '113']}, {'陇南': ['longnan', '4']}, {'神农架': ['shennongjia', '12']}, {'克拉玛依': ['kelamayi', '4']}, {'伊犁': ['yili', '19']}, {'雅安': ['yaan', '8']}, {'甘孜': ['ganzi', '94']}, {'丽水': ['lishui', '39']}, {'瓦房店': ['wafangdian', '2']}, {'武夷山': ['wuyishan', '34']}, {'亳州': ['bozhou', '1']}, {'贺州': ['hezhou', '6']}, {'石嘴山': ['shizuishan', '1']}, {'中卫': ['zhongwei', '20']}, {'平凉': ['pingliang', '1']}, {'铜川': ['tongchuan', '3']}, {'昭通': ['zhaotong', '1']}, {'巴音郭楞': ['bayinguoleng', '2']}, {'日喀则': ['rikaze', '4']}, {'铜仁': ['tongren', '6']}, {'忻州': ['xinzhou', '15']}, {'吴忠': ['wuzhong', '1']}, {'玉树': ['yushu', '1']}, {'海西': ['haixi', '11']}, {'玉溪': ['yuxi', '11']}, {'红河': ['honghe', '7']}, {'德宏': ['dehong', '8']}, {'吐鲁番': ['tulufan', '2']}, {'黔南': ['qiannan', '9']}, {'张北': ['zhangbei', '3']}, {'鹤壁': ['hebi', '1']}, {'五指山': ['wuzhishan', '4']}, {'兴安': ['xingan', '6']}, {'嘉峪关': ['jiayuguan', '20']}, {'商洛': ['shangluo', '1']}, {'海东': ['haidong', '5']}, {'海北': ['haibei', '17']}, {'随州': ['suizhou', '1']}, {'保山': ['baoshan', '25']}, {'楚雄': ['chuxiong', 
'2']}, {'普洱': ['puer', '3']}, {'文山': ['wenshan', '1']}, {'迪庆': ['diqing', '14']}, {'和田': ['hetian', '1']}, {'阿拉尔': ['alaer', '1']}, {'文昌': ['wenchang', '39']}, {'琼海': ['qionghai', '30']}, {'儋州': ['danzhou', '1']}, {'万宁': ['wanning', '17']}, {'东方': ['dongfang', '5']}, {'安定': ['anding', '1']}, {'澄迈': ['chengmai', '7']}, {'临高': ['lingao', '1']}, {'白沙': ['baisha', '2']}, {'昌江': ['changjiang', '10']}, {'乐东': ['ledong', '5']}, {'陵水': ['lingshui', '60']}, {'保亭': ['baoting', '2']}, {'琼中': ['qiongzhong', '1']}, {'长白山': ['changbaishan', '113']}, {'台北': ['taibei', '14']}, {'新北': ['xinbei', '4']}, {'台中': ['taizhong', '9']}, {'高雄': ['gaoxiong', '2']}, {'新竹': ['xinzhu', '1']}, {'嘉义': ['jiayi', '1']}, {'花莲乡': ['hualianxiang', '20']}, {'台东县': ['taidongxian', '1']}, {'澎湖县': ['penghuxian', '1']}, ] overseas_list = [ {'墨尔本': ['moerben', '4']}, {'悉尼': ['xini', '3']}, {'维多利亚': ['weiduoliya', '2']}, {'巴黎': ['bali', '856']}, {'巴厘岛': ['balidaobalidao', '1']}, {'佛罗伦萨': ['foluolunsa', '1']}, {'名古屋': ['nagoya', '2']}, {'福冈': ['fugang', '1']}, {'吉隆坡': ['jilongpo', '19']}, {'马累': ['malei', '1']}, {'奥克兰': ['aokelan', '4']}, {'圣彼得堡': ['shengbidebao', '1']}, {'普吉岛': ['phuket', '5']}, {'芭堤雅': ['badiya', '10']}, {'伦敦': ['lundun', '1']}, {'旧金山': ['jiujinshan', '1']}, {'拉斯维加斯': ['lasiweijiasi', '1']}, {'文莱': ['wenlai', '4']}, {'弗雷德里顿': ['fuleidelidun', '1']}, {'杜塞尔多夫': ['dusaierduofu', '3']}, {'雅加达': ['yajiada', '1']}, {'埼玉': ['qiyu', '2']}, {'广岛': ['guangdao', '1']}, {'千叶': ['qianye', '2']}, {'堺': ['jie', '1']}, {'相模原': ['xiangmoyuan', '1']}, {'船桥': ['chuanqiao', '1']}, {'东大阪': ['dongdaban', '12']}, {'暹粒': ['xianli', '3']}, {'哥打基纳巴鲁': ['gedajinabalu', '4']}, {'奥兰多': ['aolanduo', '3']}, {'圣何塞': ['shenghs', '1']}, {'立川': ['lichuan', '1']}, {'调布': ['diaobu', '1']}, {'日野': ['riye', '1']}, {'马塔兰': ['mataram', '1']}, ]
因为每个城市的房源数量时刻在变化,所以上面的数据不一定准确