爬取广州飞猪酒店数据--待完善
一直听说淘宝的反爬很厉害,只爬取数十条数据的话不会有感觉,当破百了就很容易被识别出来
自己试了一下,依旧存在问题,现在提供源码供大家一起学习
"""Scrape Guangzhou hotel listings from Fliggy and store them in MongoDB.

For each result page the script requests the listing HTML twice (the first
response may hand back fresh anti-bot cookies, which are merged in before the
second request), extracts hotel name / score / price / review URL with XPath,
and inserts one document per hotel into the ``home.guangzhou1`` collection.
"""
import time
from urllib.parse import unquote_to_bytes

import pymongo
import requests
from lxml import etree

# The plain listing URL carries no page number; it is supplied via ``params``.
# Refreshing the page and watching the XHR traffic reveals a long ajax URL;
# copying its page-number parameter into the original params makes paging work:
#   'https://hotel.fliggy.com/ajax/hotelList.htm?pageSize=20&currentPage=7&totalItem=12644&startRow=100&endRow=119&city=440100&...'
#   'https://hotel.fliggy.com/hotel_list3.htm?pageSize=20&currentPage=6'
url = 'https://hotel.fliggy.com/hotel_list3.htm?'

# Seconds to wait for the server before giving up; without a timeout
# requests can block forever on a stalled connection.
REQUEST_TIMEOUT = 10

# requests raises "ValueError: Invalid header name b':authority'" for the
# browser's pseudo-header names, so the leading colon is dropped
# (':method' -> 'method', etc.).
headers = {
    'authority': 'hotel.fliggy.com',
    'method': 'GET',
    'path': '/hotel_list3.htm?cityName=%B9%E3%D6%DD&city=440100&keywords=&checkIn=2019-05-01&checkOut=2019-05-02&ttid=seo.000000583',
    'scheme': 'https',
    'referer': 'https://hotel.fliggy.com/hotel_list3.htm?_input_charset=utf-8&_output_charset=utf-8&searchBy=&market=0&previousChannel=&cityName=%E5%B9%BF%E5%B7%9E&city=440100&_fmd.h._0.r=&checkIn=2019-05-01&checkOut=2019-05-02&keywords=&ttid=seo.000000583',
    'cache-control': 'max-age=0',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}

# MongoDB setup: database ``home``, collection ``guangzhou1``.
mon_client = pymongo.MongoClient()
try:
    mon_db = mon_client.home
    feizhu_col = mon_db.guangzhou1

    # Running sequence number stored with every record.
    record_id = 0

    for page in range(1, 5):
        # The values below were copied from the browser already
        # percent-encoded. requests would escape each '%' again
        # ('%B9' -> '%25B9'), so they are decoded to raw bytes here and
        # requests re-encodes them to exactly the original wire form.
        params = {
            # Not valid UTF-8 (the browser could not decode it either);
            # presumably GBK-encoded city name -- sent as raw bytes.
            'cityName': unquote_to_bytes('%B9%E3%D6%DD'),
            'city': '440100',
            'keywords': '',
            'checkIn': '2019-05-01',
            'checkOut': '2019-05-02',
            'ttid': 'seo.000000583',
            # 'pageSize' (20 results per page) turned out to be optional
            # and is omitted.
            'currentPage': str(page),  # page number
            # The filters below were found by trying several listing pages.
            'priceRange': 'R2',  # second price tier, 100-300
            # Percent-encoded UTF-8 name of the point of interest.
            'poiNameFilter': unquote_to_bytes('%E4%BD%93%E8%82%B2%E8%A5%BF%E8%B7%AF'),
            'searchPoiName': unquote_to_bytes('%E4%BD%93%E8%82%B2%E8%A5%BF%E8%B7%AF'),
        }

        # Rebuilt for every page, as in the original flow. Some cookies were
        # removed from this listing and must be refreshed by the user.
        cookies = {
            'chanelStatExpire': '2019-04-11 09:44:29',  # cookie expiry time
            # Entries that appeared after a cookie refresh:
            'cookieCheck': '27487',
            'v': '0',
            'last': 'true',
        }

        # First request: when the site flags the client as a crawler the
        # body is a "busy, slide to verify" page instead of the listing.
        response = requests.get(
            url,
            headers=headers,
            params=params,
            cookies=cookies,
            timeout=REQUEST_TIMEOUT,
        )

        # The server does not always send new cookies; when it does they
        # look like {'cookieCheck': '15798', 'v': '0'}.
        r = requests.utils.dict_from_cookiejar(response.cookies)
        print(r)
        cookies.update(r)

        # Second request with the merged cookies; this response is parsed.
        response = requests.get(
            url,
            headers=headers,
            params=params,
            cookies=cookies,
            timeout=REQUEST_TIMEOUT,
        )
        html = etree.HTML(response.text)

        hotel_name = html.xpath('//*[@id="J_List"]/div/div/div[4]/div/h5/a/text()')
        coment_url = html.xpath('//*[@id="J_List"]/div/div/div[3]/a/@href')
        hotel_score = html.xpath('//*[@id="J_List"]/div/div/div[3]/a/p[1]/span[1]/text()')
        hotel_price = html.xpath('//*[@id="J_List"]/div/div/div[2]/div[1]/p/span[1]/text()')
        print(hotel_name)

        # NOTE(review): zip() stops at the shortest list, so if a hotel is
        # missing a score or price the columns silently shift out of
        # alignment -- confirm against real pages.
        for name, score, price, link in zip(hotel_name, hotel_score, hotel_price, coment_url):
            record_id += 1
            data = {
                'id': str(record_id),
                'name': name,
                'score': score,
                'price': price,
                'url': link,
            }
            # Collection.insert() was removed in PyMongo 4; insert_one()
            # is the supported single-document equivalent.
            feizhu_col.insert_one(data)

        # Pause between pages to reduce the chance of being blocked.
        time.sleep(2)
finally:
    # Close the database connection even if a request or parse step failed.
    mon_client.close()