爬取携程7天内的全国热门城市航班
#!/usr/bin/env python
# coding: utf-8
"""Crawl Ctrip one-way flights between the "hot" cities for the next 7 days.

For every departure date one CSV file is written that contains the flights
of all (departure city, arrival city) pairs returned by the products API.
"""

import datetime
import json
import random
import time

import pandas as pd
import requests

CITY_URL = 'https://flights.ctrip.com/itinerary/api/poi/get'
PRODUCTS_URL = "https://flights.ctrip.com/itinerary/api/12808/products"
REQUEST_TIMEOUT = 30       # seconds before a single HTTP request is abandoned
MAX_ATTEMPTS = 10          # upper bound for the attempt counter of get_routeList
RETRY_WAIT_SECONDS = 600   # pause between two attempts for the same route

# Pool of User-Agent strings; one is picked at random for every search request.
userAgent = [
    "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36",
]

# Keys of the ``flight`` dict that are copied into the output row.
_FLIGHT_KEYS = (
    'airlineCode', 'airlineName', 'durationDays', 'flightNumber', 'mealFlag',
    'mealType', 'comfort', 'craftKind', 'craftTypeCode',
    'craftTypeKindDisplayName', 'craftTypeName', 'delayedTime', 'oilFee',
    'punctualityRate', 'sharedFlightName', 'sharedFlightNumber',
    'specialCraft', 'stopInfo', 'stopTimes', 'tax',
)
# Output column names that differ from the key they are read from.
_FLIGHT_COLUMN_RENAMES = {'airlineName': 'AirlineName'}

# (output column, key inside ``<side>AirportInfo``) pairs.  The column
# capitalisation is not uniform between the two sides; it is kept as is so
# the CSV header stays stable.
_AIRPORT_COLUMNS = {
    'arrival': (
        ('arrivalairportName', 'airportName'),
        ('arrivalairportTlc', 'airportTlc'),
        ('arrivalcityName', 'cityName'),
        ('arrivalcityTlc', 'cityTlc'),
    ),
    'departure': (
        ('departureairportName', 'airportName'),
        ('departureairportTlc', 'airportTlc'),
        ('departureCityName', 'cityName'),
        ('departureCityTlc', 'cityTlc'),
    ),
}

# Keys of the ``characteristic`` dict copied before / after the per-cabin
# ``price<cabinClass>`` columns.
_CHARACTERISTIC_KEYS_HEAD = (
    'businessAircraft', 'discountAmount', 'discountShowType', 'flyMan',
    'groupTicketPrice', 'hotFlight', 'hx', 'infantSoldOut',
    'lowPriceDiscount', 'lowestBabyCfPrice', 'lowestBabyPrice',
    'lowestCfPrice', 'lowestChildAdultCfPrice', 'lowestChildAdultPrice',
    'lowestChildCfPrice', 'lowestChildPrice', 'lowestPrice', 'promotion',
    'providerHx', 'roundTripDiscounts',
)
_CHARACTERISTIC_KEYS_TAIL = ('superFlyMan', 'weight')


def getCityMsg():
    """Fetch the city catalogue and return the "hot" cities.

    Returns:
        dict mapping city name -> ``[cityId, cityNumId]``, where both ids are
        taken from the ``|``-separated ``data`` string of each entry (fields
        3 and 2 respectively) and the name is field 1 with any ``(...)``
        suffix removed.  Empty when the response has no ``热门`` group.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
        "Referer": "https://flights.ctrip.com/itinerary",
        "Content-Type": "application/json",
    }
    r = requests.get(url=CITY_URL, headers=headers, timeout=REQUEST_TIMEOUT).text
    city_load = json.loads(r).get('data') or {}

    # Only the '热门' (hot) group is crawled.
    # NOTE(review): the other groups appear to map a letter to a list of
    # entries with the same 'data' format; iterate them to cover all cities
    # -- confirm against a live response.
    city = {}
    for entry in city_load.get('热门') or []:
        name = entry.get('data').split('|')
        cityNumId = name[2]
        cityId = name[3]
        cityName = name[1].split('(')[0]
        city[cityName] = [cityId, cityNumId]
    return city


def get_date():
    """Return the next 7 dates, starting tomorrow, as ``YYYY-MM-DD`` strings."""
    today = datetime.date.today()
    return [
        (today + datetime.timedelta(days=offset)).strftime('%Y-%m-%d')
        for offset in range(1, 8)
    ]


def get_routeList(headers, load_json, cnt):
    """POST one search request and return its ``routeList``.

    Args:
        headers: HTTP headers for the request.
        load_json: search payload; its first ``airportParams`` entry is used
            for the failure message.
        cnt: attempt counter to start from (callers pass 1).

    Returns:
        The ``routeList`` value of the response's ``data`` object, or ``None``
        once the attempt counter exceeds ``MAX_ATTEMPTS``.
    """
    while True:
        try:
            response = requests.post(
                url=PRODUCTS_URL,
                data=json.dumps(load_json),
                headers=headers,
                timeout=REQUEST_TIMEOUT,
            ).text
            return json.loads(response)["data"].get('routeList')
        except (requests.RequestException, ValueError, KeyError,
                TypeError, AttributeError):
            # Network failure, non-JSON body, or a body without a usable
            # 'data' object: report it and try again after a long pause.
            leg = load_json.get('airportParams')[0]
            print('Get 【{} --> {}】 Page is failed !'.format(leg.get('dcityname'), leg.get('acityname')))
            cnt += 1
            if cnt > MAX_ATTEMPTS:
                return None
            print('休息10m后再来……')
            time.sleep(RETRY_WAIT_SECONDS)


def _airport_row(flight, side):
    """Return the airport/terminal/date columns for ``side`` ('arrival' or 'departure')."""
    info = flight.get(side + 'AirportInfo') or {}
    terminal = info.get('terminal') or {}
    row = {column: info.get(key) for column, key in _AIRPORT_COLUMNS[side]}
    row[side + 'TerminalName'] = terminal.get('name')
    row[side + 'Date'] = flight.get(side + 'Date')
    return row


def _flight_row(flight):
    """Return the output columns derived from a ``flight`` dict."""
    row = {
        _FLIGHT_COLUMN_RENAMES.get(key, key): flight.get(key)
        for key in _FLIGHT_KEYS
    }
    row.update(_airport_row(flight, 'arrival'))
    row.update(_airport_row(flight, 'departure'))
    return row


def _characteristic_row(charactor):
    """Return the output columns derived from a ``characteristic`` dict."""
    row = {key: charactor.get(key) for key in _CHARACTERISTIC_KEYS_HEAD}
    for stdPrice in charactor.get('standardPrices') or []:
        cabin = stdPrice.get('cabinClass')
        if cabin is not None:  # a price without a cabin class has no column
            row['price' + str(cabin)] = stdPrice.get('price')
    for key in _CHARACTERISTIC_KEYS_TAIL:
        row[key] = charactor.get(key)
    return row


def get_data(index, df, routeList):
    """Append one row per direct flight in ``routeList`` to ``df``.

    Only routes whose ``routeType`` is ``'Flight'`` are used, and only the
    first element of their ``legs`` list is read.

    Args:
        index: label of the last row already used; it is incremented for
            every ``'Flight'`` route, including routes that yield no data.
        df: DataFrame holding the rows collected so far (may be empty).
        routeList: list of route dicts, or ``None``.

    Returns:
        ``(index, df)`` -- the updated counter and a DataFrame containing the
        previous rows followed by the new ones.  Rows are collected first and
        added in one step, so the returned frame is a new object whenever
        rows were added.
    """
    if routeList is None:
        return (index, df)

    rows = []
    labels = []
    for route in routeList:
        if route.get('routeType') != 'Flight':  # direct flights only
            continue
        index += 1
        legs = route.get('legs') or []
        if not legs:
            continue
        leg = legs[0]

        row = {}
        flight = leg.get('flight')
        if flight is not None:
            row.update(_flight_row(flight))
        charactor = leg.get('characteristic')
        if charactor is not None:
            row.update(_characteristic_row(charactor))

        if row:
            rows.append(row)
            labels.append(index)

    if rows:
        new_rows = pd.DataFrame(rows, index=labels)
        df = new_rows if df.empty else pd.concat([df, new_rows], sort=False)
    return (index, df)


def main(city):
    """Crawl every ordered pair of cities for each date and write one CSV per date.

    Args:
        city: dict mapping city name -> ``[cityId, cityNumId]`` as returned by
            ``getCityMsg``.

    Returns:
        ``(index, df)`` of the last departure date processed.
    """
    index, df = 0, pd.DataFrame()
    for flightDate in get_date():  # departure date
        df = pd.DataFrame()
        index = 0
        print(flightDate, end= '\t')

        # To resume after a failure, restrict this loop to the departure
        # cities that have not been crawled yet.
        for (fromCityName, fromCityId) in city.items():  # departure city
            print(fromCityName,end=':')
            for (toCityName, toCityId) in city.items():  # arrival city
                if fromCityName == toCityName:
                    continue
                cnt = 1  # attempt counter for this route
                print(toCityName,end='\t')
                headers = {
                    "User-Agent": random.choice(userAgent),
                    "origin": "https://flights.ctrip.com",
                    "content-type": "application/json",
                }
                load_json = {
                    "airportParams": [
                        {
                            "dcity": fromCityId[0],
                            "dcityname": fromCityName,
                            "acity": toCityId[0],
                            "acityname": toCityName,
                            "date": flightDate,
                            "dcityid": fromCityId[1],
                            "acityid": toCityId[1],
                        }
                    ],
                    "classType": "ALL",
                    "date": flightDate,
                    "flightWay": "Oneway",
                    "hasBaby": False,
                    "hasChild": False,
                    "searchIndex": 1,
                    "token": "a4d91efc14f95ad7e1abaf914da140f3",
                }
                routeList = get_routeList(headers, load_json, cnt)
                if routeList is not None:  # skip pairs without flights
                    (index, df) = get_data(index, df, routeList)
                # Short random pause (0-1 s) between two searches.
                time.sleep(random.randrange(2))
            print('\n' + '--'*50)
            time.sleep(random.randrange(3))
            print('【{}】起飞,抓完!'.format(fromCityName))
            # Long random pause (60-89 s) between two departure cities.
            time.sleep(random.randrange(60, 90))

        print(df.shape)
        csv_path = '【{}】起飞航班.csv'.format(flightDate)
        print('起飞日期:{},抓完,写入文件!'.format(flightDate))
        print(csv_path)
        df.to_csv(csv_path,index=False, encoding='utf-8')
    return (index, df)


if __name__ == "__main__":
    city = getCityMsg()
    (index, df) = main(city)
降低爬取速度可用