19、我家附近有啥好吃的(饿了么)
练习介绍
要求:
在本练习中,我们会借助cookies的相关知识,使用Python登录饿了么网站,爬取自己家附近的餐厅列表。
目的:
练习掌握cookies和session的用法
练习post和get请求
练习json数据的解析提取
反爬虫应对策略
1 import requests 2 import json 3 4 headers = { 5 'content-type': 'application/json; charset=utf-8', 6 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36' 7 } 8 9 def ele_login(): 10 11 url_mobile_send_code = 'https://h5.ele.me/restapi/eus/login/mobile_send_code' 12 url_login_by_mobile = 'https://h5.ele.me/restapi/eus/login/login_by_mobile' 13 14 dict_mobile_send_code = {"mobile":"","captcha_value":"","captcha_hash":"","scf":"ms"} 15 dict_login_by_mobile = {"mobile":"","validate_code":"","validate_token":"","scf":"ms"} 16 17 mobile = input('请输入您的手机号码:') 18 dict_mobile_send_code['mobile'] = mobile 19 20 res_mobile_send_code = requests.post(url_mobile_send_code,json=dict_mobile_send_code,headers=headers) 21 22 23 dict_login_by_mobile['mobile'] = mobile 24 dict_login_by_mobile['validate_token'] = res_mobile_send_code.json()['validate_token'] 25 dict_login_by_mobile['validate_code'] = input('请输入您收到的六位手机验证码:') 26 27 28 res_login_by_mobile = requests.post(url_login_by_mobile,json=dict_login_by_mobile,headers=headers) 29 30 cookies_dict = requests.utils.dict_from_cookiejar(res_login_by_mobile.cookies) 31 32 cookies_str = json.dumps(cookies_dict) 33 34 with open('cookies.str','w',encoding='utf-8') as strfile: 35 strfile.write(cookies_str) 36 37 def ele_cookies_check(): 38 try: 39 cookies_str_read = open('cookies.str','r') 40 except: 41 ele_login() 42 43 cookies_str_read = open('cookies.str','r') 44 cookies_dict_read = json.loads(cookies_str_read.read()) 45 cookies = requests.utils.cookiejar_from_dict(cookies_dict_read) 46 47 if requests.get('https://www.ele.me/restapi/eus/v1/current_user?info_raw={}',headers=headers,cookies=cookies).json() == 0 : 48 ele_login() 49 50 cookies_str_read = open('cookies.str','r') 51 cookies_dict_read = json.loads(cookies_str_read.read()) 52 cookies = requests.utils.cookiejar_from_dict(cookies_dict_read) 53 54 return cookies 55 56 def ele_address(): 57 58 
ele_address_url = 'https://www.ele.me/restapi/v2/pois' 59 60 my_address = input('请输入你的地址:') 61 print('\n') 62 63 ele_address_params = { 64 'extras[]': 'count', 65 'geohash': 'wx4g0bmjetr7', 66 'keyword': my_address, 67 'limit': '20', 68 'type': 'nearby', 69 } 70 71 res_ele_address = requests.get(ele_address_url,headers=headers,cookies=cookies,params=ele_address_params) 72 73 my_address_name = res_ele_address.json()[0]['name'] 74 my_address_latitude = res_ele_address.json()[0]['latitude'] 75 my_address_longitude = res_ele_address.json()[0]['longitude'] 76 my_address_geohash = res_ele_address.json()[0]['geohash'] 77 78 79 ele_address_dict = {'my_address_name':my_address_name,'my_address_latitude':my_address_latitude,'my_address_longitude':my_address_longitude,'my_address_geohash':my_address_geohash} 80 81 return ele_address_dict 82 83 84 def ele_main(): 85 86 my_ele_address = ele_address() 87 88 ele_restaurants_url = 'https://www.ele.me/restapi/shopping/restaurants' 89 90 ele_restaurants_params = { 91 'extras[]': 'activities', 92 'geohash': my_ele_address['my_address_geohash'], 93 'latitude': str(my_ele_address['my_address_latitude']), 94 'limit': '10', 95 'longitude': str(my_ele_address['my_address_longitude']), 96 'offset': '0', 97 'terminal': 'web' 98 } 99 100 res_ele_restaurants = requests.get(ele_restaurants_url,headers=headers,cookies=cookies,params=ele_restaurants_params) 101 102 ele_restaurants_json = res_ele_restaurants.json() 103 104 for restaurants in ele_restaurants_json: 105 print(restaurants['name']) 106 107 cookies = ele_cookies_check() 108 109 ele_main() 110 111 ---------------------------------- 112 执行结果:(因为之前执行代码的时候保存过cookies,所有本次执行的时候不需要重新登录获取cookies) 113 114 115 请输入你的地址:十里河 116 117 曼玲粥店(十里河店) 118 秦唐味道(新业店) 119 京味斋(松榆里店) 120 南城香(潘家园店) 121 望京小腰(潘家园店) 122 田老师红烧肉(十里河店) 123 卷饼王炸串(十八里店) 124 金百万烤鸭(劲松店) 125 人民公社家东北菜 126 串小贱秘制烤鸭肠(十里河店)
老师的代码
import requests

# One session object keeps the login cookies for every later request.
session = requests.session()

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
}

# Step 1: ask ele.me to text a verification code to the user's phone.
url_1 = 'https://h5.ele.me/restapi/eus/login/mobile_send_code'
tel = input('请输入手机号码:')
data_1 = {'captcha_hash': '',
          'captcha_value': '',
          'mobile': tel,
          'scf': ''}

token = session.post(url_1, headers=headers, data=data_1).json()['validate_token']

# Step 2: log in with the code; the session stores the returned cookies.
url_2 = 'https://h5.ele.me/restapi/eus/login/login_by_mobile'
code = input('请输入手机验证码:')
data_2 = {'mobile': tel,
          'scf': 'ms',
          'validate_code': code,
          'validate_token': token}

session.post(url_2, headers=headers, data=data_2)

# Step 3: look up candidate delivery addresses for a keyword.
address_url = 'https://www.ele.me/restapi/v2/pois?'
place = input('请输入你的收货地址:')
params = {'extras[]': 'count', 'geohash': 'ws105rz9smwm', 'keyword': place, 'limit': '20', 'type': 'nearby'}
# 'ws105rz9smwm' is a Shenzhen geohash used as the search anchor.

# Use the logged-in session here too — the original used a bare requests.get,
# which silently dropped the headers and the login cookies.
address_res = session.get(address_url, headers=headers, params=params)
address_json = address_res.json()

print('以下,是与' + place + '相关的位置信息:\n')
for n, address in enumerate(address_json):
    print(str(n) + '. ' + address['name'] + ':' + address['short_address'] + '\n')
address_num = int(input('请输入您选择位置的序号:'))
final_address = address_json[address_num]

# Step 4: fetch the restaurants around the chosen address.
# This is the XHR endpoint that carries the restaurant list.
restaurants_url = 'https://www.ele.me/restapi/shopping/restaurants?'
params = {'extras[]': 'activities',
          'geohash': final_address['geohash'],   # geohash and coordinates come from the chosen POI
          'latitude': final_address['latitude'],
          'limit': '24',
          'longitude': final_address['longitude'],
          'offset': '0',
          'terminal': 'web'
          }
restaurants_res = session.get(restaurants_url, params=params)
# The top level of the JSON is a list; each element is one restaurant's dict.
restaurants = restaurants_res.json()
for restaurant in restaurants:
    print(restaurant['name'])
一份总结:
就是这样一个代码,它能拿到给定位置附近的餐厅名。但它的潜力并不只是如此。
如果我们尝试加载饿了么官网的首页,能够找到一个xhr叫做cities,这个xhr里包含了全国两千多个城市的经纬度。
利用Python的geohash模块,你可以将经纬度数据,转化为geohash(当然,也可以将geohash转为经纬度,我也是用这种方式,发现我的默认geohash是深圳)。
那么在理论上,其实你可以通过这种方式,拿到全国餐厅的数据……
只要稍做扩展,它还能拿到许多数据:所有的餐厅名/电话号码/评分/品牌/经纬度/介绍/均价/月销量……
此时,这个爬虫就具备了商业价值,它能胜任许多数据分析的工作:选址策略、定价策略、差异化竞争、2B营销……
或许你会质疑自己能不能做到像我描述的这样厉害,不用怕,很快你就能够做到对此心中有数。
而在后续的关卡,当你学会反爬虫的应对策略、协程、Scrapy框架……你会变得像我说的那样强大。
再会!