用 Python 抓取摩拜单车 API 数据
最近摩拜单车超级火,下班后各种骑,结果把膝盖骑伤了……
-----------------------------------------------------------------------
看到这篇文章http://mp.weixin.qq.com/s?__biz=MzA5NDExMTAzNA==&mid=2649982414&idx=1&sn=68b638c4f019baa3a783c045b294d6de&chksm=8854b19bbf23388dd91509a2a692736a5c0505ab144c5605d4015330044699c20e5c42f92496&mpshare=1&scene=1&srcid=0519Fkgd6ew4TuOHUerHMlZs#rd
文章里说好会给源码,但貌似没给……
于是就自己写了一份。
--------------------------------------------------------------------------------------
import requests
import numpy as np
import json
import multiprocessing
from requests.packages.urllib3.exceptions import InsecureRequestWarning
# Silence urllib3's InsecureRequestWarning; the crawler below posts with
# verify=False, which would otherwise emit this warning.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
# Number of worker processes; the crawl area is split into this many strips.
PROCESS_NUM = 4

# Simulated request headers sent with every API call (the user-agent and
# referer values imitate a WeChat / MicroMessenger client).
HEADERS = {
    "charset": "utf-8",
    "platform": "4",
    "referer": "https://servicewechat.com/wx40f112341ae33edb/1/",
    "content-type": "application/x-www-form-urlencoded",
    "user-agent": "MicroMessenger/6.5.4.1000 NetType/WIFI Language/zh_CN",
    "host": "mwx.mobike.com",
    "connection": "Keep-Alive",
    "accept-encoding": "gzip",
    "cache-control": "no-cache",
}

# Initial bounding box: part of Lucheng District.
# NOTE: despite the names, LEFT/RIGHT hold latitudes and TOP/BOTTOM hold
# longitudes -- worker() posts the left..right range as 'latitude' and the
# top..bottom range as 'longitude'.
INIT_POS = (120.644784, 28.026735, 120.715068, 27.99344)
TOP, LEFT, BOTTOM, RIGHT = INIT_POS
def split_pos(init_pos, num):
    """Split a bounding box into ``num`` equal-width strips along its first axis.

    Each strip is intended to be handed to one worker process.

    Args:
        init_pos: sequence ``(x1, y1, x2, y2)``. Only the ``x1..x2`` span is
            divided; ``y1`` and ``y2`` are copied into every strip unchanged.
        num: number of strips to produce.

    Returns:
        A list of ``num`` tuples ``(start, y1, end, y2)`` whose x-spans are
        contiguous and together cover ``x1..x2`` exactly. The list is empty
        when ``num`` is smaller than 1.
    """
    if num < 1:
        # Nothing to split into; avoids np.linspace raising on a negative count.
        return []
    x1, y1, x2, y2 = init_pos[:4]
    # num strips need num + 1 boundaries; tolist() yields plain Python floats.
    edges = np.linspace(x1, x2, num + 1).tolist()
    # Neighbouring strips share a boundary; overlapping them slightly is a
    # possible refinement that is intentionally not implemented.
    return [(start, y1, end, y2) for start, end in zip(edges, edges[1:])]
# NOTE: the records could be written straight to MongoDB instead; it was not
# installed on the machine this script was written on.
def save(text, output):
    """Parse one API response and append its records to ``output`` as text rows.

    The response is expected to be a JSON object whose ``'object'`` entry is a
    list of records (JSON objects). For every record the values are emitted in
    sorted-key order, joined with ``'$'`` and terminated by a newline; ``null``
    values become empty strings.

    Problems are reported on stdout and never raised:
        * ``***ERR1***`` -- ``text`` is not valid JSON; nothing is written.
        * ``***ERR2***`` -- the payload has no list under ``'object'``;
          nothing is written.
        * ``***ERR3***`` -- a single record is not a JSON object; that record
          is skipped and the remaining ones are still written.

    Args:
        text: raw response body.
        output: writable text stream (anything with a ``write(str)`` method).
    """
    try:
        json_data = json.loads(text)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers non-text input.
        print('***ERR1*** ', text)
        return

    records = json_data.get('object') if isinstance(json_data, dict) else None
    if not isinstance(records, list):
        print('***ERR2*** ', text)
        return

    for record in records:
        if not isinstance(record, dict):
            print('***ERR3*** ', record)
            continue
        # Sorting the keys keeps the column order stable from row to row.
        fields = [
            '' if record[key] is None else str(record[key])
            for key in sorted(record)
        ]
        output.write('$'.join(fields) + '\n')
# Crawl Mobike data. Anti-scraping limits are not handled here; an IP pool or
# re-dialling could be considered. This is only a toy.
def worker(num, location, step=0.002, timeout=10):
    """Query the nearby-bikes API over a grid of points and dump the responses.

    Two files are (re)created in the current directory:
    ``worker<num>_rawdata.txt`` receives every raw response body, one per
    line, and ``worker<num>_tabulardata.txt`` receives the rows produced by
    ``save()``.

    Args:
        num: worker index, used only to build the output file names.
        location: tuple ``(top, left, bottom, right)``. The value posted as
            ``latitude`` runs from ``left`` downwards towards ``right``; the
            value posted as ``longitude`` runs from ``top`` upwards towards
            ``bottom`` (both end points exclusive, as with ``np.arange``).
        step: grid spacing applied on both axes (default 0.002, the value
            that was previously hard-coded).
        timeout: per-request timeout in seconds handed to ``requests``.
    """
    url = 'https://mwx.mobike.com/mobike-api/rent/nearbyBikesInfo.do'
    # Form fields of the request; latitude/longitude are overwritten per point.
    postdata = {
        'latitude': 0,
        'longitude': 0,
        'errMsg': 'getMapCenterLocation'
    }
    (top, left, bottom, right) = location
    lat_range = np.arange(left, right, -step)
    # The longitude sweep is identical for every latitude, so build it once.
    lon_range = np.arange(top, bottom, step)

    # Context managers guarantee files and session are closed even on errors.
    with open('worker' + str(num) + '_rawdata.txt', 'w', encoding='utf-8') as raw_fd, \
            open('worker' + str(num) + '_tabulardata.txt', 'w', encoding='utf-8') as tabular_fd, \
            requests.session() as session:
        for lat in lat_range:
            for lon in lon_range:
                postdata['latitude'] = lat
                postdata['longitude'] = lon
                try:
                    # SECURITY NOTE: verify=False disables TLS certificate
                    # checking; kept as in the original script.
                    mobike_data = session.post(url, headers=HEADERS, data=postdata,
                                               verify=False, timeout=timeout)
                except requests.RequestException as err:
                    # One failed or timed-out request must not kill the whole
                    # strip; report it and move on to the next grid point.
                    print('***ERR0*** ', lat, lon, err)
                    continue
                # Keep the raw payload as well; it is worth preserving.
                raw_fd.write(mobike_data.text + '\n')
                # Parse and store. The rows still need de-duplication, which is
                # better done afterwards in a separate single-process step.
                save(mobike_data.text, tabular_fd)
# Script entry point.
if __name__ == '__main__':
    jobs = []
    # One strip of the initial area per process.
    location = split_pos(INIT_POS, PROCESS_NUM)
    # Multiprocessing is used here; threads would work as well.
    for process_id, strip in enumerate(location):
        p = multiprocessing.Process(target=worker, args=(process_id, strip))
        jobs.append(p)
        p.start()
    # Wait explicitly for every worker instead of relying on the implicit
    # join performed at interpreter shutdown.
    for p in jobs:
        p.join()