# ofo小黄车数据抓取 — ofo shared-bike data crawler
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author : zhibo.wang
# E-mail : d_1206@qq.com
# Desc   : ofo shared-bike crawler

import time
import random
import socket
import hashlib
import datetime
import threading

import numpy as np
import requests  # BUG FIX: requests.post was used but never imported
from more_itertools import chunked
from requests_toolbelt import MultipartEncoder

# NOTE(review): the imports for OSS2, Weixin, mongo_con_keepalive and
# bd09togcj02 were lost when this file was mangled -- restore the
# project-local imports before running.


class Crawler:
    """Sweep a city's active grid cells and probe the ofo "nearby cars" API
    at regular lat/lng steps inside each cell, one worker thread per token.
    """

    oss = OSS2()   # project helper (import missing from this chunk)
    W = Weixin()   # project helper (import missing from this chunk)

    def __init__(self):
        # self.city_code = {"citycode": 257, "cityname": "广州市"}  # city code
        self.city_code = {"citycode": 131, "cityname": "北京市"}  # city code
        self.timeout = 10     # HTTP timeout in seconds
        self.offset = 0.0022  # grid step (degrees) between sample points
        self.indexs = None
        self.db = mongo_con_keepalive()
        self.start_time = datetime.datetime.now()
        self.url = "https://san.ofo.so/ofo/Api/nearbyofoCar"
        self.wait_time = [0.9, 1, 1.1, 1.2, 1.3]  # per-request sleep choices
        # User tokens; capture with a packet-sniffing tool.  Each entry must
        # also carry a "token" field -- the placeholders below have none.
        self.keys = [
            {"Content-Type": "multipart/form-data; boundary=--------FormDataxxx",
             "boundary": "--------FormDataxxx"},
            {"Content-Type": "multipart/form-data; boundary=--------FormDataxxx",
             "boundary": "--------FormDataxxx"},
        ]
        self.headers = {
            "Accept": "*/*",
            "Host": "san.ofo.so",
            "Accept-Language": "zh-CN",
            "Origin": "https://common.ofo.so",
            "Accept-Encoding": "gzip, deflate",
            "Referer": "https://common.ofo.so/newdist/?Journey",
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_2_1 like Mac OS X) AppleWebKit/604.4.7 (KHTML, like Gecko) Mobile/15C153 MicroMessenger/6.6.0 NetType/WIFI Language/zh_CN",
        }

    def request(self, lat, lng, gridid):
        """POST one (lat, lng) probe to the nearby-car API and tag the
        response with grid/city metadata.

        Network errors propagate to the caller (get_city_gridid catches
        them); JSON/shape errors are caught and printed here.
        """
        key = random.choice(self.keys)
        fields = {
            # BUG FIX: the placeholder key dicts carry no "token"; .get avoids
            # a guaranteed KeyError until real tokens are filled in.
            "token": key.get("token", ""),
            "lat": lat,
            "lng": lng,
            "source": "-5",
        }
        multipart_encoder = MultipartEncoder(fields=fields,
                                             boundary=key["boundary"])
        # BUG FIX: copy instead of aliasing -- the original mutated the shared
        # self.headers dict from multiple worker threads.
        headers = dict(self.headers)
        headers["Content-Type"] = key["Content-Type"]
        response = requests.post(url=self.url, headers=headers,
                                 data=multipart_encoder, timeout=self.timeout)
        try:
            data = response.json()
            if data["errorCode"] == 200:
                if len(data["values"]["info"]["cars"]) != 0:
                    file_name = self.create_file_name(
                        "{0},{1}".format(lat, lng), ".json")
                    data["center_lng"], data["center_lat"] = float(lng), float(lat)
                    data["citycode"] = self.city_code["citycode"]
                    data["cityname"] = self.city_code["cityname"]
                    data["gridid"] = gridid
                    # TODO(review): the code that persisted `data` (presumably
                    # via self.oss and file_name) was lost in this chunk --
                    # restore it; file_name is currently unused.
            else:
                print(data)
        except Exception as e:
            print("request error: ", e)

    def get_city_gridid(self, gridid_data, db):
        """Walk every grid cell in gridid_data, sampling points every
        self.offset degrees and issuing one API request per point."""
        for i in gridid_data:
            gridid = i["gridid"]
            print("gridid: ", gridid)
            try:
                # Convert the BD-09 (Baidu) cell corners to GCJ-02.
                left_lng, top_lat = bd09togcj02(i["left_lng"], i["top_lat"])
                right_lng, bottom_lat = bd09togcj02(i["right_lng"], i["bottom_lat"])
                # [1:] skips the cell edge so probes fall strictly inside it.
                lat_range = np.arange(float(bottom_lat), float(top_lat),
                                      self.offset)[1:]
                for lat in lat_range:
                    lng_range = np.arange(float(left_lng), float(right_lng),
                                          self.offset)[1:]
                    for lng in lng_range:
                        self.request(str(lat), str(lng), gridid)
                        time.sleep(random.choice(self.wait_time))
            except Exception as e:
                print("get_city_gridid error:", i, e)

    def start(self):
        """Split the city's active grids across one worker thread per token
        and crawl them in parallel."""
        all_data = self.db.get_collection("active_grids").find(
            {"citycode": self.city_code["citycode"]}, no_cursor_timeout=True)
        print("count: ", all_data.count())
        # BUG FIX: guard against a zero chunk size (count < number of keys),
        # which made more_itertools.chunked raise ValueError.
        chunk_size = max(1, int(all_data.count() / len(self.keys)))
        all_data_list = list(chunked(list(all_data), chunk_size))
        workers = []
        for batch in all_data_list:
            t = threading.Thread(target=self.get_city_gridid,
                                 args=(batch, self.db))
            workers.append(t)
        for t in workers:
            t.start()
        for t in workers:
            t.join()


if __name__ == "__main__":
    c = Crawler()
    c.start()

# Sample car record (stray paste at the end of the original file, kept for
# reference):
# {"carno": "EXxvn8", "ordernum": "", "userIdLast": "1",
#  "lng": 113.24468731813714, "lat": 23.273194605097277,
#  "Time": "2018-03-27 19:37:16", "recordBatchNo": "19"}