Scraping ofo shared-bike data

#!/usr/bin/env python
# -*- coding:utf-8 -*-
 
# Author : zhibo.wang
# E-mail : d_1206@qq.com
# Desc   : ofo


import time
import random
import socket
import hashlib
import datetime
import threading
import requests
import numpy as np
from more_itertools import chunked
from requests_toolbelt import MultipartEncoder
# Note: OSS2, Weixin, mongo_con_keepalive, bd09togcj02 and create_file_name are
# the author's own helpers (OSS upload, WeChat notification, keep-alive MongoDB
# connection, BD-09 -> GCJ-02 conversion, file naming) and are not shown in this post.



class Crawler:
    oss = OSS2()
    W = Weixin()
    def __init__(self):
        # self.city_code = [131, 289, 257, 340]
        # self.city_code = {"citycode": 257, "cityname": "广州市"}   # city code
        self.city_code = {"citycode": 131, "cityname": "北京市"}     # city code
        self.timeout = 10         # request timeout in seconds
        self.offset = 0.0022      # step (in degrees) used to sweep each grid cell
        self.indexs = None
        self.db = mongo_con_keepalive()
        self.start_time = datetime.datetime.now()
        self.url = "https://san.ofo.so/ofo/Api/nearbyofoCar"
        self.wait_time = [0.9, 1, 1.1, 1.2, 1.3]  # delay between requests in seconds
        # User tokens, captured with a packet-capture tool; the real values are
        # redacted here, but each entry must include the "token" used in request()
        self.keys = [
            {
                "token": "<token captured from the app>",
                "Content-Type": "multipart/form-data; boundary=--------FormDataxxx",
                "boundary": "--------FormDataxxx",
            },
            {
                "token": "<another captured token>",
                "Content-Type": "multipart/form-data; boundary=--------FormDataxxx",
                "boundary": "--------FormDataxxx",
            },
        ]
        self.headers = {
                        "Accept": "*/*",
                        "Host": "san.ofo.so",
                        "Accept-Language": "zh-CN",
                        "Origin": "https://common.ofo.so",
                        "Accept-Encoding": "gzip, deflate",
                        "Referer": "https://common.ofo.so/newdist/?Journey",
                        "User-Agent":"Mozilla/5.0 (iPhone; CPU iPhone OS 11_2_1 like Mac OS X) AppleWebKit/604.4.7 (KHTML, like Gecko) Mobile/15C153 MicroMessenger/6.6.0 NetType/WIFI Language/zh_CN"
                       }
        
    def request(self, lat, lng, gridid):
        # Fetch nearby cars for a single coordinate
        key = random.choice(self.keys)
        fields = {
            "token": key["token"],
            "lat": lat,
            "lng": lng,
            "source": "-5",
        }
        multipart_encoder = MultipartEncoder(fields=fields,
                                             boundary=key["boundary"])
        headers = dict(self.headers)  # copy so threads do not mutate the shared dict
        headers["Content-Type"] = key["Content-Type"]
        date = datetime.datetime.now()
        try:
            response = requests.post(url=self.url, headers=headers,
                                     data=multipart_encoder, timeout=self.timeout)
            data = response.json()
            if data["errorCode"] == 200:
                if len(data["values"]["info"]["cars"]) != 0:
                    file_name = self.create_file_name("{0},{1}".format(lat, lng), ".json")
                    data["center_lng"], data["center_lat"] = float(lng), float(lat)
                    data["citycode"] = self.city_code["citycode"]
                    data["cityname"] = self.city_code["cityname"]
                    data["gridid"] = gridid
                    # The persistence step (storing `data` under `file_name`) is
                    # omitted in the post.
            else:
                print(data)
        except Exception as e:
            print("request error: ", e)

    def get_city_gridid(self, gridid_data, db):
        # Sweep each grid cell, generating coordinates spaced by self.offset
        for i in gridid_data:
            gridid = i["gridid"]
            print("gridid: ", gridid)
            try:
                # Convert the cell corners from BD-09 to GCJ-02
                left_lng, top_lat = bd09togcj02(i["left_lng"], i["top_lat"])
                right_lng, bottom_lat = bd09togcj02(i["right_lng"], i["bottom_lat"])
                lat_range = np.arange(float(bottom_lat), float(top_lat), self.offset)[1:]
                lng_range = np.arange(float(left_lng), float(right_lng), self.offset)[1:]
                for lat in lat_range:
                    for lng in lng_range:
                        self.request(str(lat), str(lng), gridid)
                        time.sleep(random.choice(self.wait_time))
            except Exception as e:
                print("get_city_gridid error:", i, e)


    def start(self):
        cursor = self.db.get_collection("active_grids").find(
            {"citycode": self.city_code["citycode"]}, no_cursor_timeout=True)
        all_data = list(cursor)
        print("count: ", len(all_data))
        # Split the grid cells into roughly one chunk per token and crawl them in parallel
        all_data_list = list(chunked(all_data, max(1, len(all_data) // len(self.keys))))
        p = []
        for i in range(0, len(all_data_list)):
            t = threading.Thread(target=self.get_city_gridid, args=(all_data_list[i], self.db))
            p.append(t)

        for x in p:
            x.start()

        for x in p:
            x.join()


if __name__ == "__main__":
    c = Crawler()
    c.start()
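
The script calls a bd09togcj02 helper to convert the Baidu (BD-09) grid corners into GCJ-02 coordinates before sweeping them, but the helper itself is not included in the post. Below is a minimal sketch based on the commonly used BD-09 -> GCJ-02 approximation; the function name matches the call above, but the body is an assumption rather than the author's original code.

import math

X_PI = math.pi * 3000.0 / 180.0

def bd09togcj02(bd_lng, bd_lat):
    # Standard BD-09 -> GCJ-02 approximation (assumed implementation)
    bd_lng, bd_lat = float(bd_lng), float(bd_lat)
    x = bd_lng - 0.0065
    y = bd_lat - 0.006
    z = math.sqrt(x * x + y * y) - 0.00002 * math.sin(y * X_PI)
    theta = math.atan2(y, x) - 0.000003 * math.cos(x * X_PI)
    return z * math.cos(theta), z * math.sin(theta)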
  

An example of a single stored car record:

{
    "carno": "EXxvn8",
    "ordernum": "",
    "userIdLast": "1",
    "lng": 113.24468731813714,
    "lat": 23.273194605097277,
    "Time": "2018-03-27 19:37:16",
    "recordBatchNo": "19"
}
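
The cars list returned by the API can be flattened into per-bike records of this shape before writing them to MongoDB. This is only a sketch of that step: the "ofo_cars" collection name and the batch_no argument are assumptions, and the per-car fields are taken from the sample record above.

import datetime

def save_cars(db, data, batch_no):
    # Flatten the API response into one document per bike and bulk-insert it.
    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    records = []
    for car in data["values"]["info"]["cars"]:
        records.append({
            "carno": car.get("carno"),
            "ordernum": car.get("ordernum", ""),
            "userIdLast": car.get("userIdLast"),
            "lng": car.get("lng"),
            "lat": car.get("lat"),
            "Time": now,
            "recordBatchNo": batch_no,
        })
    if records:
        db.get_collection("ofo_cars").insert_many(records)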
  

  
