线程污染 重复请求

 

 

 

import os
import sqlite3
import sys
import threading
import time
from urllib.parse import quote

import requests
import xlrd

# Directory that contains this script; data files and the SQLite database
# are looked up next to it.
curPath = os.path.abspath(os.path.dirname(__file__))
# Parent directory, made importable.
rootPath = os.path.split(curPath)[0]
sys.path.append(rootPath)

# Highest ``today_used`` value at which a key is still handed out.
MAX_USED_TIMES = 1980
# NOTE(review): presumably the "daily quota exceeded" text returned by the
# API; it is not referenced anywhere else in the visible code - confirm.
overrun_str = '天配额超限,限制访问'
# Sentinel returned instead of a key when no usable key is left.
DB_KEY_EXHAUST = 'DB_KEY_EXHAUST'

# Path of the SQLite database file. Built with os.path.join so it is also
# valid on platforms whose separator is not a backslash.
db = os.path.join(curPath, 'py_bdspider_status.db')


def db_init_key_table():
    """Reset the ``baidu_map_key_used`` table from ``bdmap_key.txt``.

    Deletes every row of the table (and commits that), then inserts one row
    per line of the tab-separated ``author<TAB>key`` file located next to
    this script. Each row gets ``today_used`` 0 and ``update_time`` set to
    the current local time formatted as ``%y%m%d%H%M%S``. Lines shorter
    than 4 characters are skipped.

    Raises:
        ValueError: if a processed line does not split into exactly two
            tab-separated fields.
    """
    key_file = os.path.join(curPath, 'bdmap_key.txt')
    conn = sqlite3.connect(db)
    try:
        c = conn.cursor()
        c.execute('DELETE  FROM  baidu_map_key_used')
        conn.commit()
        with open(key_file, 'r', encoding='utf-8') as pf:
            for line in pf:
                if len(line) < 4:
                    continue
                author, key = line.replace('\n', '').split('\t')
                localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
                # Bound parameters instead of string formatting: values that
                # contain quotes (or that happen to equal a column name, which
                # SQLite may resolve for double-quoted text) are stored as-is.
                c.execute(
                    'INSERT INTO baidu_map_key_used '
                    '(author,key,update_time,today_used) VALUES (?,?,?,?)',
                    (author, key, localtime_, 0))
        conn.commit()
    finally:
        # Release the connection even when reading or inserting fails.
        conn.close()


# One-off setup: uncomment the call below to rebuild the key table from bdmap_key.txt.
# db_init_key_table()

def db_get_one_effective():
    """Return one API key that is still within its daily quota.

    Returns:
        The ``key`` value of a row of ``baidu_map_key_used`` whose
        ``today_used`` is at most ``MAX_USED_TIMES``, or the
        ``DB_KEY_EXHAUST`` sentinel when no such row exists.
    """
    conn = sqlite3.connect(db)
    try:
        row = conn.execute(
            'SELECT key FROM baidu_map_key_used WHERE today_used<=? ',
            (MAX_USED_TIMES,)).fetchone()
    finally:
        # Previously ``conn.close`` sat after the return statements and was
        # never called, so every lookup leaked a connection.
        conn.close()
    return DB_KEY_EXHAUST if row is None else row[0]


def db_update_one_today_used(key):
    """Count one more use of *key* and refresh its ``update_time``.

    Args:
        key: value of the ``key`` column identifying the row to update.
    """
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    conn = sqlite3.connect(db)
    try:
        # Bound parameters: the timestamp is passed as text (matching how
        # db_init_key_table writes it) rather than being spliced in as an
        # unquoted numeric literal, and the key cannot break the statement.
        conn.execute(
            'UPDATE baidu_map_key_used '
            'SET today_used = today_used+1 ,update_time=? WHERE key=? ',
            (localtime_, key))
        conn.commit()
    finally:
        conn.close()


# Sub-directories (next to this script) for successful responses and for
# failure markers.
dir_, dir_exception = 'baidu_map_uid', 'baidu_map_uid_exception'
# Request names that already have a result file; filled by
# chk_if_requested_file().
requested_file_list = []
# Both strings end with a path separator because the writer functions build
# file names by plain concatenation. os.path.join with a trailing '' keeps
# that separator while using the platform's own separator.
requested_file_dir_str = os.path.join(curPath, dir_, '')
requested_file_dir_exception_str = os.path.join(curPath, dir_exception, '')
# Listing of the result directory, taken once when the module is loaded.
requested_file_dir = os.listdir(requested_file_dir_str)


def chk_if_requested_file():
    """Record the names found in ``requested_file_dir`` in ``requested_file_list``.

    Every directory entry is reduced to its name without a trailing
    ``.txt`` and appended to ``requested_file_list`` unless it is already
    there. Entries without that suffix are recorded unchanged.
    """
    # Set copy for O(1) membership tests; the list itself stays the shared
    # result that callers inspect.
    known = set(requested_file_list)
    for f in requested_file_dir:
        # Remove only a trailing ".txt". Cutting at the first ".txt" would
        # truncate a name that contains that text in the middle, and such a
        # name would then never match its request again.
        to_in = f[:-4] if f.endswith('.txt') else f
        if to_in not in known:
            known.add(to_in)
            requested_file_list.append(to_in)


def write_requested_res(request_name, str_, type_='.txt'):
    """Store *str_* as the result file of *request_name* and log success.

    The file is ``requested_file_dir_str + request_name + type_``, written
    as UTF-8 and overwritten if present. Afterwards ``'ok'``, the current
    thread identifier and the request name are printed.
    """
    out_path = '{}{}{}'.format(requested_file_dir_str, request_name, type_)
    with open(out_path, mode='w', encoding='utf-8') as out_file:
        out_file.write(str_)
    thread_id = threading.get_ident()
    print('ok', thread_id, request_name)


def write_requested_exception_res(request_name, str_, type_='.txt'):
    """Store *str_* as the failure marker file of *request_name*.

    The file is ``requested_file_dir_exception_str + request_name + type_``,
    written as UTF-8 and overwritten if present.
    """
    fname = '%s%s%s' % (requested_file_dir_exception_str, request_name, type_)
    # This function is called from an ``except`` handler; a missing target
    # directory would raise there and end the worker thread, so create it
    # on demand.
    out_dir = os.path.dirname(fname)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(fname, 'w', encoding='utf-8') as ft:
        ft.write(str_)


# Pending work, filled by gen_request_dic_list():
# city -> district -> list of request names.
request_dic = dict()


def gen_request_dic_list():
    """Fill ``request_dic`` with the requests that have no result file yet.

    Reads the first sheet of the task workbook located next to this script,
    skips its first row, and for every remaining row whose
    ``city + district + request_name`` is not in ``requested_file_list``
    records ``request_name`` under ``request_dic[city][district]``
    (sheet order kept, no duplicates within a district).

    Raises:
        ValueError: if a row does not hold exactly nine cells.
    """
    fname_source = '官方上传任务.csv_py170829093808-BD_request_name-REDUCTION170829142821'
    # NOTE(review): xlrd 2.0+ reads only legacy .xls files; opening this
    # .xlsx workbook needs an older xlrd - confirm the installed version.
    workbook_path = os.path.join(curPath, fname_source + '.xlsx')
    table = xlrd.open_workbook(workbook_path).sheets()[0]

    # Refresh the requested-names list once up front: nothing in the loop
    # below creates result files, so repeating the check for every row
    # could not change the outcome.
    chk_if_requested_file()
    already_requested = set(requested_file_list)

    for i in range(1, table.nrows):
        # Column order of the sheet; only three of the nine cells are used.
        (_dbid, _area_code, _name, request_name, _type,
         city, district, _addr, _street) = table.row_values(i)
        if '%s%s%s' % (city, district, request_name) in already_requested:
            continue
        names = request_dic.setdefault(city, {}).setdefault(district, [])
        if request_name not in names:
            names.append(request_name)


# Build the pending-request index as soon as the module is loaded.
gen_request_dic_list()

fname_source = '官方上传任务.csv_py170829093808-BD_request_name-REDUCTION170829142821'

# Example of a final request URL (the access key is shown as a placeholder;
# real keys do not belong in source comments):
# http://api.map.baidu.com/place/v2/suggestion?query=瀛嘉天下&region=重庆市&city_limit=true&output=json&ak=<YOUR_AK>
# Template of the request URL; R-QUERY, R-CITY and R-AK are replaced per
# request in fun_().
base_url = (
    'http://api.map.baidu.com/place/v2/suggestion'
    '?query=R-QUERY&region=R-CITY&city_limit=true&output=json&ak=R-AK'
)


def fun_(city):
    """Fetch and store the place suggestions for every pending request of *city*.

    For each pending request name the response text is written with
    ``write_requested_res`` and the usage counter of the key that was used
    is incremented. If anything in that step raises, a marker file is
    written with ``write_requested_exception_res`` instead and processing
    continues with the next name. Work on the city ends as soon as no
    usable key is left.

    Args:
        city: key into ``request_dic``; also sent as the ``region`` value.
    """
    # A city with nothing pending has no entry in request_dic; treat that
    # as "no work" instead of raising KeyError inside the worker thread.
    for district, request_names in request_dic.get(city, {}).items():
        for request_name in request_names:
            request_name_chk = '%s%s%s' % (city, district, request_name)
            chk_if_requested_file()
            if request_name_chk in requested_file_list:
                continue
            ak = db_get_one_effective()
            if ak == DB_KEY_EXHAUST:
                print(DB_KEY_EXHAUST)
                # Leave the function, not just the inner loop: with ``break``
                # every remaining district queried the database again and
                # printed the same message.
                return
            # Percent-encode the values so that characters such as '&', '#'
            # or '?' inside a name cannot alter the query string.
            url_ = (base_url
                    .replace('R-QUERY', quote(str(request_name), safe=''))
                    .replace('R-CITY', quote(str(city), safe=''))
                    .replace('R-AK', ak))
            try:
                # Without a timeout an unresponsive server would block this
                # thread indefinitely.
                bd_res_json_str = requests.get(url_, timeout=30).text
                db_update_one_today_used(ak)
                write_requested_res(request_name_chk, bd_res_json_str)
            except Exception:
                # Best effort: any failure in the step above (request,
                # counter update or file write) is recorded under the same
                # marker text and the loop moves on.
                bd_res_json_str = '请求百度-异常'
                write_requested_exception_res(request_name_chk, bd_res_json_str)
                print(bd_res_json_str)


class MyThread(threading.Thread):
    """Thread that runs ``func()`` or ``func(args)``.

    The file used to define this class twice; the second definition
    replaced the first, so the zero-argument form was unusable. Both call
    forms are supported by this single class.

    Args:
        func: callable executed in the new thread.
        args: optional single value passed to *func* as its only positional
            argument (it is not unpacked). When omitted, *func* is called
            without arguments.
    """

    # Private marker for "no argument given", so that None remains a value
    # that can be passed through to func.
    _NO_ARG = object()

    def __init__(self, func, args=_NO_ARG):
        threading.Thread.__init__(self)
        self.func, self.args = func, args

    def run(self):
        """Invoke the stored callable, with the stored argument if one was given."""
        if self.args is self._NO_ARG:
            self.func()
        else:
            self.func(self.args)


# Cities to process; main() creates one worker thread per entry.
target_city_list = [
    '广州市',
    '厦门市',
    '深圳市',
    '北京市',
    '杭州市',
    '成都市',
    '上海市',
    '西安市',
]
# Number of worker threads, one per listed city.
thread_sum = len(target_city_list)


def main():
    """Run ``fun_`` for each target city in its own thread and wait for all of them.

    The first ``thread_sum`` entries of ``target_city_list`` each get one
    ``MyThread``; all threads are started and then joined.
    """
    threads_list = [MyThread(fun_, city)
                    for city in target_city_list[:thread_sum]]
    for t in threads_list:
        # Set the flag itself: assigning to ``t.setDaemon`` only replaced
        # the method with False and never configured the thread.
        t.daemon = False
        t.start()
    for t in threads_list:
        t.join()


# Start the worker threads only when this file is executed as a script.
if __name__ == '__main__':
    main()

  

posted @ 2017-08-29 17:53  papering  阅读(372)  评论(0)  编辑  收藏  举报