Executing download tasks from a queue with multiple Python threads
Typically, the download tasks are placed into a pending queue up front; tasks can also be added to the pending queue on a schedule (see the filler sketch after this list);
One monitor thread checks whether all tasks are finished, and processes the download results once they are;
Multiple worker threads pull download tasks from the queue; finished tasks go into a done queue, and failed tasks are put back into the pending queue.
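The full script below fills the pending queue once at startup. If tasks should instead be topped up on a schedule, a dedicated filler thread is one way to do it. A minimal sketch, assuming a hypothetical fetch_new_tasks() helper (not part of the original script) and an arbitrary 60-second interval:

import time
from queue import Queue, Full
from threading import Thread

req_queue = Queue(maxsize=30000)

def fetch_new_tasks():
    # Hypothetical helper, not in the original script:
    # return a list of new task ids to be downloaded.
    return []

class filler(Thread):
    """Periodically tops up the pending queue with fresh tasks."""
    def run(self):
        while True:
            for task in fetch_new_tasks():
                try:
                    req_queue.put(task, block=False)
                except Full:
                    break  # queue is full; retry on the next round
            time.sleep(60)  # refill interval (assumed value)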
# _*_ coding: utf-8 _*_
import requests
from threading import Thread
from queue import Queue, Empty
import time, json, math
import sys, os

req_queue = Queue(maxsize=30000)   # pending download tasks (page numbers)
res_queue = Queue()                # finished tasks


class pagest():
    """Shared state: login credentials, date range, session cookies, page count."""
    max_page = 1
    username = 'test'
    password = '123'
    date_start = '2019-05-01'
    date_end = '2019-05-05'
    cookies = {}


class maintd(Thread):
    """Monitor thread: polls until every page is done, then merges the results."""
    def run(self):
        while True:
            if res_queue.qsize() == pagest.max_page:
                bigfile()
                break
            time.sleep(10)


class worker(Thread):
    """Worker thread: pull a page from the pending queue, download it,
    move it to the done queue on success or put it back on failure."""
    def run(self):
        while True:
            try:
                page = req_queue.get(block=True, timeout=30)
            except Empty:
                break  # no more pending tasks
            if parser(page):
                res_queue.put(page)
            else:
                req_queue.put(page)  # failed: requeue for another attempt
            time.sleep(5)


def parser(page):
    """Download one page of log data and save it to dt/<page>.txt."""
    try:
        url = 'http://a.b.c/log.html?page=%s&date_start=%s&date_end=%s' % (
            page, pagest.date_start, pagest.date_end)
        res = requests.get(url, cookies=pagest.cookies)
        user = json.loads(res.text)
        with open("dt/%s.txt" % page, "w") as f:
            f.write(res.text + '\n')
        print(user.get('phone'))
    except Exception as e:
        print(e)
        return False
    return True


def login():
    """Log in, keep the session cookie, and work out how many pages there are."""
    url = 'http://a.b.c/login.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36'
    }
    res = requests.get(url, headers=headers)
    sessid = res.cookies.get('PHPSESSID')
    param = {'username': pagest.username, 'password': pagest.password}
    requests.post(url, headers=headers, cookies={'PHPSESSID': sessid}, data=param)
    pagest.cookies = {'PHPSESSID': sessid}
    # One probe request to read the record count; 100 records per page.
    url = 'http://a.b.c/log.html?page=1&date_start=%s&date_end=%s' % (
        pagest.date_start, pagest.date_end)
    res = requests.get(url, cookies=pagest.cookies)
    cnt = res.json().get('count')
    pagest.max_page = math.ceil(int(cnt) / 100)


def init():
    """Create the dt/ output directory and empty it."""
    path = os.path.join(os.path.realpath(sys.path[0]), 'dt')
    if not os.path.exists(path):
        os.mkdir(path)
    for name in os.listdir(path):
        os.remove(os.path.join(path, name))


def bigfile():
    """Merge the per-page files into dt/all.txt."""
    with open("dt/all.txt", "a") as bf:
        for i in range(1, pagest.max_page + 1):
            with open("dt/%s.txt" % i, "r") as f:
                bf.write(f.read())


def main():
    init()
    login()
    td = maintd()
    td.start()
    # Fill the pending queue before the workers start, so an empty
    # queue reliably means "all tasks handed out".
    for i in range(pagest.max_page):
        req_queue.put(i + 1)
    worker_list = [worker() for _ in range(30)]
    for li in worker_list:
        li.start()
    for li in worker_list:
        li.join()
    td.join()


if __name__ == '__main__':
    main()
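A note on the completion check: maintd polls res_queue.qsize() every 10 seconds, which is simple but busy-waits and never ends if one page keeps failing. The standard library also offers Queue.task_done() and Queue.join(), which block until every fetched item has been marked done. A minimal sketch of that pattern, with a hypothetical handle_task() standing in for the real download logic:

from queue import Queue
from threading import Thread

q = Queue()

def handle_task(task):
    # Hypothetical stand-in for the real download/parse step.
    print("done:", task)

def work():
    while True:
        task = q.get()      # blocks until a task is available
        try:
            handle_task(task)
        finally:
            q.task_done()   # mark the fetched item as processed

for page in range(1, 11):
    q.put(page)
for _ in range(4):
    Thread(target=work, daemon=True).start()
q.join()  # returns once every queued item has been task_done()

Because put() increments the queue's unfinished-task counter, requeueing a failed task before calling task_done() keeps q.join() accurate, so this pattern also supports the retry behaviour of the worker class above.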