python多线程执行队列中的下载任务

一般先把下载任务放到待处理队列中，也可以定时向待处理队列填充任务；

 一个线程判断任务完成情况,完成后处理下载结果;

  开多个工作线程从待处理队列中获取下载任务，下载成功后放到已完成队列中，失败则放回待处理队列。

# _*_ coding: utf-8 _*_
import requests
from threading import Thread
from queue import Queue
import time, json, math
import sys, os
import pymysql

# Pending page numbers waiting to be downloaded (holds at most 30000 entries).
req_queue  = Queue(maxsize=30000)
# Page numbers whose download has completed successfully.
res_queue  = Queue()

class pagest:
    """Shared job settings and state, accessed directly as class attributes."""

    # Credentials posted to the login form.
    username = 'test'
    password = '123'

    # Date range sent with every log request.
    date_start = '2019-05-01'
    date_end = '2019-05-05'

    # Total number of pages to download; replaced by login().
    max_page = 1

    # Cookies attached to log requests; replaced by login().
    cookies = {}

class maintd(Thread):
    """Monitor thread that merges the per-page files once all pages are done."""

    def __init__(self):
        super().__init__()

    def run(self):
        """Poll the result queue every 10 seconds, then call bigfile().

        The loop ends when the number of entries in ``res_queue`` equals
        ``pagest.max_page``.
        """
        while res_queue.qsize() != pagest.max_page:
            time.sleep(10)
        bigfile()


class worker(Thread):
    """Worker thread that downloads pages taken from the pending queue."""

    def __init__(self):
        super().__init__()

    def run(self):
        """Process pages until the pending queue is empty.

        Each page number is passed to ``parser``. On success the page is
        put on ``res_queue``; on failure it is put back on ``req_queue``
        and the worker pauses 5 seconds before taking the next task.
        """
        # Imported here because the file only imports Queue from this module.
        from queue import Empty

        while True:
            # A separate empty() check followed by a blocking get() is racy:
            # another worker can take the last item in between, leaving this
            # one blocked until the timeout and then dying on an uncaught
            # Empty. A single non-blocking get makes the check atomic.
            try:
                page = req_queue.get_nowait()
            except Empty:
                break

            if parser(page):
                res_queue.put(page)
            else:
                req_queue.put(page)
                time.sleep(5)


def parser(page):
    """Download one log page and save the raw response to ``dt/<page>.txt``.

    Args:
        page: Page number to request.

    Returns:
        True if the response was valid JSON and was written to disk,
        False if any step raised (the error is printed).
    """
    url = 'http://a.b.c/log.html?page=%s&date_start=%s&date_end=%s' % (
        page, pagest.date_start, pagest.date_end)
    try:
        # Without a timeout a stalled connection would block this worker,
        # and therefore the whole job, forever.
        res = requests.get(url, cookies=pagest.cookies, timeout=30)
        # Parsed only as validation: a non-JSON body raises, so the caller
        # sees False instead of a bad page being saved.
        json.loads(res.text)
        # The context manager closes the file even if the write fails.
        with open("dt/%s.txt" % page, "w") as f:
            f.write(res.text + '\n')
    except Exception as e:
        print(e)
        return False
    return True


def login():
    """Log in and store the session cookie and page count on ``pagest``.

    Steps: fetch the login page to obtain a ``PHPSESSID`` cookie, post the
    credentials with that cookie, save the cookie in ``pagest.cookies``,
    then request log page 1 and derive ``pagest.max_page`` from the
    ``count`` field of its JSON body.
    """
    login_url = 'http://a.b.c/login.html'
    ua_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36'
    }

    # The first GET is made only to receive the session id cookie.
    landing = requests.get(login_url, headers=ua_headers)
    sessid = landing.cookies.get('PHPSESSID')

    credentials = {'username': pagest.username, 'password': pagest.password}
    requests.post(
        login_url,
        headers=ua_headers,
        cookies={'PHPSESSID': sessid},
        data=credentials,
    )
    pagest.cookies = {'PHPSESSID': sessid}

    first_page_url = 'http://a.b.c/log.html?page=1&date_start=%s&date_end=%s' % (
        pagest.date_start, pagest.date_end)
    listing = requests.get(first_page_url, cookies=pagest.cookies)
    total = int(listing.json().get('count'))
    # NOTE(review): 100 is presumably the server's page size -- confirm.
    pagest.max_page = math.ceil(total / 100)

def init():
    """Create the ``dt`` directory under ``sys.path[0]`` and clear its files.

    ``sys.path[0]`` is the script's directory when the file is run as a
    script. Files left in ``dt`` from an earlier run are deleted;
    sub-directories are left untouched.
    """
    # os.path.join replaces the literal '\dt', which contained an invalid
    # escape sequence and only formed a sub-directory path on Windows.
    path = os.path.join(os.path.realpath(sys.path[0]), 'dt')
    os.makedirs(path, exist_ok=True)
    for name in os.listdir(path):
        fp = os.path.join(path, name)
        # os.remove cannot delete directories, so only files are removed.
        if os.path.isfile(fp):
            os.remove(fp)

def bigfile():
    """Append ``dt/1.txt`` .. ``dt/<max_page>.txt`` to ``dt/all.txt`` in order.

    The output is opened in append mode, so existing content of
    ``dt/all.txt`` is kept.

    Raises:
        OSError: If the output file or any per-page file cannot be opened.
    """
    # Context managers close both handles even if a read or write fails.
    with open("dt/all.txt", "a") as merged:
        for page in range(1, pagest.max_page + 1):
            with open("dt/%s.txt" % page, "r") as part:
                merged.write(part.read())

def main():
    """Run the whole job: prepare, log in, download all pages, merge.

    The monitor thread is started first, then page numbers
    1..``pagest.max_page`` are queued, and 30 worker threads are started
    and joined before the monitor is joined.
    """
    init()
    login()

    monitor = maintd()
    monitor.start()

    for page_no in range(1, pagest.max_page + 1):
        req_queue.put(page_no)

    pool = [worker() for _ in range(30)]
    for th in pool:
        th.start()
    for th in pool:
        th.join()

    monitor.join()

# Start the download job only when this file is executed as a script.
if __name__ == '__main__':
    main()

                            

 

posted @ 2019-05-05 20:10  1553  阅读(358)  评论(0)  编辑  收藏  举报