代码改变世界

PDF_下载,下载进度条

2017-09-26 20:06  hdwen  阅读(363)  评论(0)  编辑  收藏  举报
from __future__ import division
import requests,re,os,math,sys,time
from contextlib import closing

from multiprocessing import Pool
# Wall-clock reference taken when the module starts executing; the total run
# time reported at the end of the script is measured from this value.
start_time = time.time()

# Browser-like User-Agent sent with the requests issued by find_pdf_links.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/61.0.3163.91 Safari/537.36'
    ),
}
# Console progress bar for downloads
def progressbar(cur, total):
    """Redraw a 50-column text progress bar on the current stdout line.

    Args:
        cur: Amount of work done so far (e.g. bytes received).
        total: Amount of work expected in total.

    A non-positive ``total`` is drawn as a complete bar instead of raising
    ``ZeroDivisionError``, and both the bar and the percentage are clamped to
    the 0-100% range so an overshooting ``cur`` cannot widen the line.
    A newline is written whenever ``cur`` equals ``total``.
    """
    width = 50
    if total > 0:
        filled = min(width, max(0, int(math.floor(cur * width / total))))
        fraction = min(1.0, max(0.0, cur / total))
    else:
        # Nothing to wait for: show the bar as finished.
        filled = width
        fraction = 1.0
    # '\r' returns to the start of the line so the bar is redrawn in place.
    sys.stdout.write('\r')
    sys.stdout.write('[%-50s] %s' % ('*' * filled, '{:.2%}'.format(fraction)))
    sys.stdout.flush()
    if cur == total:
        sys.stdout.write('\n')

def return_links(first_page=1, last_page=43):
    """Crawl the "python" search results and download every book found.

    For each result page from ``first_page`` to ``last_page`` (inclusive) the
    book detail links are extracted with a regular expression and handed to
    ``find_pdf_links`` one at a time.  Despite its name this function returns
    ``None``; it is called for its side effects.

    Args:
        first_page: Number of the first search result page to fetch.
        last_page: Number of the last search result page to fetch.
    """
    link_pattern = re.compile(r'<a href="(.*?)" rel="bookmark">')
    for page in range(first_page, last_page + 1):
        url = 'http://www.allitebooks.com/page/{}/?s=python'.format(page)
        # Send the module-level browser-like headers with the listing request
        # as well, so every request in this file identifies itself the same way.
        html = requests.get(url, headers=headers).text
        seen = set()
        for link in link_pattern.findall(html):
            # Drop repeated links but keep page order so the download order
            # is the same on every run.
            if link in seen:
                continue
            seen.add(link)
            find_pdf_links(link)
def find_pdf_links(url, download_dir=r'C:\Users\HDWEN\Desktop\pdf',
                   log_path=r'C:\Users\HDWEN\Desktop\dialog.txt'):
    """Download the PDF linked from a book detail page and log the timing.

    The page at ``url`` is fetched, the first download link is extracted with
    a regular expression, and the file is streamed to ``download_dir`` in
    1 KiB chunks while a progress bar is drawn.  One line with a timestamp,
    the file name and the elapsed seconds is appended to ``log_path``.

    Args:
        url: Address of the book detail page.
        download_dir: Directory the PDF is saved to; created if missing.
        log_path: Text file that receives one log line per download.

    Returns:
        None.  If the page contains no download link a message is printed and
        nothing is downloaded.
    """
    res = requests.get(url, headers=headers).text
    pdf_links = re.findall(
        r'<a href="(.*?)" target="_blank"><i class="fa fa-download" aria-hidden="true">',
        res)
    if not pdf_links:
        # Skip this page instead of failing with IndexError on the lookup.
        print('No PDF link found:%s' % url)
        return
    pdf_link = pdf_links[0]
    file_name = os.path.basename(pdf_link)

    # Raw string: '\(' is not a valid escape in an ordinary string literal.
    sizes = re.findall(
        r'Download PDF <span class="download-size">\((.*?)\)</span></a>', res)
    pdf_size = sizes[0] if sizes else 'unknown'
    print('Downloading:%s  Size:%s' % (file_name, pdf_size))

    if not os.path.isdir(download_dir):
        os.makedirs(download_dir)

    start_down_time = time.time()
    count = 0
    with closing(requests.get(pdf_link, headers=headers, stream=True)) as response:
        chunk_size = 1024
        # The header may be absent; 0 means the total size is unknown.
        content_size = int(response.headers.get('content-length', 0))
        with open(os.path.join(download_dir, file_name), 'wb') as f:
            for data in response.iter_content(chunk_size=chunk_size):
                if not data:
                    continue
                f.write(data)
                count += len(data)
                # Without a known total there is nothing to draw a bar against.
                if content_size > 0:
                    progressbar(count, content_size)

    single_time = time.time() - start_down_time
    print('             下载时间:{:,.2f}秒'.format(single_time))
    with open(log_path, 'a+') as f:
        f.write('%s  [%s]' % (time.strftime('%y-%m-%d %H:%M:%S'), file_name)
                + ' ' * 10 + 'time:[%s]\n' % single_time)
# Run the whole crawl, then report and record the total elapsed wall-clock time.
return_links()
all_time = time.time() - start_time
print('花费总时间:%s' % (all_time))
with open(r'C:\Users\HDWEN\Desktop\1.txt', 'a+') as f:
    # %m is the month directive (%M would put the minute into the date).
    # The trailing newline keeps each run on its own line of the appended file.
    f.write('总时间:[%s],%s\n' % (all_time, time.strftime('%Y-%m-%d %H:%M:%S')))

# if __name__=='__main__':
#     start_time=time.clock()
#     pool=Pool()
#     pool.map_async(find_pdf_links,return_links())
#     pool.close()
#     pool.join()
#     print('time_wasting:',time.clock()-start_time)