PDF_下载,下载进度条
2017-09-26 20:06 hdwen 阅读(368) 评论(0) 收藏 举报from __future__ import division import requests,re,os,math,sys,time from contextlib import closing from multiprocessing import Pool start_time=time.time() headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36'} #读取进度条 def progressbar(cur, total): percent = '{:.2%}'.format(cur / total) sys.stdout.write('\r') sys.stdout.write('[%-50s] %s' % ('*' * int(math.floor(cur * 50 / total)), percent)) sys.stdout.flush() if cur == total: sys.stdout.write('\n') def return_links(): for i in range(1,44): url='http://www.allitebooks.com/page/{}/?s=python'.format(i) response=requests.get(url).text links=re.findall('<a href="(.*?)" rel="bookmark">',response) for link in set(links): find_pdf_links(link) # return set(links) def find_pdf_links(url): res=requests.get(url,headers=headers).text pdf_link=re.findall('<a href="(.*?)" target="_blank"><i class="fa fa-download" aria-hidden="true">',res)[0] pdf_size=re.findall('Download PDF <span class="download-size">\((.*?)\)</span></a>',res) print('Downloading:%s' % (os.path.basename(pdf_link)), ' Size:%s' % pdf_size) start_down_time=time.time() count = 0 with closing(requests.get(pdf_link,headers=headers,stream=True)) as response: chunk_size=1024 content_size=int(response.headers['content-length']) with open(os.path.join(r'C:\Users\HDWEN\Desktop\pdf',os.path.basename(pdf_link)),'wb')as f: for data in response.iter_content(chunk_size=chunk_size): f.write(data) count = count + len(data) progressbar(count, content_size) single_time=time.time()-start_down_time print(' 下载时间:{:,.2f}秒'.format(single_time)) with open(r'C:\Users\HDWEN\Desktop\dialog.txt','a+') as f: f.write('%s [%s]'%(time.strftime('%y-%m-%d %H:%M:%S'),os.path.basename(pdf_link))+' '*10+'time:[%s]\n'%single_time) return_links() all_time=time.time()-start_time print('花费总时间:%s'%(all_time)) with open(r'C:\Users\HDWEN\Desktop\1.txt', 'a+') as f: f.write('总时间:[%s],%s' % (all_time,time.strftime('%Y-%M-%d %H:%M:%S'))) # if __name__=='__main__': # start_time=time.clock() # pool=Pool() # pool.map_async(find_pdf_links,return_links()) # pool.close() # pool.join() # print('time_wasting:',time.clock()-start_time)