from __future__ import division
import requests,re,os,math,sys,time
from contextlib import closing
from multiprocessing import Pool
# Wall-clock start of the whole crawl; used for the total-time report at
# the bottom of the script.
start_time=time.time()
# Browser-like User-Agent so allitebooks.com serves pages and downloads
# instead of rejecting a bare python-requests client.
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36'}
# Console progress-bar renderer (original comment: 读取进度条)
def progressbar(cur, total):
    """Draw an in-place console progress bar.

    Writes ``\\r[*****...     ] 12.34%`` to stdout (carriage return keeps
    the bar on one line) and appends a newline once ``cur`` reaches
    ``total``. ``cur``/``total`` are the bytes transferred so far and the
    expected total.
    """
    percent_text = '{:.2%}'.format(cur / total)
    stars = int(math.floor(cur * 50 / total))  # bar is 50 cells wide
    sys.stdout.write('\r')
    sys.stdout.write('[%-50s] %s' % ('*' * stars, percent_text))
    sys.stdout.flush()
    if cur == total:
        sys.stdout.write('\n')
def return_links(pages=43):
    """Walk the allitebooks.com search results for 'python' and feed every
    book page URL to find_pdf_links.

    Parameters
    ----------
    pages : int, optional
        Number of result pages to crawl. Defaults to 43, the hard-coded
        count the original script used.
    """
    for page in range(1, pages + 1):
        url = 'http://www.allitebooks.com/page/{}/?s=python'.format(page)
        # Send the same browser User-Agent as the download requests, for
        # consistency with find_pdf_links (the original omitted it here).
        response = requests.get(url, headers=headers).text
        # Each search hit is an anchor with rel="bookmark" pointing at the
        # book's detail page.
        links = re.findall(r'<a href="(.*?)" rel="bookmark">', response)
        for link in set(links):  # de-duplicate (title + thumbnail anchors)
            find_pdf_links(link)
def find_pdf_links(url):
    """Download the PDF advertised on a single allitebooks.com book page.

    Scrapes the page at *url* for the download-button link, streams the
    PDF to a fixed folder while drawing a progress bar, then appends a
    per-book timing record to a log file.

    Parameters
    ----------
    url : str
        URL of the book's detail page.
    """
    res = requests.get(url, headers=headers).text
    # The download button anchor carries the direct link to the PDF.
    # (raw strings for the regexes: '\(' in a plain string is a
    # SyntaxWarning on modern Python)
    pdf_links = re.findall(r'<a href="(.*?)" target="_blank"><i class="fa fa-download" aria-hidden="true">', res)
    if not pdf_links:
        # Page layout changed or the book has no download button; skip it
        # instead of crashing the whole crawl with an IndexError.
        print('No download link found on %s' % url)
        return
    pdf_link = pdf_links[0]
    pdf_size = re.findall(r'Download PDF <span class="download-size">\((.*?)\)</span></a>', res)
    print('Downloading:%s' % (os.path.basename(pdf_link)), ' Size:%s' % pdf_size)
    start_down_time = time.time()
    count = 0
    # Stream the body in chunks; closing() guarantees the connection is
    # released even if the write loop fails.
    with closing(requests.get(pdf_link, headers=headers, stream=True)) as response:
        chunk_size = 1024
        # Some servers omit Content-Length; default to 0 rather than raise
        # a KeyError.
        content_size = int(response.headers.get('content-length', 0))
        with open(os.path.join(r'C:\Users\HDWEN\Desktop\pdf', os.path.basename(pdf_link)), 'wb') as f:
            for data in response.iter_content(chunk_size=chunk_size):
                f.write(data)
                count = count + len(data)
                # Fall back to the bytes read so far so progressbar never
                # divides by zero when the size was unknown.
                progressbar(count, content_size or count)
    single_time = time.time() - start_down_time
    print(' 下载时间:{:,.2f}秒'.format(single_time))
    # Append a "timestamp [filename]    time:[seconds]" line to the log.
    with open(r'C:\Users\HDWEN\Desktop\dialog.txt', 'a+') as f:
        f.write('%s [%s]' % (time.strftime('%y-%m-%d %H:%M:%S'), os.path.basename(pdf_link)) + ' ' * 10 + 'time:[%s]\n' % single_time)
# Kick off the crawl, then report and log how long the whole run took.
return_links()
all_time = time.time() - start_time
print('花费总时间:%s' % (all_time))
with open(r'C:\Users\HDWEN\Desktop\1.txt', 'a+') as f:
    # Bug fix: '%M' is the *minute* field; the date stamp needs '%m'
    # (month) to match the '%y-%m-%d' layout used in find_pdf_links.
    f.write('总时间:[%s],%s' % (all_time, time.strftime('%Y-%m-%d %H:%M:%S')))
# if __name__=='__main__':
# start_time=time.clock()
# pool=Pool()
# pool.map_async(find_pdf_links,return_links())
# pool.close()
# pool.join()
# print('time_wasting:',time.clock()-start_time)