import os
import random
from concurrent.futures import ThreadPoolExecutor, Future, as_completed, wait
from multiprocessing import cpu_count
from urllib.parse import urljoin

import requests
import wget
from bs4 import BeautifulSoup as bs
from tqdm import tqdm
# Pool of browser User-Agent strings (picked arbitrarily); one entry is
# chosen at random for each page request.
headers = [
    # Chrome 46, Android
    (
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/46.0.2490.76 Mobile Safari/537.36'
    ),
    # Safari 5.1, Windows
    (
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) '
        'AppleWebKit/534.50 (KHTML, like Gecko) '
        'Version/5.1 Safari/534.50'
    ),
    # MSIE 8, Windows
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
    # Firefox 4.0.1, Windows
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
]
def open_url(url, timeout=30):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    The request is sent with a User-Agent picked at random from the
    module-level ``headers`` list.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it a stalled server
            would block the script indefinitely.

    Returns:
        The response body as a ``str``.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Connection failure or timeout.
        UnicodeDecodeError: The body is not valid UTF-8.
    """
    resp = requests.get(
        url,
        headers={"user-agent": random.choice(headers)},
        timeout=timeout,
    )
    # Fail loudly on an error status instead of handing an error page
    # to the HTML parser as if it were the real listing.
    resp.raise_for_status()
    return resp.content.decode("utf-8")
# Scrape the resource page and collect one record per listed file.
response = open_url('https://nadc.china-vo.org/res/r101217/')
soup = bs(response, 'html.parser')
pd_files = soup.find_all(id='pd-files')

# Each record holds the display name, the size text shown on the page,
# and the download URL of one file.
file_info_list = []
for row in pd_files:
    href = row.find(class_='col-sm-1').find('a')['href']
    file_info_list.append({
        'file_name': row.find(class_='paperinfo-files-filename').text.strip(),
        'file_size': row.find(class_='paperinfo-files-filesize').text,
        # urljoin resolves the href against the site root, so a
        # root-relative href does not produce "//" and an already
        # absolute href is kept intact (plain concatenation broke both).
        'download_link': urljoin('https://nadc.china-vo.org/', href),
    })
# Python source code
import requests
import time
def download(file_info, timeout=30, chunk_size=64 * 1024):
    """Stream one file to the ``download/`` directory with a progress bar.

    Args:
        file_info: Mapping with at least ``'download_link'`` (URL to
            fetch) and ``'file_name'`` (name used for the local file and
            as the progress-bar label).
        timeout: Seconds to wait for the server before giving up; passed
            to ``requests.get``.
        chunk_size: Number of bytes requested per ``iter_content`` chunk.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Connection failure or timeout.
        OSError: The target file could not be created or written.
    """
    # Create the output directory on first use; opening a file inside a
    # missing directory would otherwise raise FileNotFoundError.
    os.makedirs('download', exist_ok=True)
    # The name comes from scraped HTML, so keep only its final path
    # component to make sure the file lands inside ``download/``.
    target = os.path.join('download', os.path.basename(file_info['file_name']))

    # Fetch the URL in streaming mode; the context manager releases the
    # connection even if writing fails part-way through.
    with requests.get(
        file_info['download_link'], stream=True, timeout=timeout
    ) as resp:
        # Do not save an HTTP error page under the file's name.
        resp.raise_for_status()
        # Expected size for the progress bar; 0 when the server does not
        # send a Content-Length header.
        total = int(resp.headers.get('content-length', 0))
        with open(target, 'wb') as file, tqdm(
            desc=file_info['file_name'],
            total=total,
            unit='iB',
            unit_scale=True,
            unit_divisor=1024,
        ) as bar:
            for data in resp.iter_content(chunk_size=chunk_size):
                # Advance the bar by the number of bytes actually written.
                size = file.write(data)
                bar.update(size)
# Print the number of CPU cores; the message reads "<N> cores in total".
print(f'总共有:{cpu_count()} 个核心')
def test_tqdm():
    """Download every entry of ``file_info_list`` concurrently.

    One ``download`` task per file is submitted to a thread pool with
    ``cpu_count()`` workers. The function returns once all tasks have
    finished; a failed download is reported on stdout and does not stop
    the remaining ones.
    """
    # The ``with`` block shuts the pool down (joining its threads) on exit.
    with ThreadPoolExecutor(max_workers=cpu_count()) as executor:
        # Submit everything up front so the downloads overlap; map each
        # future back to its record for error reporting.
        tasks = {
            executor.submit(download, file_info): file_info
            for file_info in file_info_list
        }
        for task in as_completed(tasks):
            # Surface worker errors instead of dropping them silently.
            error = task.exception()
            if error is not None:
                print(f"download failed for {tasks[task]['file_name']}: {error!r}")
# Start the downloads when the script runs.
test_tqdm()