Multithreading
from threading import Thread, current_thread
import time

def task(n):
    print("%s is running" % current_thread().name)
    time.sleep(n)
    print("%s is end" % current_thread().name)

if __name__ == '__main__':
    t1 = Thread(target=task, args=(3,))
    t2 = Thread(target=task, args=(5,))
    t3 = Thread(target=task, args=(100,))
    t3.daemon = True
    t1.start()
    t2.start()
    t3.start()
    print("主")
Multithreading example
import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin
import os

def get_html():
    for i in range(1, 4):
        url = f'https://www.woyaogexing.com/tupian/index_{i}.html'
        headers = {
            "Referer": 'https://www.baidu.com/link?',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        res = response.text
        tree = etree.HTML(res)
        result = tree.xpath('//div[@class="pMain pMain_1"]//div/a//img/@src')
        # the src values are relative / protocol-relative, so complete them with urljoin
        url_list = [urljoin(url, src) for src in result]
        # hand this page's images to a pool of 10 download threads
        with ThreadPoolExecutor(max_workers=10) as t:
            for img_url in url_list:
                t.submit(download_img, img_url)

def download_img(url):
    headers = {
        "Referer": 'https://www.baidu.com/link?',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    file_name = url.split('/')[-1]
    directory = 'tutu'
    os.makedirs(directory, exist_ok=True)  # exist_ok avoids a race between worker threads
    with open(f'{directory}/{file_name}', 'wb') as f:
        f.write(response.content)
    print('一张图片下载完成')

if __name__ == '__main__':
    get_html()
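submit() schedules each download one at a time; when a single function is applied to a whole list, executor.map is a slightly more compact alternative. A sketch of just the submission step, assuming the url_list and download_img above:

with ThreadPoolExecutor(max_workers=10) as t:
    # map applies download_img to every url; worker exceptions surface when the results are iterated
    list(t.map(download_img, url_list))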
Thread pool
import requests
from concurrent.futures import ThreadPoolExecutor
from lxml import etree
import os
import time

def if_xpath(y):
    # xpath() returns a list of text nodes; join them and strip whitespace
    s = "".join(y)
    return s.strip()

def get_year(year):
    url = f"http://www.boxofficecn.com/boxoffice{year}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36"
    }
    resp = requests.get(url, headers=headers)
    resp_text = resp.text
    x_path = etree.HTML(resp_text)
    e_path = x_path.xpath("//table/tbody/tr")[1:]  # skip the table header row
    os.makedirs("nf", exist_ok=True)
    with open(f"nf/{year}.csv", "w", encoding="utf-8") as f:
        for item in e_path:
            num = if_xpath(item.xpath("./td[1]//text()"))
            year_text = if_xpath(item.xpath("./td[2]//text()"))
            name = if_xpath(item.xpath("./td[3]//text()"))
            money = if_xpath(item.xpath("./td[4]//text()"))
            f.write(f"{num},{year_text},{name},{money}\n")

if __name__ == '__main__':
    start = time.time()
    with ThreadPoolExecutor(16) as t:
        for i in range(1994, 2023):
            t.submit(get_year, i)
    stop = time.time()
    print(stop - start)
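One caveat with submit(): it returns a Future, and an exception raised inside get_year is stored on that Future and silently dropped unless .result() is called. A sketch of the same main block that surfaces failures per year, assuming the get_year above:

from concurrent.futures import ThreadPoolExecutor, as_completed

if __name__ == '__main__':
    start = time.time()
    with ThreadPoolExecutor(16) as t:
        futures = {t.submit(get_year, i): i for i in range(1994, 2023)}
        for fut in as_completed(futures):
            try:
                fut.result()  # re-raises anything that went wrong inside get_year
            except Exception as e:
                print(futures[fut], "failed:", e)
    print(time.time() - start)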
Thread pool, object-oriented example
import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin
import os

class ImgDownloader:
    def __init__(self, baseurl, num_pages=1):
        self.url = baseurl
        self.num_pages = num_pages
        self.headers = {
            "Referer": 'https://www.baidu.com/link?',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
        }

    def get_html(self, url):
        response = requests.get(url, headers=self.headers)
        response.raise_for_status()
        response.encoding = 'utf-8'
        return response.text

    def parse_img_urls(self, html):
        etree_html = etree.HTML(html)
        img_urls = etree_html.xpath('//div[@class="pMain pMain_1"]//div/a//img/@src')
        return [urljoin(self.url, img_url) for img_url in img_urls]

    def download_img(self, img_url):
        file_name = os.path.basename(img_url)
        directory = 'tutu'
        os.makedirs(directory, exist_ok=True)  # exist_ok avoids a race between worker threads
        response = requests.get(img_url, headers=self.headers)
        response.raise_for_status()
        with open(os.path.join(directory, file_name), 'wb') as f:
            f.write(response.content)
        print(f'下载完成: {file_name}')

    def download_all_imgs(self):
        # pagination starts at index_2.html; the first page uses a different filename
        for page in range(2, self.num_pages + 1):
            page_url = f"{self.url}/index_{page}.html"
            print(f"正在下载第 {page} 页的图片...")
            html = self.get_html(page_url)
            img_urls = self.parse_img_urls(html)
            with ThreadPoolExecutor(max_workers=10) as executor:
                for img_url in img_urls:
                    executor.submit(self.download_img, img_url)

if __name__ == '__main__':
    downloader = ImgDownloader(baseurl='https://www.woyaogexing.com/tupian', num_pages=10)
    downloader.download_all_imgs()
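When many pool threads hit the same site, it is worth giving every request a timeout so one stalled connection cannot occupy a pool slot forever; requests.get accepts a timeout in seconds. A sketch of the fetch method with that added (timeout=10 is an assumed value, not from the original):

    def get_html(self, url):
        response = requests.get(url, headers=self.headers, timeout=10)  # assumed 10 s limit
        response.raise_for_status()
        response.encoding = 'utf-8'
        return response.text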
Multiprocessing
import requests
from concurrent.futures import ThreadPoolExecutor
from lxml import etree
from multiprocessing import Queue
from multiprocessing import Process
import os
import time

def get_img_url(q):
    # producer process: scrape the list and detail pages, push every image url into the queue
    for item in range(1, 3):
        url = f"https://www.pkdoutu.com/article/list/?page={item}"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36"
        }
        resp = requests.get(url, headers=headers)
        resp_text = resp.text
        x_path = etree.HTML(resp_text)
        img_urls = x_path.xpath("//div[@class='col-sm-9 center-wrap']/a/@href")
        for img_url in img_urls:
            resps = requests.get(img_url, headers=headers)
            resps_text = resps.text
            x_paths = etree.HTML(resps_text)
            img_urlss = x_paths.xpath("//li[@class='list-group-item']//a/img/@src")
            for imgs in img_urlss:
                print(imgs)
                q.put(imgs)
    q.put("滚蛋吧.没了")  # sentinel value tells the consumer there is no more work

def img_process(q):
    # consumer process: pull urls from the queue and hand them to a thread pool
    with ThreadPoolExecutor(10) as t:
        while 1:
            imgs = q.get()
            if imgs == '滚蛋吧.没了':
                break
            t.submit(download_img, imgs)

def download_img(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36"
    }
    resp = requests.get(url, headers=headers)
    file_name = url.split("/")[-1]
    os.makedirs("tu", exist_ok=True)
    with open(f"tu/{file_name}", "wb") as w:
        w.write(resp.content)

if __name__ == '__main__':
    s1 = time.time()
    q = Queue()
    p1 = Process(target=get_img_url, args=(q,))
    p2 = Process(target=img_process, args=(q,))
    p1.start()
    p2.start()
    p1.join()
    p2.join()
    s2 = time.time()
    print(s2 - s1)
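The producer pushes a single sentinel string so the one consumer process knows when to stop; a sentinel is consumed by whichever process get()s it, so running more consumers requires one sentinel per consumer. A sketch of that pattern, assuming the same get_img_url and img_process, with NUM_CONSUMERS used purely for illustration:

if __name__ == '__main__':
    NUM_CONSUMERS = 2  # assumed count, just for the sketch
    q = Queue()
    producer = Process(target=get_img_url, args=(q,))
    consumers = [Process(target=img_process, args=(q,)) for _ in range(NUM_CONSUMERS)]
    producer.start()
    for c in consumers:
        c.start()
    producer.join()
    for _ in range(NUM_CONSUMERS - 1):
        q.put("滚蛋吧.没了")  # get_img_url already put one sentinel; add one per extra consumer
    for c in consumers:
        c.join()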
Coroutines
Novel download example
import requests
from lxml import etree
import asyncio
import aiohttp
import aiofiles
import os

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36"
}

def get_chaptor_info(url):
    # synchronous: fetch the table of contents and collect (volume, chapter, url) entries
    resp = requests.get(url, headers=headers)
    resp.encoding = "UTF-8"
    page_source = resp.text
    tree = etree.HTML(page_source)
    result = []
    divs = tree.xpath("//div[@class='mulu']")
    for div in divs:
        trs = div.xpath(".//table/tr")
        juan_name = trs[0].xpath(".//a/text()")
        juan_name = "".join(juan_name).strip().replace(":", "_")
        for tr in trs[1:]:
            tds = tr.xpath("./td")
            for td in tds:
                txt = td.xpath(".//text()")
                href = td.xpath(".//@href")
                txt = "".join(txt).replace(" ", "").strip()
                href = "".join(href)
                dic = {
                    "chapter_name": txt,
                    "chapter_url": href,
                    "juan_name": juan_name
                }
                result.append(dic)
    return result

async def download_one(url, file_path):
    print("我要下載文章了")
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers) as resp:
            page_source = await resp.text(encoding="utf-8")
            tree = etree.HTML(page_source)
            content = tree.xpath("//div[@class='content']//p//text()")
            content = "".join(content).replace("\n", "").replace("\r", "").replace(" ", "").strip()
            async with aiofiles.open(file_path, mode="w", encoding="utf-8") as f:
                await f.write(content)
    print("恭喜你。 下載了一篇文章!", file_path)

async def download_chapter(chaptor_list):
    tasks = []
    for chaptor in chaptor_list:
        juan = chaptor['juan_name']
        name = chaptor['chapter_name']
        url = chaptor['chapter_url']
        if not os.path.exists(juan):
            os.makedirs(juan)
        file_path = f"{juan}/{name}.txt"
        t = asyncio.create_task(download_one(url, file_path))
        tasks.append(t)
        # break  # uncomment to test with a single chapter only
    await asyncio.wait(tasks)

def main():
    url = "https://www.mingchaonaxieshier.com/"
    chaptor_list = get_chaptor_info(url)
    asyncio.run(download_chapter(chaptor_list))

if __name__ == '__main__':
    main()
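Creating one task per chapter fires every request at once. The same throttling trick the video example further below uses, an asyncio.Semaphore, can cap the concurrency here too; a sketch assuming the download_one above, with the semaphore created inside the running loop:

async def download_chapter(chaptor_list):
    sem = asyncio.Semaphore(20)  # assumed limit of 20 concurrent chapter downloads

    async def limited(url, file_path):
        async with sem:  # at most 20 download_one calls run at the same time
            await download_one(url, file_path)

    tasks = []
    for chaptor in chaptor_list:
        juan = chaptor['juan_name']
        os.makedirs(juan, exist_ok=True)
        file_path = f"{juan}/{chaptor['chapter_name']}.txt"
        tasks.append(asyncio.create_task(limited(chaptor['chapter_url'], file_path)))
    await asyncio.wait(tasks)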
Image download example
"""
# 整体步骤 => 秀人集
1.拿到首页html页面的url和目录名称
2.拿到分页列表
3.在拿到分页下的图片url和文件名称
4.追加到一个空字典里 返回函数的值
5.异步中循环取到字典的key 判断取到字典目录的名字是否存在 不存在创建
6.创建单个异步下载任务
7.提交任务执行
"""
import aiohttp
import requests
from lxml import etree
from urllib.parse import urljoin
import os
import aiofiles
import asyncio
import random
import time

ua_list = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) Gecko/20100101 Firefox/61.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15"
]
header = {
    "User-Agent": random.choice(ua_list)  # pick a random UA to look less like a bot
}
url = "https://www.xiurenb.com/"

def img_url(url):
    # synchronous: collect every album url and album name from the home page
    result = []
    resp = requests.get(url, headers=header)
    resp.encoding = 'utf-8'
    page_source = etree.HTML(resp.text)
    home_url = page_source.xpath("//ul/li[@class='i_list list_n2']/a")
    for u in home_url:
        href = u.xpath("./@href")[0]
        titles = u.xpath("./@title")[0]
        new_url = urljoin(url, href)
        folder_names = titles.replace("\r", "").replace("\n", "").replace(" ", "").split("]")[-1]
        dic = {
            "dir_url": new_url,
            "dir_name": folder_names,
        }
        result.append(dic)
    return result

async def download_one(urls, name):
    print("开始下载图片")
    for i in range(5):  # retry the whole album up to 5 times on errors
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(urls, headers=header) as resp:
                    page_source = await resp.text(encoding="utf-8")
                    num_list = etree.HTML(page_source)
                    # pagination links of the album, dropping the first/last navigation entries
                    num_list_url = num_list.xpath("//div[@class='main_inner']//div[@class='page'][1]/a")[1:-1]
                    for nums in num_list_url:
                        href = nums.xpath("./@href")
                        if href:
                            new_href = urljoin(url, href[0])
                            async with aiohttp.ClientSession() as sessions:
                                async with sessions.get(new_href, headers=header) as resps:
                                    jpg_source = await resps.text(encoding='utf-8')
                                    subpage_page = etree.HTML(jpg_source)
                                    subpage_url = subpage_page.xpath("//div[@class='main_left']//p/img")
                                    for img_list in subpage_url:
                                        src_url = img_list.xpath("./@src")[0]
                                        new_src = urljoin(url, src_url)
                                        file_name = new_src.split("/")[-1]
                                        async with aiohttp.ClientSession() as session_jpg:
                                            async with session_jpg.get(new_src, headers=header) as resp_list:
                                                jpg_bytes = await resp_list.content.read()
                                                async with aiofiles.open(f"{name}/{file_name}", mode="wb") as f:
                                                    await f.write(jpg_bytes)
                                                print("下载图片完成", new_src)
            break  # the album finished without an exception, stop retrying
        except Exception as e:
            print("请求超时错误", e)

async def task(char_set):
    tasks = []
    for i in char_set:
        name = i['dir_name']
        urls = i['dir_url']
        if not os.path.exists(name):
            os.makedirs(name)
        t = asyncio.create_task(download_one(urls, name))
        tasks.append(t)
    await asyncio.wait(tasks)

def main():
    char_set = img_url(url)
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(task(char_set))

if __name__ == '__main__':
    start = time.time()
    main()
    stop = time.time()
    print(stop - start)
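The download_one above opens a new aiohttp.ClientSession for the album page, for every pagination page, and again for every image; it works, but it rebuilds a connection pool each time, while aiohttp's recommended pattern is to create one session and reuse it for many requests. A small, self-contained sketch of that reuse pattern (fetch_bytes and demo are illustrative names, not functions from the original):

import asyncio
import aiohttp

async def fetch_bytes(session, url, header):
    # every request shares the one session's connection pool
    async with session.get(url, headers=header) as resp:
        return await resp.read()

async def demo(urls, header):
    async with aiohttp.ClientSession() as session:  # one session for all requests
        tasks = [asyncio.create_task(fetch_bytes(session, u, header)) for u in urls]
        return await asyncio.gather(*tasks)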
Video download example
"""
# 整体步骤 => 网吧电影
1. 想办法找到M3U8文件
2. 判别(人工)是否需要下载第二层M3U8
3. 提取ts文件的下载路径
4. 下载
5. 判别是否需要解密
6. 如果需要解密, 拿到秘钥
7. 解密
8. 根据M3U8的正确顺序来合并所有的ts文件 => MP4
"""
import requests
from lxml import etree
import re
from urllib.parse import urljoin
import os
import asyncio
import aiohttp
import aiofiles
from Crypto.Cipher import AES

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36"
}

def get_iframe_src(url):
    # the play page embeds the real player page in an iframe
    resp = requests.get(url, headers=headers)
    tree = etree.HTML(resp.text)
    src = tree.xpath("//iframe/@src")[0]
    return src

def get_m3u8_url(url):
    # the first-level m3u8 address sits inside a javascript block: url: "..."
    resp = requests.get(url, headers=headers)
    obj = re.compile(r'url: "(?P<m3u8>.*?)"', re.S)
    m3u8 = obj.search(resp.text).group("m3u8")
    return m3u8

def download_m3u8(url):
    # the first-level m3u8 only points at the second-level one that lists the ts segments
    resp = requests.get(url, headers=headers)
    with open("first.m3u8", mode="w", encoding="utf-8") as f:
        f.write(resp.text)
    with open("first.m3u8", mode='r', encoding="utf-8") as f2:
        for line in f2:
            if line.startswith("#"):
                continue
            line = line.strip()
            line = urljoin(url, line)
            resp = requests.get(line, headers=headers)
            with open("second.m3u8", mode="w", encoding="utf-8") as f3:
                f3.write(resp.text)
            break

async def download_one(url, sem):
    async with sem:  # the semaphore caps the number of simultaneous downloads
        file_name = url.split("/")[-1]
        file_path = "./解密前/" + file_name
        print(file_name, "开始工作了!")
        for i in range(10):  # retry up to 10 times
            try:
                async with aiohttp.ClientSession() as session:
                    async with session.get(url, headers=headers) as resp:
                        content = await resp.content.read()
                        async with aiofiles.open(file_path, mode="wb") as f:
                            await f.write(content)
                print(file_name, "下载完成!")
                break
            except Exception as e:
                print(file_name, "出错了, 马上重试", e)

async def download_all_videos():
    sem = asyncio.Semaphore(100)
    tasks = []
    with open("second.m3u8", mode="r", encoding="utf-8") as f:
        for line in f:
            if line.startswith("#"):
                continue
            line = line.strip()
            t = asyncio.create_task(download_one(line, sem))
            tasks.append(t)
    await asyncio.wait(tasks)

def get_key():
    # the key url is declared in the #EXT-X-KEY line of the second-level m3u8
    with open("second.m3u8", mode="r", encoding="utf-8") as f:
        file_content = f.read()
    obj = re.compile(r'URI="(?P<key_url>.*?)"')
    key_url = obj.search(file_content).group("key_url")
    resp = requests.get(key_url, headers=headers)
    return resp.content

async def desc_one(file_path, key):
    file_name = file_path.split("/")[-1]
    new_file_path = "./解密后/" + file_name
    async with aiofiles.open(file_path, mode="rb") as f1, \
            aiofiles.open(new_file_path, mode="wb") as f2:
        content = await f1.read()
        aes = AES.new(key=key, mode=AES.MODE_CBC, IV=b"0000000000000000")
        new_content = aes.decrypt(content)
        await f2.write(new_content)
    print(new_file_path, "解密成功")

async def desc_all(key):
    tasks = []
    with open("second.m3u8", mode="r", encoding="utf-8") as f:
        for line in f:
            if line.startswith("#"):
                continue
            line = line.strip()
            file_name = line.split("/")[-1]
            file_path = "./解密前/" + file_name
            t = asyncio.create_task(desc_one(file_path, key))
            tasks.append(t)
    await asyncio.wait(tasks)

def merge():
    # copy /b chokes on very long command lines, so first merge in groups of ~20 segments
    file_list = []
    with open("second.m3u8", mode="r", encoding="utf-8") as f:
        for line in f:
            if line.startswith("#"):
                continue
            line = line.strip()
            file_name = line.split("/")[-1]
            file_list.append(file_name)
    os.chdir("./解密后")
    n = 1
    temp = []
    for i in range(len(file_list)):
        file_name = file_list[i]
        temp.append(file_name)
        if i != 0 and i % 20 == 0:
            cmd = f"copy /b {'+'.join(temp)} {n}.ts"
            r = os.popen(cmd)
            print(r.read())
            temp = []
            n = n + 1
    # merge whatever is left over
    cmd = f"copy /b {'+'.join(temp)} {n}.ts"
    r = os.popen(cmd)
    print(r.read())
    n = n + 1
    # finally merge the intermediate chunks into the mp4, in order
    last_temp = []
    for i in range(1, n):
        last_temp.append(f"{i}.ts")
    cmd = f"copy /b {'+'.join(last_temp)} 春夏秋冬又一春.mp4"
    r = os.popen(cmd)
    print(r.read())
    os.chdir("../")

def main():
    os.makedirs("解密前", exist_ok=True)
    os.makedirs("解密后", exist_ok=True)
    url = "http://www.wbdy.tv/play/63690_1_1.html"
    src = get_iframe_src(url)
    print(src)
    src = urljoin(url, src)
    m3u8_url = get_m3u8_url(src)
    print(m3u8_url)
    download_m3u8(m3u8_url)
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(download_all_videos())
    key = get_key()
    event_loop.run_until_complete(desc_all(key))
    print("全部完成")
    merge()

if __name__ == '__main__':
    main()
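The decryption step hardcodes a 16-byte IV of ASCII zeros. Many HLS playlists publish the IV explicitly in the #EXT-X-KEY line as IV=0x followed by 32 hex characters; a hedged sketch of reading it from second.m3u8 when present and falling back to the hardcoded value otherwise (get_iv is an illustrative helper, not part of the original):

import re

def get_iv(default=b"0000000000000000"):
    with open("second.m3u8", mode="r", encoding="utf-8") as f:
        content = f.read()
    m = re.search(r'IV=0x(?P<iv>[0-9a-fA-F]{32})', content)
    if m:
        return bytes.fromhex(m.group("iv"))  # 16-byte IV declared by the playlist
    return default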