Python | 《国王排名》漫画爬取并合成pdf文件
本文将下面两篇随笔的代码合并在一起:每爬取完一章的漫画图片,就立刻把这一章生成一个 PDF 文件。所合并的两篇随笔是:
Python 爬取《国王排名》漫画
Python | 图片转pdf
from reportlab.lib.pagesizes import portrait
from reportlab.pdfgen import canvas
from PIL import Image
from pyquery import PyQuery
import requests
import execjs
import glob
import re
import os
# Browser-like request headers sent when fetching the chapter index page.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/96.0.4664.45 Safari/537.36'
    ),
}

# Index page of the comic; the chapter list is scraped from it.
url = 'http://www.dm5.com/manhua-guowangpaiming/'
def getOne(url):
    """Fetch the comic's index page and return its chapter list.

    Args:
        url: Address of the comic's index page.

    Returns:
        A list of ``[chapter_title, chapter_url]`` pairs, in the reverse of
        the order in which the links appear on the page.
    """
    page_source = requests.get(url, headers=headers).content.decode('utf-8')
    document = PyQuery(page_source)
    chapter_links = document("#detail-list-select-1 li a")
    chapters = [
        [anchor.text(), 'http://www.dm5.com' + anchor.attr("href")]
        for anchor in chapter_links.items()
    ]
    # Flip the on-page order before handing the list to the downloader.
    chapters.reverse()
    return chapters
def getTwo(chapters_url):
    """Download every chapter's page images and build one PDF per chapter.

    Images are stored under ``<cwd>/国王排名/<chapter name>/<page>.jpg``. As
    soon as a chapter has been downloaded, its images are merged into
    ``国王排名 <chapter name>.pdf`` in the current working directory.

    Args:
        chapters_url: Iterable of ``(chapter_name, chapter_url)`` pairs, as
            returned by ``getOne``.

    Returns:
        None.

    Raises:
        IndexError: If a chapter page lacks one of the expected ``DM5_*``
            JavaScript variables.
        requests.RequestException: If a chapter page or the image-address
            API cannot be fetched. Failures of individual image downloads
            are reported and skipped instead.
    """
    request_timeout = 30  # seconds; keeps a stalled connection from hanging the run

    # os.path.join keeps the layout correct on every OS, not just Windows.
    pathOne = os.path.join(os.getcwd(), '国王排名')
    os.makedirs(pathOne, exist_ok=True)

    for chapter_name, chapter_url in chapters_url:
        print(f"开始下载 >> {chapter_name} << ")
        pathTwo = os.path.join(pathOne, chapter_name)
        os.makedirs(pathTwo, exist_ok=True)

        response = requests.get(chapter_url, timeout=request_timeout)
        print(chapter_url)
        text = response.text

        # The chapter page embeds the API parameters as JavaScript variables.
        cid = re.findall(r'var DM5_CID=(.*?);', text)[0].strip()
        mid = re.findall(r'var DM5_MID=(.*?);', text)[0].strip()
        dt = re.findall(r'var DM5_VIEWSIGN_DT="(.*?)";', text)[0].strip()
        sign = re.findall(r'var DM5_VIEWSIGN="(.*?)";', text)[0].strip()
        page_count = int(re.findall(r'var DM5_IMAGE_COUNT=(.*?);', text)[0].strip())

        for page in range(1, page_count + 1):
            # "&gtk=6" is a literal query parameter; it must not be
            # HTML-decoded into ">k=6".
            js_api = (
                f'{chapter_url}chapterfun.ashx?cid={cid}&page={page}&key='
                f'&language=1&gtk=6&_cid={cid}&_mid={mid}&_dt={dt}&_sign={sign}'
            )
            ret = requests.get(
                js_api,
                headers={'referer': 'http://www.dm5.com'},
                timeout=request_timeout,
            )
            # SECURITY: this evaluates JavaScript returned by the remote
            # server. Only run this script against a site you trust.
            image_url = execjs.eval(ret.text)
            img_url = image_url[0]
            try:
                # Download before opening the file so that a failed request
                # does not leave an empty .jpg behind for the PDF step.
                img_response = requests.get(img_url, timeout=request_timeout)
                img_response.raise_for_status()
                with open(os.path.join(pathTwo, f'{page}.jpg'), 'wb') as f:
                    f.write(img_response.content)
                print(f"下载 {chapter_name} {page}.jpg......")
            except Exception as e:
                # Best effort: report the failed page and continue.
                print(f'{chapter_name} {page}下载失败:{e}')

        # glob.escape protects chapter names that contain glob
        # metacharacters such as "[" or "]".
        jpg_path = glob.glob(os.path.join(glob.escape(pathTwo), '*.jpg'))
        # Sort numerically so that page 10 comes after page 9, not after 1.
        jpg_path.sort(key=lambda x: int(os.path.basename(x).split('.')[0]))
        pdf_path = f'国王排名 {os.path.split(pathTwo)[1]}.pdf'
        if jpg_path:
            jpg_to_pdf(jpg_path, pdf_path)
        else:
            print(f'{chapter_name} 没有可用的图片,跳过生成pdf')
def jpg_to_pdf(jpgs, path):
    """Combine image files into a single PDF, one image per page.

    Each page takes the pixel dimensions of its own image as its size in
    points, so landscape images and images of differing sizes are neither
    clipped nor stretched.

    Args:
        jpgs: Paths of the image files, in page order.
        path: Destination path of the PDF file.

    Returns:
        None. When ``jpgs`` is empty, no file is written.
    """
    if not jpgs:
        print(path + ' >> 没有图片,未生成pdf')
        return

    ca = canvas.Canvas(path)
    for jpg in jpgs:
        # The context manager closes the file handle once the size is read.
        with Image.open(jpg) as img:
            w, h = img.size
        # Size the current page to this image before drawing on it.
        ca.setPageSize((w, h))
        ca.drawImage(jpg, 0, 0, w, h)
        ca.showPage()
    ca.save()
    print(path+' >> 已保存至pdf')
def main():
    """Scrape the chapter list, then download and convert every chapter."""
    getTwo(getOne(url))
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· AI与.NET技术实操系列(二):开始使用ML.NET
· 记一次.NET内存居高不下排查解决与启示
· 探究高空视频全景AR技术的实现原理
· 理解Rust引用及其生命周期标识(上)
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析
· 全程不用写代码,我用AI程序员写了一个飞机大战
· DeepSeek 开源周回顾「GitHub 热点速览」
· 记一次.NET内存居高不下排查解决与启示
· 物流快递公司核心技术能力-地址解析分单基础技术分享
· .NET 10首个预览版发布:重大改进与新特性概览!