Python 爬取教习网试卷下载
#!/usr/local/bin/python3
# -*- encoding: utf-8 -*-
"""Download every document of an "album" page as PDFs / media files.

Workflow (see ``main``):

1. Ask the user for an album URL.
2. Scrape the album page for the per-document links and titles.
3. For each document, query the preview API, download the preview images
   and merge them into one PDF, or download the audio/video file directly.
"""
import json
import os
import re
import time

import requests
from lxml import etree
from PIL import Image

# Root folder that receives one sub-folder per album.
dir1 = 'output'

# Browser-like headers sent with every HTTP request.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.53"
    )
}

# Seconds to wait for a server before giving up, so a stalled connection
# cannot hang the whole run.
REQUEST_TIMEOUT = 30

# Site/boilerplate words stripped from page and document titles.
# Alternatives are ordered longest-first: regex alternation takes the first
# alternative that matches, so with "ppt" listed early the longer forms
# ("ppt课件", "ppt课件-教习网") could never match and left "课件" behind.
KEYWORDS_PATTERN = re.compile(
    r"(ppt课件-教习网|课件ppt-教习网|ppt课件|课件ppt|-教习网|教习网|ppt)"
)

# Characters that cannot (portably) appear in a single file or folder name.
_ILLEGAL_NAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')

DOCUMENT_TYPES = ("pptx", "docx", "ppt", "doc", "pdf")
MEDIA_TYPES = ("mp3", "mp4")


def _safe_name(name):
    """Return *name* cleaned up so it is usable as one path component."""
    cleaned = _ILLEGAL_NAME_CHARS.sub("_", name).strip()
    return cleaned or "untitled"


def _page_order(file_name):
    """Sort key that orders numerically named files (``2.jpg`` < ``10.jpg``)."""
    stem = os.path.splitext(file_name)[0]
    if stem.isdecimal():
        return (0, int(stem), file_name)
    # Non-numeric names go after the numbered pages, alphabetically.
    return (1, 0, file_name)


def get_doc_url(url):
    """Scrape an album page for its document links.

    Creates ``<dir1>/<album title>`` as a side effect.

    Args:
        url: URL of the album page.

    Returns:
        ``(doc_urls, mu_titles, dir2)`` -- the ``href`` and ``title``
        attributes of the document anchors and the album output folder --
        or ``None`` when the page could not be fetched or parsed.
    """
    try:
        response = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            print("【请求失败,请检查URL和网络环境!】")
            print(f"HTTP {response.status_code}")
            return None

        all_doc_page = etree.HTML(response.text)
        title = KEYWORDS_PATTERN.sub("", all_doc_page.xpath('//title/text()')[0])
        dir2 = os.path.join(dir1, _safe_name(title))
        os.makedirs(dir2, exist_ok=True)

        anchor = "//div[@class='list-bd']//div[@class='title fl']/a"
        doc_urls = all_doc_page.xpath(f"{anchor}/@href")
        mu_titles = all_doc_page.xpath(f"{anchor}/@title")
        return doc_urls, mu_titles, dir2
    except Exception as e:
        # Best-effort boundary: report any network/parse problem and let the
        # caller decide what to do with the missing result.
        print("【请求失败,请检查URL和网络环境!】")
        print(e)
        return None


def jpg2pdf(images_folder, doc_id):
    """Merge the ``.jpg``/``.png`` files of a folder into ``<doc_id>.pdf``.

    Pages are taken in numeric file-name order. The source images are
    deleted once the PDF exists.

    Args:
        images_folder: Folder holding the page images; the PDF is written here.
        doc_id: Base name (without extension) of the PDF to create.
    """
    # os.listdir() returns entries in arbitrary order, so sort explicitly.
    image_paths = [
        os.path.join(images_folder, file_name)
        for file_name in sorted(os.listdir(images_folder), key=_page_order)
        if file_name.endswith((".jpg", ".png"))
    ]
    if not image_paths:
        # Nothing was downloaded; there is no first page to save.
        print(f"{doc_id} 【转换失败,请检查!】")
        return

    pdf_path = os.path.join(images_folder, f"{doc_id}.pdf")
    pages = []
    try:
        for image_path in image_paths:
            img = Image.open(image_path)
            pages.append(img)
            if img.mode != "RGB":
                # The original code converts every non-RGB image before saving.
                pages[-1] = img.convert("RGB")
                img.close()
        pages[0].save(
            pdf_path,
            "PDF",
            resolution=100.0,
            save_all=True,
            append_images=pages[1:],
        )
    finally:
        # Release the file handles before the images are deleted below.
        for page in pages:
            page.close()

    if os.path.exists(pdf_path):
        print(f"{doc_id} 【转换为pdf成功!】")
        for image_path in image_paths:
            os.remove(image_path)
        print(f"{doc_id} 【文件夹图片删除!】")
    else:
        print(f"{doc_id} 【转换失败,请检查!】")


def download_images(url_list, folder_name):
    """Download each URL to ``<folder_name>/<index>.jpg``.

    A page that fails to download is reported and skipped, so an HTTP error
    body is never stored as an image. Sleeps one second between requests.

    Args:
        url_list: Image URLs, in page order.
        folder_name: Existing folder that receives the files.
    """
    for i, url in enumerate(url_list):
        file_name = f"{i}.jpg"
        print(f"{file_name}正在下载")
        try:
            response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"{file_name}下载失败: {e}")
        else:
            with open(os.path.join(folder_name, file_name), "wb") as f:
                f.write(response.content)
        time.sleep(1)


def download_mp3(url, folder_name, title):
    """Download one audio/video file to ``<folder_name>/<title>``.

    Args:
        url: URL of the media file.
        folder_name: Existing folder that receives the file.
        title: File name to store the download under.
    """
    file_name = f"{title}"
    print(f"{file_name}正在下载")
    try:
        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"{file_name}下载失败: {e}")
        return
    with open(os.path.join(folder_name, file_name), "wb") as f:
        f.write(response.content)


def get_jpg(doc_urls, mu_titles, dir2):
    """Download every document of an album into its own sub-folder of *dir2*.

    For each link the document id is taken from the part between the first
    ``-`` and the following ``.``, the preview API is queried, and every
    entry under ``data[*].format_subsets[*]`` is handled by ``file_type``:
    documents are downloaded as preview images and merged into a PDF,
    mp3/mp4 entries are downloaded directly.

    Args:
        doc_urls: Document links as returned by ``get_doc_url``.
        mu_titles: Document titles, parallel to *doc_urls*.
        dir2: Album output folder.
    """
    for doc_url, mu_title in zip(doc_urls, mu_titles):
        try:
            doc_id = doc_url[1:].split("-")[1].split(".")[0]
        except IndexError:
            print(f"无法解析文档ID,已跳过: {doc_url}")
            continue

        url = f"https://www.####.com/api/document/preview?document_id={doc_id}&all=1"
        print(url)
        try:
            response = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT).text
            data_dict = json.loads(response)
        except (requests.RequestException, ValueError) as e:
            # ValueError covers json.JSONDecodeError (non-JSON reply).
            print(f"预览接口请求失败,已跳过: {e}")
            continue

        titlebt = KEYWORDS_PATTERN.sub("", mu_title)
        dir3 = os.path.join(dir2, _safe_name(titlebt))
        os.makedirs(dir3, exist_ok=True)

        # A reply without "data" (e.g. an API error object) yields no items.
        for item in data_dict.get('data') or []:
            for subset in item["format_subsets"]:
                title = subset["title"]
                for ext in (".pptx", ".docx", ".ppt", ".doc", ".pdf"):
                    title = title.replace(ext, "")
                # The title becomes a file name, so it must be one path component.
                safe_title = _safe_name(title)

                file_type = subset["file_type"].lower()
                if file_type in DOCUMENT_TYPES:
                    url_list = [
                        f'https:{preview["url"]}'
                        for preview in subset["preview_files"]
                    ]
                    download_images(url_list, dir3)
                    print(f"试卷{title}下载完成")
                    jpg2pdf(dir3, safe_title)
                elif file_type in MEDIA_TYPES:
                    mp3_url = "https://img-preview.######.com/"+subset["vga_capture_url"]
                    download_mp3(mp3_url, dir3, safe_title)
                else:
                    print("这不是PPT、Word文档,也不是MP3、MP4音视频文件")


def creat_dir(dir_name):
    """Create *dir_name* (and parents) if it does not exist yet."""
    os.makedirs(dir_name, exist_ok=True)


def main():
    """Prompt for an album URL and download all of its documents."""
    url = input("\n请输入成套试卷链接: ").strip()
    while "album" not in url:
        print("【不是成套试卷链接,请重新输入!】")
        url = input("\n请输入成套试卷链接: ").strip()

    creat_dir(dir1)

    # get_doc_url() returns None on failure; unpacking that directly would
    # raise TypeError instead of showing the message below.
    result = get_doc_url(url)
    if not result or not result[0]:
        print("无法获取试卷链接,请检查输入的链接是否正确。")
        return

    doc_urls, mu_titles, dir2 = result
    get_jpg(doc_urls, mu_titles, dir2)


if __name__ == "__main__":
    main()
本文来自博客园,作者:安之立吖,转载请注明原文链接:https://www.cnblogs.com/anzhili/p/18001707