python爬取教习网试卷下载

#!/usr/local/bin/python3
# -*- encoding: utf-8 -*-
import os
import json
import requests
from PIL import Image
from lxml import etree
import time
import re

def get_doc_url(url):
    """Fetch an album page and collect the links of the documents it lists.

    Creates a sub-directory of the module-level ``dir1`` named after the
    cleaned page title.

    Args:
        url: Address of the album (document set) page.

    Returns:
        A ``(doc_urls, mu_titles, dir2)`` tuple: the document hrefs, their
        ``title`` attributes, and the directory created for the album.
        On any failure ``([], [], None)`` is returned, so callers can always
        unpack the result.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.53"
    }
    # Longest alternatives first: regex alternation is ordered, so a leading
    # "ppt" would otherwise shadow "ppt课件" and "ppt课件-教习网".
    keywords_pattern = r"(ppt课件-教习网|课件ppt-教习网|ppt课件|课件ppt|-教习网|教习网|ppt)"
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url=url, headers=headers, timeout=30)
        if response.status_code != 200:
            print("【请求失败,请检查URL和网络环境!】")
            print(f"HTTP {response.status_code}")
            return [], [], None
        all_doc_page = etree.HTML(response.text)
        title = re.sub(keywords_pattern, "", all_doc_page.xpath('//title/text()')[0])
        # The title becomes a directory name: replace characters that are
        # path separators or invalid in file names on common platforms.
        title = re.sub(r'[\\/:*?"<>|\r\n]+', "_", title).strip()
        dir2 = os.path.join(dir1, title)
        os.makedirs(dir2, exist_ok=True)
        doc_urls = all_doc_page.xpath("//div[@class='list-bd']//div[@class='title fl']/a/@href")
        mu_titles = all_doc_page.xpath("//div[@class='list-bd']//div[@class='title fl']/a/@title")
        return doc_urls, mu_titles, dir2
    except Exception as e:
        # Script-level boundary: network, parsing and filesystem errors are
        # all reported the same way, then signalled with the empty result.
        print("【请求失败,请检查URL和网络环境!】")
        print(e)
        return [], [], None

def jpg2pdf(images_folder, doc_id):
    """Merge the page images in a folder into one PDF, then delete the images.

    Args:
        images_folder: Directory holding the ``.jpg``/``.png`` page images;
            the PDF is written into the same directory.
        doc_id: Base name (without extension) of the PDF file to create.

    Returns:
        None. Progress and failure messages are printed.
    """
    def _page_order(file_name):
        # Purely numeric stems sort by value so that "10.jpg" follows "9.jpg";
        # any other name sorts after them, alphabetically.
        stem = os.path.splitext(file_name)[0]
        if stem.isdecimal():
            return (0, int(stem), file_name)
        return (1, 0, file_name)

    # os.listdir() returns entries in arbitrary order, so sort explicitly.
    image_names = sorted(
        (name for name in os.listdir(images_folder) if name.endswith((".jpg", ".png"))),
        key=_page_order,
    )
    if not image_names:
        # Nothing to convert; avoid indexing into an empty page list.
        print(f"{doc_id} 【转换失败,请检查!】")
        return
    image_paths = [os.path.join(images_folder, name) for name in image_names]

    pdf_path = os.path.join(images_folder, f"{doc_id}.pdf")
    pages = []
    try:
        for image_path in image_paths:
            # convert() loads the pixel data into a new image, so the
            # underlying file handle can be closed right away.
            with Image.open(image_path) as img:
                pages.append(img.convert("RGB"))
        pages[0].save(pdf_path, "PDF", resolution=100.0, save_all=True, append_images=pages[1:])
    finally:
        for page in pages:
            page.close()

    if os.path.exists(pdf_path):
        print(f"{doc_id} 【转换为pdf成功!】")
        for image_path in image_paths:
            os.remove(image_path)
        print(f"{doc_id} 【文件夹图片删除!】")
    else:
        print(f"{doc_id} 【转换失败,请检查!】")

def download_images(url_list, folder_name):
    """Download each URL in order and save it as ``<index>.jpg``.

    A page that cannot be fetched is reported and skipped instead of an
    error body being written to disk as if it were an image.

    Args:
        url_list: Image URLs, in page order.
        folder_name: Existing directory that receives the files.

    Returns:
        None.
    """
    headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.53"
        }
    for i, url in enumerate(url_list):
        file_name = f"{i}.jpg"
        file_path = os.path.join(folder_name, file_name)
        print(f"{file_name}正在下载")
        try:
            # A timeout keeps one stalled request from hanging the whole run.
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"{file_name}下载失败,已跳过: {e}")
        else:
            with open(file_path, "wb") as f:
                f.write(response.content)
        # Pause between requests, as the original did.
        time.sleep(1)

def download_mp3(url, folder_name, title):
    """Download one audio/video file and save it under ``title``.

    The body is streamed to a temporary ``.part`` file and renamed only once
    the download completed, so a failed transfer never leaves a truncated or
    bogus media file behind.

    Args:
        url: Address of the media file.
        folder_name: Existing directory that receives the file.
        title: File name to save as (used verbatim).

    Returns:
        None.
    """
    headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.53"
        }
    file_name = f"{title}"
    file_path = os.path.join(folder_name, file_name)
    part_path = file_path + ".part"
    print(f"{file_name}正在下载")
    try:
        # stream=True avoids holding a whole video in memory at once.
        with requests.get(url, headers=headers, stream=True, timeout=30) as response:
            response.raise_for_status()
            with open(part_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
        os.replace(part_path, file_path)
    except requests.RequestException as e:
        print(f"{file_name}下载失败,已跳过: {e}")
        if os.path.exists(part_path):
            os.remove(part_path)

def _safe_file_name(name):
    """Replace path separators and characters invalid in file names."""
    return re.sub(r'[\\/:*?"<>|\r\n]+', "_", name).strip()


def get_jpg(doc_urls, mu_titles, dir2):
    """Download every document of an album into its own sub-directory.

    For each document the preview API is queried. Office/PDF documents are
    fetched as page images and merged into a PDF; audio/video entries are
    downloaded directly. A document whose id cannot be parsed or whose
    preview data cannot be fetched is reported and skipped.

    Args:
        doc_urls: Document hrefs taken from the album page; the document id
            is read from the part between the first ``-`` and the next ``.``.
        mu_titles: Document titles, parallel to ``doc_urls``.
        dir2: Album directory under which per-document folders are created.

    Returns:
        None.
    """
    # Longest alternatives first: regex alternation is ordered, so a leading
    # "ppt" would otherwise shadow "ppt课件" and "ppt课件-教习网".
    keywords_pattern = re.compile(r"(ppt课件-教习网|课件ppt-教习网|ppt课件|课件ppt|-教习网|教习网|ppt)")
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.53"
    }
    for i, doc_url in enumerate(doc_urls):
        id_parts = doc_url[1:].split("-")
        doc_id = id_parts[1].split(".")[0] if len(id_parts) > 1 else ""
        if not doc_id:
            print(f"【无法解析文档ID,已跳过】{doc_url}")
            continue
        url = f"https://www.####.com/api/document/preview?document_id={doc_id}&all=1"
        print(url)
        try:
            response = requests.get(url=url, headers=headers, timeout=30).text
            data_dict = json.loads(response)
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body (e.g. an HTML error page).
            print(f"【获取预览信息失败,已跳过】{e}")
            continue
        # Fall back to the id if the title list is shorter than the url list.
        raw_title = mu_titles[i] if i < len(mu_titles) else doc_id
        titlebt = _safe_file_name(keywords_pattern.sub("", raw_title))
        dir3 = os.path.join(dir2, titlebt)
        os.makedirs(dir3, exist_ok=True)
        items = data_dict.get('data') if isinstance(data_dict, dict) else None
        for item in items or []:
            for subset in item["format_subsets"]:
                title = subset["title"]
                for ext in (".pptx", ".docx", ".ppt", ".doc", ".pdf"):
                    title = title.replace(ext, "")
                # The title is used as a file name below.
                title = _safe_file_name(title)
                file_type = subset["file_type"].lower()
                if file_type in ("pptx", "docx", "ppt", "doc", "pdf"):
                    preview_files = subset["preview_files"]
                    url_list = [f'https:{preview["url"]}' for preview in preview_files]
                    download_images(url_list, dir3)
                    print(f"试卷{title}下载完成")
                    jpg2pdf(dir3, title)
                elif file_type in ("mp3", "mp4"):
                    mp3_url = "https://img-preview.######.com/"+subset["vga_capture_url"]
                    download_mp3(mp3_url, dir3, title)
                else:
                    print("这不是PPT、Word文档,也不是MP3、MP4音视频文件")

def creat_dir(dir_name):
    """Ensure ``dir_name`` exists, creating missing parent directories too."""
    os.makedirs(name=dir_name, exist_ok=True)

def main():
    """Prompt for an album link and download every document it lists.

    Keeps prompting until the entered link contains ``"album"``, creates the
    output directory, then hands the collected document links to ``get_jpg``.
    """
    url = input("\n请输入成套试卷链接: ").strip()
    while "album" not in url:
        print("【不是成套试卷链接,请重新输入!】")
        url = input("\n请输入成套试卷链接: ").strip()
    creat_dir(dir1)
    # get_doc_url may yield nothing usable when the request fails; normalise
    # that to an empty result so the unpacking below cannot raise.
    result = get_doc_url(url)
    doc_urls, mu_titles, dir2 = result if result else ([], [], None)
    if doc_urls:
        get_jpg(doc_urls, mu_titles, dir2)
    else:
        print("无法获取试卷链接,请检查输入的链接是否正确。")

# Root output directory; read as a module-level global by main() and
# get_doc_url().
dir1 = 'output'

# Run the interactive downloader only when this file is executed as a script.
if __name__ == "__main__":
    main()

 

posted @ 2024-05-22 15:39  安之立吖  阅读(19)  评论(1)  编辑  收藏  举报