爬虫爬取CS142

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os

# Make sure the default download root used by the example below exists
# (no-op if the directory is already there).
lecture_dir = './lecture'
os.makedirs(lecture_dir, exist_ok=True)

def download_pdf(url, save_path):
    """Scrape `url` for links to PDF files and download each one.

    Each PDF link is expected to sit inside a ``<div class="row container">``
    whose ``id`` names a lecture section; the file is saved under
    ``save_path/<section id>/<file name>``. PDF links outside such a
    container are skipped.

    Parameters:
        url: Page to scrape for ``<a href="*.pdf">`` links.
        save_path: Root directory the per-section folders are created in.

    Raises:
        requests.HTTPError: if fetching the page or a PDF fails.
        requests.Timeout: if a request exceeds the timeout.
    """
    # Timeout so a stalled server cannot hang the script forever;
    # raise_for_status so an error page is not silently parsed.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # `href is not None` guards against <a> tags with no href attribute:
    # BeautifulSoup passes None for those, and the bare .endswith() call
    # in the original raised AttributeError.
    pdf_links = soup.find_all(
        'a',
        href=lambda href: href is not None and href.endswith('.pdf'),
    )

    for link in pdf_links:
        # Resolve relative hrefs against the page URL.
        pdf_url = urljoin(url, link['href'])
        pdf_name = pdf_url.split('/')[-1]

        # The enclosing section container's id doubles as the folder name.
        parent_tag = link.find_parent('div', class_='row container')
        if not parent_tag:
            continue  # link not inside a known section; skip it

        folder_name = parent_tag['id']
        folder_path = os.path.join(save_path, folder_name)
        os.makedirs(folder_path, exist_ok=True)
        pdf_path = os.path.join(folder_path, pdf_name)

        # Download the PDF file and write it to disk.
        pdf_response = requests.get(pdf_url, timeout=60)
        pdf_response.raise_for_status()
        with open(pdf_path, 'wb') as f:
            f.write(pdf_response.content)

        print(f"Downloaded: {pdf_name} in folder: {folder_name}")

# Example usage: mirror the CS142 lecture PDFs into ./lecture.
# Guarded so that importing this module does not trigger the download
# as a side effect (the original ran it at import time).
if __name__ == "__main__":
    url = 'https://web.stanford.edu/class/cs142/lectures.html'
    save_path = './lecture'
    download_pdf(url, save_path)

最后的结果CS142

posted @ 2024-07-05 15:05  zddkk  阅读(2)  评论(0编辑  收藏  举报