# Scraper for Stanford CS142 lecture PDFs
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os
# Ensure the default download root exists before any downloads run
# (no-op when it is already present).
os.makedirs('./lecture', exist_ok=True)
def download_pdf(url, save_path):
    """Download every PDF linked from *url* into per-section folders under *save_path*.

    The folder for each PDF is taken from the ``id`` attribute of the link's
    enclosing ``<div class="row container">``; links outside such a div are
    skipped.

    Args:
        url: Page to scrape for ``<a href="...pdf">`` links.
        save_path: Root directory the per-section folders are created under.

    Raises:
        requests.HTTPError: If fetching the page or any PDF returns an
            HTTP error status.
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # fail fast instead of parsing an error page
    soup = BeautifulSoup(response.content, 'html.parser')
    # BeautifulSoup passes None for <a> tags without an href attribute,
    # so guard before calling .endswith on it.
    pdf_links = soup.find_all('a', href=lambda href: href and href.endswith('.pdf'))
    for link in pdf_links:
        # Resolve relative hrefs against the page URL.
        pdf_url = urljoin(url, link['href'])
        pdf_name = pdf_url.split('/')[-1]
        # The enclosing section div's id names this PDF's destination folder.
        parent_tag = link.find_parent('div', class_='row container')
        if parent_tag:
            folder_name = parent_tag['id']
            folder_path = os.path.join(save_path, folder_name)
            os.makedirs(folder_path, exist_ok=True)
            pdf_path = os.path.join(folder_path, pdf_name)
            # Download the PDF file.
            pdf_response = requests.get(pdf_url, timeout=30)
            pdf_response.raise_for_status()  # don't write an HTML error body as a .pdf
            with open(pdf_path, 'wb') as f:
                f.write(pdf_response.content)
            print(f"Downloaded: {pdf_name} in folder: {folder_name}")
# Example usage: guard so importing this module does not trigger downloads.
if __name__ == "__main__":
    url = 'https://web.stanford.edu/class/cs142/lectures.html'
    save_path = './lecture'
    download_pdf(url, save_path)
# Result: the CS142 lecture PDFs, organized by section, under ./lecture