[Crawler] Batch-downloading GeekTime (极客时间) courses

import os

import bs4
import requests

class Queue(object):
    """A simple FIFO queue."""
    def __init__(self):
        self.items = []
        self.sum = 0

    def is_empty(self):
        """Return True if the queue is empty."""
        return len(self.items) == 0

    def push(self, element):
        """Enqueue an element."""
        self.items.append(element)
        self.sum += 1

    def pop(self):
        """Dequeue and return the front element."""
        return self.items.pop(0)

    def size(self):
        """Return the current number of queued elements."""
        return len(self.items)

    def total_size(self):
        """Return the total number of elements ever enqueued."""
        return self.sum
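
The pop() above removes from the front of a Python list, which costs O(n) per
dequeue. For larger crawls, here is a minimal sketch of the same interface
backed by collections.deque, whose popleft() is O(1); this class is my own
illustration, not part of the original post:

from collections import deque

class DequeQueue(object):
    """Drop-in replacement for Queue with O(1) dequeue."""
    def __init__(self):
        self.items = deque()
        self.sum = 0

    def is_empty(self):
        return len(self.items) == 0

    def push(self, element):
        self.items.append(element)
        self.sum += 1

    def pop(self):
        return self.items.popleft()  # O(1), no list copying

    def size(self):
        return len(self.items)

    def total_size(self):
        return self.sum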

class RequestsCrawler(object):
    """Page fetcher built on requests."""
    def get_page(self, url):
        """Fetch url and return the response body as text."""
        headers = {
            # Implicit string concatenation keeps the User-Agent a single
            # clean value with no stray whitespace in the middle.
            "User-Agent": ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:48.0) "
                           "Gecko/20100101 Firefox/48.0"),
            "Connection": "close"
        }

        with requests.Session() as s:
            res = s.get(url=url, headers=headers, timeout=30)
            res.raise_for_status()  # fail loudly instead of saving an error page
            return res.text
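
If the host throttles or drops connections mid-crawl, requests can retry
transient failures through urllib3's Retry helper. A minimal sketch; the retry
count, backoff factor, and status list are illustrative choices, not values
from the post:

from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session():
    """Build a requests.Session that retries transient failures with backoff."""
    session = requests.Session()
    retry = Retry(total=3, backoff_factor=1,
                  status_forcelist=[429, 500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session

get_page() could then use make_session() in place of requests.Session().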

def get_link(url, sub_url):
    """Fetch the listing page at url+sub_url and return the hrefs of its entries."""
    crawler = RequestsCrawler()
    text = crawler.get_page(url + sub_url)
    soup = bs4.BeautifulSoup(text, "html.parser")
    # Every entry (file or subdirectory) sits in a <td class="file"> cell
    # that wraps a single link.
    tds = soup.find_all("td", attrs={'class': 'file'})
    rst = []
    for td in tds:
        href = td.find('a', href=True).attrs['href']
        rst.append(href)
    return rst
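
To sanity-check the selector logic without touching the network, the same
extraction can be run against a hand-written fragment. The HTML below is a
guess at the listing markup, reverse-engineered from the selectors above, not
a captured page:

import bs4

sample = '''
<td class="file"><a href="/jike/some-course/">some-course/</a></td>
<td class="file"><a href="/jike/some-course/01.mp4?preview">01.mp4</a></td>
'''
soup = bs4.BeautifulSoup(sample, "html.parser")
hrefs = [td.find('a', href=True)['href']
         for td in soup.find_all("td", attrs={'class': 'file'})]
print(hrefs)  # ['/jike/some-course/', '/jike/some-course/01.mp4?preview']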

def download(url, link):
    """Download url+link, mirroring the remote path under the working directory."""
    r = requests.get(url + link)
    path = os.getcwd()
    # Recreate the remote directory layout locally, e.g. /jike/a/b.mp4 -> ./jike/a/.
    target_dir = path + '/'.join(link.split('/')[:-1])
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    print("Downloading", path + link)
    with open(path + link, "wb") as f:
        f.write(r.content)
    print("Download complete")

if __name__ == "__main__":
    url = "https://d.shikey.com"
    queue = Queue()
    queue.push("/jike/")
    # Breadth-first walk of the listing: directory links go back onto the
    # queue, file links (marked by a "?preview" suffix) get downloaded.
    while not queue.is_empty():
        link = queue.pop()
        print("Traversing directory:", link)
        for href in get_link(url, link):
            if not href.endswith("?preview"):
                queue.push(href)
            else:
                download(url, href[:-len("?preview")])
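
A crawl like this is friendlier to the host with a randomized delay between
requests; a minimal sketch (the helper name and interval bounds are my own
illustration):

import random
import time

def polite_pause(low=0.5, high=2.0):
    """Sleep for a random interval so the crawl does not hammer the server."""
    time.sleep(random.uniform(low, high))

Calling polite_pause() after each download() in the loop above would space the
requests out.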