"""Recursively crawl the directory listing at https://d.shikey.com/jike/
and download every file it links to."""

import os

import bs4
import requests
class Queue(object):
    """A simple FIFO queue."""

    def __init__(self):
        self.items = []
        self.sum = 0

    def is_empty(self):
        """Return True if the queue is empty."""
        return len(self.items) == 0

    def push(self, element):
        """Enqueue an element."""
        self.items.append(element)
        self.sum += 1

    def pop(self):
        """Dequeue and return the oldest element."""
        return self.items.pop(0)

    def size(self):
        """Return the current number of queued elements."""
        return len(self.items)

    def total_size(self):
        """Return the total number of elements ever enqueued."""
        return self.sum
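

# A minimal usage sketch of Queue (illustration only):
#
#     q = Queue()
#     q.push("/jike/")
#     q.push("/jike/sub/")
#     q.pop()         # -> "/jike/"  (FIFO order)
#     q.size()        # -> 1
#     q.total_size()  # -> 2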
class RequestsCrawler(object):
    """Page fetcher implemented with requests."""

    def get_page(self, url):
        """Fetch a page and return its body as text."""
        header = {
            # Implicit string concatenation keeps the UA on one logical line
            # without leaking the indentation into the header value.
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; "
                          "rv:48.0) Gecko/20100101 Firefox/48.0",
            "Connection": "close",
        }
        with requests.Session() as s:
            # timeout keeps the crawler from hanging forever on a dead host
            res = s.get(url=url, headers=header, timeout=30)
            text = res.text
        return text
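

# Example call (illustration only):
#
#     crawler = RequestsCrawler()
#     html = crawler.get_page("https://d.shikey.com/jike/")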
def get_link(url, sub_url):
    """Return the hrefs of all entries on one directory-listing page."""
    crawler = RequestsCrawler()
    text = crawler.get_page(url + sub_url)
    soup = bs4.BeautifulSoup(text, "html.parser")
    # Each listing entry is a <td class="file"> wrapping an <a href=...>.
    tds = soup.find_all("td", attrs={"class": "file"})
    rst = []
    for td in tds:
        a = td.find("a", href=True)
        if a is not None:  # skip cells without a link
            rst.append(a.attrs["href"])
    return rst
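

# The selectors above assume listing markup of roughly this shape, inferred
# from the td/class/file lookup and the "?preview" convention used in
# __main__ below (not verified against the live site):
#
#     <td class="file"><a href="/jike/sub/">sub/</a></td>
#     <td class="file"><a href="/jike/file.mp4?preview">file.mp4</a></td>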
def download(url, link):
    """Download url+link, mirroring the remote directory layout locally."""
    r = requests.get(url + link)
    path = os.getcwd()
    # link starts with '/', so path + link is an absolute local file path;
    # create its parent directories before writing.
    dir_path = os.path.dirname(path + link)
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    print("Downloading", path + link)
    with open(path + link, "wb") as code:
        code.write(r.content)
    print("Download finished")
if __name__ == "__main__":
#print(text)
url = "https://d.shikey.com"
queue = Queue()
queue.push("/jike/")
while(not queue.is_empty()):
link = queue.pop()
print("遍历目录:", link)
rst = get_link(url, link)
for link in rst:
if not link.endswith("?preview"):
queue.push(link)
else:
download(url, link[:-8])