A crawler exercise: scraping girl pictures ("妹子图") from jandan.net
The image URLs on the page are obfuscated: each image is stored as a hash that is decoded here with base64, and the site's JS file carries a token (constant) that is fetched along the way to recover the image URL.
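As a quick sketch of just the decoding step (the hash below is fabricated on the spot purely to demonstrate the round trip; it is not a real value from the site): the img-hash strings are base64 without padding, so the padding has to be restored before decoding.

    import base64

    def decode_base64(data):
        # img-hash values come without base64 padding; restore it before decoding
        data += '=' * (-len(data) % 4)
        return base64.b64decode(data)

    # a made-up hash, encoded here only to show the round trip
    fake_hash = base64.b64encode(b'//wx3.sinaimg.cn/mw600/example.jpg').decode().rstrip('=')
    print('http:' + decode_base64(fake_hash).decode('utf8'))
    # -> http://wx3.sinaimg.cn/mw600/example.jpg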
Without further ado, here is the source code:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import hashlib
import base64
from bs4 import BeautifulSoup
import requests
import re
import os
import queue
import threading
import math
from multiprocessing import Pool
import sys

sys.stderr = None  # suppress all error output; see the note at the end

'''
URL decoding: the img-hash on the page is base64 (without padding);
the constant extracted from the JS file is accepted but not needed here.
'''
def parse(imgHash, constant):
    return decode_base64(imgHash).decode('utf8')

def md5(src):
    # md5 helper; not used in the current decode path
    m = hashlib.md5()
    m.update(src.encode("utf8"))
    return m.hexdigest()

def decode_base64(data):
    # restore the missing base64 padding before decoding
    data += '=' * (-len(data) % 4)
    return base64.b64decode(data)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/62.0.3202.94 Safari/537.36'
}

'''
Page-scraping thread
'''
class Spider(threading.Thread):
    def __init__(self, pages, proxies, url_manager):
        threading.Thread.__init__(self)
        self.pages = pages
        self.proxies = proxies
        self.url_manager = url_manager

    def get_Page(self, page, proxies, url_manager):
        bs_page = BeautifulSoup(page, "lxml")
        '''
        Locate the JS file referenced by the page, then pull the constant out of it
        '''
        try:
            model = re.findall(r'.*<script\ssrc=\"\/\/(cdn.jandan.net\/static\/min.*?)\"><\/script>.*', page)
            jsfile_url = "http://" + model[-1]  # the page may list two script URLs; take the last match
        except Exception as e:
            print(e)
            return  # no JS file found on this page, nothing to decode
        jsfile = requests.get(jsfile_url, headers=headers, proxies=proxies, timeout=3).text
        constant = re.search(r'.*remove\(\);var\sc=\w+\(e,\"(\w+)\".*', jsfile).group(1)
        '''
        Feed the constant and each img-hash to parse() to get the image URL
        '''
        for item in bs_page.select('.img-hash'):
            img_url = 'http:' + parse(item.text, constant)
            url_manager.addNewUrl(img_url)

    def run(self):
        for page in self.pages:
            self.get_Page(page, self.proxies, self.url_manager)

'''
Entry point for one batch of pages
'''
def main(amount):
    url_manager = UrlManager()
    proxies = {'http': ''}  # IP-proxy support not added yet; the script runs fine without it
    current_url = 'http://jandan.net/ooxx'  # URL of the current page
    '''
    Fetch the page sources, following the "Older Comments" link each time
    '''
    pages = []  # source of every page to be scraped
    try:
        for i in range(amount):
            current_page = requests.get(current_url, headers=headers).text  # source of the current page
            pages.append(current_page)
            current_url = 'http:' + re.search(r'.*Older\sComments\"\shref=\"(.*?)\"\sclass.*',
                                              current_page).group(1)  # extract the next page's URL
    except Exception as e:
        pass
    page_threads = []
    t_amount = 10 if len(pages) > 10 else len(pages)  # number of page-scraping threads
    chunk = math.ceil(len(pages) / t_amount) if t_amount else 0  # pages per thread, rounded up so none are dropped
    for i in range(t_amount):
        t = Spider(pages[chunk * i:chunk * (i + 1)], proxies, url_manager)
        page_threads.append(t)
    for t in page_threads:
        t.start()
    for t in page_threads:
        t.join()
    img_threads = []
    for i in range(10):  # a fixed pool of 10 threads for downloading images
        t = Download(url_manager)
        img_threads.append(t)
    for t in img_threads:
        t.start()
    for t in img_threads:
        t.join()

L = threading.Lock()

'''
Image-downloading thread
'''
class Download(threading.Thread):
    def __init__(self, url_manager):
        threading.Thread.__init__(self)
        self.url_manager = url_manager
        self.pic_headers = dict(headers)  # copy so the shared headers dict is not modified
        self.pic_headers['Host'] = 'wx3.sinaimg.cn'

    def download_Img(self, url):
        isGif = re.match(r'(.*\.sinaimg\.cn\/)(\w+)(\/.+\.gif)', url)
        if isGif:
            url = isGif.group(1) + 'large' + isGif.group(3)  # fetch the full-size version of gifs
        extensionName = re.match(r'.*(\.\w+)', url).group(1)  # image extension
        with L:  # serialize directory listing and writing so file names stay unique
            if not os.path.exists('img'):
                os.mkdir('img')
            with open('img/' + str(len(os.listdir('./img'))) + extensionName, 'wb') as f:
                f.write(requests.get(url, headers=self.pic_headers).content)

    def run(self):
        while not self.url_manager.isEmpty():
            imgUrl = self.url_manager.getNewUrl()
            self.download_Img(imgUrl)
            self.url_manager.addOldUrl(imgUrl)

'''
URL repository: hands out new URLs and records the ones already downloaded
'''
class UrlManager:
    def __init__(self):
        self.url_used = []
        self.url_target = queue.Queue()
        if os.path.exists('url.txt'):
            with open('url.txt', 'r') as f:
                for eachline in f.readlines():
                    self.url_used.append(eachline.strip())
        else:
            open("url.txt", 'w').close()

    def getNewUrl(self):
        return self.url_target.get()

    def isEmpty(self):
        return self.url_target.empty()

    def addNewUrl(self, newUrl):
        if newUrl not in self.url_used:
            self.url_target.put(newUrl)

    def addOldUrl(self, oldUrl):
        self.url_used.append(oldUrl)
        with open('url.txt', 'a') as f:
            f.write(oldUrl + '\n')

if __name__ == '__main__':
    num_list = [i for i in range(48)]
    res_l = []
    p = Pool()
    for i in num_list:
        res = p.apply_async(main, args=(int(i),))
        res_l.append(res)
    for k in res_l:
        k.get()  # wait for each batch to finish
        print('Downloaded batch (%s)' % k)
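The entry point fans the work out with a multiprocessing Pool, one main(i) call per process. Below is a minimal, self-contained sketch of that Pool + apply_async pattern, with a hypothetical worker standing in for main():

    from multiprocessing import Pool

    def worker(n):
        # stand-in for main(n), which scrapes n pages in its own process
        return n * n

    if __name__ == '__main__':
        with Pool(4) as p:
            results = [p.apply_async(worker, args=(i,)) for i in range(8)]
            print([r.get() for r in results])  # block until every batch is done

The real script does the same thing, only with main and range(48).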
The scraper is built on multithreading plus multiprocessing (and all errors are suppressed, which leaves plenty of room to extend it). Thanks for reading!
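For instance, when extending or debugging it, one might first restore error output and plug in a real proxy; a small sketch (the proxy address is a placeholder, not something the script provides):

    import sys

    # undo the blanket `sys.stderr = None` so tracebacks become visible again
    sys.stderr = sys.__stderr__

    # hypothetical proxy settings that could be passed to the requests calls in main()
    proxies = {'http': 'http://127.0.0.1:8080'}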