# s6tu
# -*- coding: utf-8 -*-
# @Time : 2018/03/30 15:20
# @Author : cxa
# @File : liuuchnagtu.py
# @Software: PyCharm
#
# Crawls the www.s6tu.com "popular" listing pages and saves every image
# referenced on them into a local "setu" directory.
import os
import random
import threading
import time
import traceback

import requests
from fake_useragent import UserAgent as UA
from lxml import html


class GetImage():
    """Crawl the s6tu listing pages and download the images they reference.

    Instance attributes (set in ``__init__``):
        url: listing-page URL template; ``{}`` receives the page number.
        imgpath: XPath that selects the ``src`` of each listed image.
        headers: HTTP headers sent with every listing-page request.
    """

    # Each page's image list is cut into slices of ``len // SPLIT`` items and
    # every slice is downloaded by its own thread (so a page whose length is
    # not a multiple of SPLIT gets one extra, shorter slice).
    SPLIT = 3

    # Seconds to wait for a connection / for the next bytes of a response.
    # Without a timeout a stalled server would hang the crawl forever.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        self.url = "http://www.s6tu.com/explore/popular/?list=images&sort=likes_desc&page={}"
        self.imgpath = "//div[@class='list-item-image fixed-size']/a/img/@src"
        self.headers = {
            'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
            # Was 'Accept - Encoding' (with spaces), which is not the
            # Accept-Encoding header at all.
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.5',
            'Connection': 'Keep-Alive',
            'User-Agent': UA().random,
            'Host': 'www.s6tu.com',
        }

    def get_oen_page(self):
        """Walk listing pages 1..998 and download the images of each page.

        For every page that answers with HTTP 200 the image URLs are
        extracted, rewritten and downloaded in parallel; the method then
        sleeps 1-5 seconds before requesting the next page. The crawl stops
        early at the first page that lists no images. Any exception aborts
        the crawl and is printed with its traceback; nothing is re-raised.
        """
        try:
            os.makedirs("setu", exist_ok=True)
            # One session for the whole crawl: connections are reused and the
            # session is closed when the crawl ends.
            with requests.session() as session:
                for page in range(1, 999):
                    resp = session.get(
                        self.url.format(page),
                        headers=self.headers,
                        timeout=self.REQUEST_TIMEOUT,
                    )
                    if resp.status_code == requests.codes.ok:
                        root = html.fromstring(resp.text)
                        # Dropping ".md." from the URL presumably turns the
                        # medium-size thumbnail URL into the full-size one.
                        newlist = [
                            src.replace(".md.", ".")
                            for src in root.xpath(self.imgpath)
                        ]
                        if not newlist:
                            # Nothing matched: either the listing is exhausted
                            # or the markup changed. Requesting further pages
                            # would be pointless.
                            print("no images found, stop")
                            break
                        self._download_in_threads(newlist)
                        print("get one page over")
                    else:
                        print("errro")
                    # Random pause between pages to avoid hammering the site.
                    time.sleep(random.randint(1, 5))
        except Exception:
            # Top-level boundary: report and stop. ``Exception`` (not a bare
            # ``except``) so Ctrl-C / SystemExit still terminate the program.
            print("error,here is details:{}".format(traceback.format_exc()))

    def _download_in_threads(self, newlist):
        """Download every URL in *newlist*, one thread per slice, and wait."""
        # max(1, ...) keeps the step valid when the page has fewer than SPLIT
        # images; a step of 0 would make range() raise ValueError.
        step = max(1, len(newlist) // self.SPLIT)
        workers = []
        for start in range(0, len(newlist), step):
            worker = threading.Thread(
                target=self.getimglist,
                args=(newlist, start, start + step),
            )
            workers.append(worker)
            worker.start()
        for worker in workers:
            worker.join()

    def getimglist(self, newlist, start, end):
        """Download ``newlist[start:end]`` into the ``setu`` directory.

        Args:
            newlist: list of image URLs.
            start: index of the first URL to download.
            end: index one past the last URL; clamped to ``len(newlist)``.

        Each image is stored under the last path component of its URL. An
        image whose request fails or does not answer HTTP 200 is reported and
        skipped so the remaining images of the slice are still fetched.
        """
        end = min(end, len(newlist))
        # The listing headers pin ``Host`` to www.s6tu.com; image URLs may
        # point at another host, so let requests derive Host from each URL.
        img_headers = {
            key: value
            for key, value in self.headers.items()
            if key.lower() != 'host'
        }
        for index in range(start, end):
            imgurl = newlist[index]
            print(imgurl)
            try:
                req = requests.get(
                    imgurl,
                    headers=img_headers,
                    timeout=self.REQUEST_TIMEOUT,
                )
            except requests.RequestException:
                print("error,here is details:{}".format(traceback.format_exc()))
                continue
            if req.status_code != requests.codes.ok:
                # Do not save an error page under an image file name.
                print("errro")
                continue
            with open(os.path.join("setu", os.path.basename(imgurl)), "wb") as fs:
                fs.write(req.content)


if __name__ == "__main__":
    GetImage().get_oen_page()