Python superbwallpapers anime-category downloader
Multi-threaded download of wallpapers from the anime category on superbwallpapers.com; the original script has a known bug.
# -*- coding: utf-8 -*-
import os, re, socket, threading, time
import urllib, urllib2
from bs4 import BeautifulSoup

# Global socket timeout so a stalled urlretrieve() call does not hang forever.
# See: http://outofmemory.cn/code-snippet/16848/python-through-urllib-urlretrieve-file-setting-method
socket.setdefaulttimeout(25)

# Wallpaper links look like /anime/<slug>-12345; each wallpaper appears twice
# on a listing page (thumbnail link + title link).
re_link = re.compile(r'/anime/.{0,50}-\d{5}')
re_404 = re.compile(r'Page not found - Please try some of the popular items below')  # currently unused

main_url = []
pic_page = []
pic_name = []
pic_url = []
pic_url_number = []

end_page = 40
for each_page in range(end_page):
    main_url.append("http://www.superbwallpapers.com/anime/" + str(each_page + 1) + ".html")
print main_url

def one_page(page_url, page_index):
    """Collect wallpaper page URLs, names and download URLs from one listing page."""
    main_page_html = urllib2.urlopen(page_url).read()
    soup = BeautifulSoup(main_page_html, from_encoding="gb18030")
    match_pic = []
    for link in soup.find_all('a'):
        href = str(link.get('href'))
        match = re_link.match(href)
        if match:
            match_pic.append(match.group())
    # Every wallpaper is linked twice, so step through the matches two at a time.
    for i in range(len(match_pic) / 2):
        pic_page.append('http://www.superbwallpapers.com' + match_pic[i * 2])
        pic_name.append(match_pic[i * 2][7:])
        pic_url.append("http://cdn.superbwallpapers.com/wallpapers" + match_pic[i * 2] + "-1920x1080.jpg")
        pic_url_number.append(page_index)   # remember which page (= which folder) this picture belongs to

# The base directory K://PIC must already exist; one sub-folder is created per listing page.
output = open('K://PIC/url.txt', 'w+')
for x in range(end_page):
    one_page(main_url[x], x)
    title = "K://PIC/" + str(x)
    if not os.path.isdir(title):
        os.mkdir(title)
output.write(str(pic_url))
output.close()

pic_number = 0
url_fail = []
lock = threading.Lock()

class myThread(threading.Thread):
    def run(self):
        global pic_number
        # Take the next index under the lock so no two threads download the same picture.
        lock.acquire()
        o1 = pic_number
        pic_number += 1
        lock.release()
        if o1 >= len(pic_url):      # the spawner may over-shoot; nothing left to do
            return
        target = "K://PIC/" + str(pic_url_number[o1]) + "/" + str(pic_name[o1]) + ".jpg"
        try:
            urllib.urlretrieve(pic_url[o1], target)
        except Exception:
            # Retry once, then record the failure (usually a socket timeout).
            try:
                urllib.urlretrieve(pic_url[o1], target)
            except Exception:
                url_fail.append(pic_url[o1])
                url_fail.append(pic_url_number[o1])
                print "-----socket timeout-----, recorded " + pic_url[o1]
                return
        print "Picture " + str(pic_name[o1]) + " downloaded"

def start_new_thread():
    myThread().start()

# Keep at most 6 download threads alive until every URL has been handed out.
while pic_number < len(pic_url):
    if threading.activeCount() < 7:
        start_new_thread()
    else:
        time.sleep(0.1)     # avoid a busy-wait loop while the workers run
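The shared pic_number counter and the busy-wait spawner loop are the fragile parts of the script above. A minimal alternative sketch is shown below: it feeds the same pic_url / pic_name / pic_url_number lists through a Queue-based worker pool, so no counter or spawner loop is needed. The function name download_worker and the pool size of 6 are illustrative choices, not part of the original script; the directory layout "K://PIC/<page>/" is assumed to exist as in the code above.

# -*- coding: utf-8 -*-
# Worker-pool sketch: each URL is enqueued exactly once, workers pull tasks
# until the queue is drained, and the main thread waits on queue.join().
# Assumes pic_url, pic_name and pic_url_number were filled by the crawler above.
import threading
import urllib
from Queue import Queue

task_queue = Queue()
url_fail = []
fail_lock = threading.Lock()

def download_worker():
    while True:
        url, name, page = task_queue.get()
        try:
            urllib.urlretrieve(url, "K://PIC/" + str(page) + "/" + str(name) + ".jpg")
            print "Picture " + str(name) + " downloaded"
        except Exception:
            with fail_lock:
                url_fail.append(url)
            print "Download failed, recorded: " + url
        finally:
            task_queue.task_done()

# Start a fixed pool of daemon workers (6, matching the original thread cap).
for _ in range(6):
    t = threading.Thread(target=download_worker)
    t.setDaemon(True)
    t.start()

for i in range(len(pic_url)):
    task_queue.put((pic_url[i], pic_name[i], pic_url_number[i]))

task_queue.join()   # block until every queued download has been processed
print "Failed URLs: " + str(url_fail)

Because the queue hands out each task exactly once, this version cannot run past the end of the list, and the daemon workers simply exit with the process once join() returns.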