```python
import os
import threading
import urllib.request

import requests
from bs4 import BeautifulSoup

lock1 = threading.RLock()  # the shared resources PAGE_URL and FACE_URL must be protected by locks
lock2 = threading.RLock()
PAGE_URL = []
FACE_URL = []
for i in range(1, 1476):
    PAGE_URL.append('http://www.doutula.com/photo/list/?page=' + str(i))  # URL pattern observed on the site

def get_faceurl():
    while True:
        lock1.acquire()
        if len(PAGE_URL) == 0:  # termination condition for the worker thread
            lock1.release()
            break
        url = PAGE_URL.pop()  # release the lock right after modifying the shared resource
        lock1.release()
        respon = requests.get(url)
        soup = BeautifulSoup(respon.text, 'html.parser')
        lock2.acquire()  # acquire the lock before modifying the shared resource
        for tag in soup.find_all('img', attrs={'class': 'img-responsive lazy image_dta'}):
            FACE_URL.append(tag['data-original'])
        lock2.release()  # release the lock

def download_face():
    while True:
        lock2.acquire()
        if len(FACE_URL) == 0:
            lock2.release()
            break
        url = FACE_URL.pop()
        lock2.release()
        # os.path.join handles the different path separators on Linux and Windows
        path_file = os.path.join('download', url.split('/')[-1])
        urllib.request.urlretrieve(url, path_file)

def main():
    os.makedirs('download', exist_ok=True)  # make sure the target directory exists
    producers = [threading.Thread(target=get_faceurl) for _ in range(2)]
    for td in producers:
        td.start()
    for td in producers:  # join every producer thread, not just the last one started
        td.join()
    consumers = [threading.Thread(target=download_face) for _ in range(4)]
    for td in consumers:
        td.start()
    for td in consumers:
        td.join()

main()
```
Two threads fetch the image URLs on each page and store them in a global list with the list's append method; four threads then take URLs off that shared list with the pop method and download them.
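For comparison, the same producer/consumer split can be written without manual locking by using the standard library's thread-safe `queue.Queue`. The following is a minimal sketch under the same assumptions as the code above (same page range, same CSS class, `download` directory created up front); it is an alternative formulation, not the original implementation:

```python
import os
import queue
import threading
import urllib.request

import requests
from bs4 import BeautifulSoup

page_queue = queue.Queue()  # pages still to scrape
face_queue = queue.Queue()  # image URLs still to download

for i in range(1, 1476):
    page_queue.put('http://www.doutula.com/photo/list/?page=' + str(i))

def produce():
    # Drain the page queue; each page yields zero or more image URLs.
    while True:
        try:
            url = page_queue.get_nowait()
        except queue.Empty:
            break
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        for tag in soup.find_all('img', attrs={'class': 'img-responsive lazy image_dta'}):
            face_queue.put(tag['data-original'])

def consume():
    # Drain the image queue and save each file under download/.
    while True:
        try:
            url = face_queue.get_nowait()
        except queue.Empty:
            break
        urllib.request.urlretrieve(url, os.path.join('download', url.split('/')[-1]))

os.makedirs('download', exist_ok=True)

producers = [threading.Thread(target=produce) for _ in range(2)]
for t in producers:
    t.start()
for t in producers:
    t.join()  # collect every image URL before downloading starts

consumers = [threading.Thread(target=consume) for _ in range(4)]
for t in consumers:
    t.start()
for t in consumers:
    t.join()
```

The queue does the locking internally, so the acquire/release bookkeeping around PAGE_URL and FACE_URL disappears, and joining all producers before starting the consumers preserves the original guarantee that no download thread sees an empty list and exits early.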