多线程爬虫(单机多线程,并非真正的分布式)
闲来无事尝试一下多线程爬虫,
对于单一html页面的多线程解析
缺点是无法对抓取的图片进行分类
本次脚本的特点:图片与影片名称一一对应
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Multithreaded image scraper for a single HTML page.

Worker threads pull page-section numbers from a queue, parse the
matching ``data-anchor`` section of the annual movie page, and download
each poster image, naming the saved file after the movie's title lines
so every image stays paired with its content.

Ported from Python 2 (``Queue`` module, ``except Exception, e`` syntax)
to Python 3; removed unused ``re`` and ``Lock`` imports and the
deprecated ``setDaemon`` call.
"""
import queue
import time
from threading import Thread

import requests
from bs4 import BeautifulSoup

URL = 'http://movie.xxxxx.com/annual2015/#1'
jobs = 10  # number of page sections (data-anchor values 1..jobs) to process
q = queue.Queue(maxsize=10)


def get_html(url):
    """Fetch *url* with a desktop User-Agent and return the raw body bytes."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1581.2 Safari/537.36'
    }
    return requests.get(url, headers=headers).content


def parser_html(page):
    """Parse section *page* of the module-level URL.

    Returns a pair of parallel lists:
      - image source URLs (the ``data-src`` attribute of each cover image)
      - for each image, the list of ``<p>`` strings (title lines) from
        its ``cover-inner`` block.

    Best-effort: on any network/parse failure a single placeholder pair
    ``(None, ['none', 'none'])`` is returned so the worker can skip it
    instead of crashing the thread.
    """
    img = []
    name = []
    soup = BeautifulSoup(get_html(URL), 'html.parser')
    try:
        a_tags = (soup.find('div', id='main')
                      .find('div', attrs={'data-anchor': '%d' % page})
                      .find('div', class_='content wrapper')
                      .find('div', class_='subjects-wrapper')
                      .find_all('a'))
        for a_tag in a_tags:
            img.append(a_tag.find('div', class_='img').find('img')['data-src'])
            name.append([p.string
                         for p in a_tag.find('div', class_='cover-inner').find_all('p')])
    except Exception:
        # A missing/malformed section yields one placeholder entry; the
        # downloader filters out the None URL.
        img.append(None)
        name.append(['none', 'none'])
    return img, name


def download_working(q):
    """Worker loop: take a page number from *q* and download its images.

    Runs forever as a daemon thread. ``q.task_done()`` marks each page
    finished so the main thread's ``q.join()`` can return.
    """
    while True:
        page = q.get()
        img, name = parser_html(page)
        for src, title in zip(img, name):
            if src:
                # File name pairs the two title lines so image and
                # content stay matched on disk.
                with open('G:\\123\\%s_%s.jpg' % (title[0], title[1]), 'wb') as f:
                    f.write(requests.get(src, stream=True).content)
        time.sleep(1)  # be polite to the server between pages
        q.task_done()


if __name__ == '__main__':
    # Enqueue every page section, then let two daemon workers drain it.
    for i in range(1, jobs + 1):
        q.put(i)
    for _ in range(2):
        t_work = Thread(target=download_working, args=(q,))
        t_work.daemon = True  # don't block interpreter exit on hung workers
        t_work.start()
    # Block until every queued page has been marked task_done().
    q.join()