Scraping jandan.net's "妹子图" (girl pics) with Python and saving each page locally (plain and multi-threaded versions)
I wanted to test how much multi-threading speeds up a Python crawler, so I wrote both a multi-threaded and a plain single-threaded version of the same program and timed them. To my surprise, the single-threaded version came out faster, and I don't understand why. There seem to be two possibilities: either my multi-threaded crawler is written wrong, or Python's support for multi-threading has limitations.
I'm parking the code here for now and hope to figure it out later.
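For what it's worth, the comparison is easier to reproduce with a small timing wrapper than by eyeballing the ctime() lines the scripts print. A minimal sketch (the timed() helper is my addition, not part of the original scripts; it assumes the main(page_start, page_end) entry point shown below):

#coding=utf-8
# Tiny timing helper: run a function and report wall-clock time.
import time

def timed(fn, *args):
    start = time.time()
    fn(*args)
    print 'elapsed: %.2f seconds' % (time.time() - start)

# usage: replace the main(905, 906) call at the bottom of either
# script with timed(main, 905, 906)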
Single-threaded version of the crawler:
#coding=utf-8
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import requests
import re
import os
import urllib
from time import ctime

class crawl_girls(object):
    """Download every image on one page of jandan.net/ooxx."""
    def __init__(self, url, pagenum):
        self.url = url
        self.pagenum = pagenum
        self.content = ""
        self.img_urls = []
        self.img_names = []

    def getContent(self):
        # Fetch the raw HTML of the page.
        try:
            self.content = requests.get(self.url).content
        except requests.exceptions.RequestException as e:
            print e

    def getImgNames(self):
        # The comment id serves as a unique file name for each image.
        img_names_patt = r'<li id="comment-(.+?)">'
        self.img_names = re.findall(img_names_patt, self.content)

    def getImgUrls(self):
        img_urls_patt = r'<p><img src="(.+?)"'
        self.img_urls = re.findall(img_urls_patt, self.content)

    def start_download(self):
        self.getContent()
        self.getImgNames()
        self.getImgUrls()

        # Save each page into its own subdirectory named after the page number.
        curr_path = os.getcwd().replace('\\', '/') + '/'
        file_dir = curr_path + str(self.pagenum) + '/'
        if not os.path.exists(file_dir):  # os.mkdir raises OSError on re-runs otherwise
            os.mkdir(file_dir)

        for name, img_url in zip(self.img_names, self.img_urls):
            ext = img_url[-4:]  # last four characters, e.g. '.jpg'
            file_path = file_dir + name + ext
            print 'starting at', ctime()
            urllib.urlretrieve(img_url, file_path)
            print 'finished at', ctime()

def main(page_start, page_end):
    page = r'http://jandan.net/ooxx/page-1#comments'
    for pagenum in range(page_start, page_end + 1):
        # works only because '1' occurs exactly once in the template URL
        url = page.replace('1', str(pagenum))
        print url
        girls = crawl_girls(url, pagenum)
        girls.start_download()

    print "all done"

if __name__ == '__main__':
    main(905, 906)
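One fragile spot in main(): building the page URL with replace('1', ...) only works because the digit 1 happens to occur exactly once in the template. Substituting the page number explicitly avoids that assumption (a small sketch, not from the original):

# Safer page-URL construction: format the page number in directly
# instead of replacing the literal '1' in the template.
page_tmpl = 'http://jandan.net/ooxx/page-%d#comments'

def page_url(pagenum):
    return page_tmpl % pagenum

# page_url(905) -> 'http://jandan.net/ooxx/page-905#comments'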
Multi-threaded version of the crawler:
#coding=utf-8
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import requests
import re
import os
from threading import Thread
import urllib
from time import ctime

class crawl_girls(object):
    """Download every image on one page of jandan.net/ooxx, one thread per image."""
    def __init__(self, url, pagenum):
        self.url = url
        self.pagenum = pagenum
        self.content = ""
        self.img_urls = []
        self.img_names = []

    def getContent(self):
        # Fetch the raw HTML of the page.
        try:
            self.content = requests.get(self.url).content
        except requests.exceptions.RequestException as e:
            print e

    def getImgNames(self):
        # The comment id serves as a unique file name for each image.
        img_names_patt = r'<li id="comment-(.+?)">'
        self.img_names = re.findall(img_names_patt, self.content)

    def getImgUrls(self):
        img_urls_patt = r'<p><img src="(.+?)"'
        self.img_urls = re.findall(img_urls_patt, self.content)

    def start_thread(self):
        self.getContent()
        self.getImgNames()
        self.getImgUrls()

        # Save each page into its own subdirectory named after the page number.
        curr_path = os.getcwd().replace('\\', '/') + '/'
        file_dir = curr_path + str(self.pagenum) + '/'
        if not os.path.exists(file_dir):  # os.mkdir raises OSError on re-runs otherwise
            os.mkdir(file_dir)

        for name, img_url in zip(self.img_names, self.img_urls):
            ext = img_url[-4:]  # last four characters, e.g. '.jpg'
            file_path = file_dir + name + ext
            print 'start download', file_path
            print 'starting at', ctime()
            thread = download_threads(img_url, file_path)
            thread.start()
            # join() right after start() blocks until this one download
            # finishes, so the downloads still run strictly one at a time
            thread.join()
            print 'finished at', ctime()

class download_threads(Thread):
    """Worker thread that downloads a single image."""
    def __init__(self, url, path):
        Thread.__init__(self)
        self.url = url
        self.path = path

    def run(self):
        urllib.urlretrieve(self.url, self.path)

def main(page_start, page_end):
    page = r'http://jandan.net/ooxx/page-1#comments'
    for pagenum in range(page_start, page_end + 1):
        # works only because '1' occurs exactly once in the template URL
        url = page.replace('1', str(pagenum))
        print url
        girls = crawl_girls(url, pagenum)
        girls.start_thread()

    print "all done"

if __name__ == '__main__':
    main(905, 906)
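Rereading the loop in start_thread(), the slowdown looks less mysterious than it first seemed: thread.join() is called immediately after thread.start(), so each download is waited on before the next thread is even created. Nothing ever runs in parallel, and the per-image thread creation is pure overhead on top of the single-threaded version. (The GIL is probably not the culprit here, since the downloads are network-bound and blocking I/O releases the GIL.) Below is a minimal sketch of a version that actually overlaps the downloads, starting every thread first and joining them only afterwards; download_all() is my illustration, reusing the download_threads class from above:

#coding=utf-8
# Sketch: genuinely concurrent downloads. Start all threads first,
# then join them all, instead of joining each one right after start().
from threading import Thread
import urllib

class download_threads(Thread):
    def __init__(self, url, path):
        Thread.__init__(self)
        self.url = url
        self.path = path

    def run(self):
        urllib.urlretrieve(self.url, self.path)

def download_all(name_urls, file_dir):
    # name_urls: list of (comment_id, image_url) pairs, as produced by
    # getImgNames()/getImgUrls() above
    threads = []
    for name, img_url in name_urls:
        t = download_threads(img_url, file_dir + name + img_url[-4:])
        t.start()          # launch right away, do NOT join yet
        threads.append(t)
    for t in threads:      # wait for the downloads only after all have started
        t.join()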