Scraping "煎蛋妹子图" (jandan.net girl pics) with Python and saving pages locally (plain and multithreaded versions)

  I wanted to test how much efficiency a multithreaded Python crawler actually gains, so I wrote the same crawler in a plain version and a multithreaded version and timed both. The single-threaded version turned out to be faster than the multithreaded one, which I don't understand. There seem to be two possibilities: either my multithreaded crawler has a bug, or Python's support for multithreading (the GIL) is a limitation.

  I'm parking the code here for now and hope to sort it out later.
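
  A minimal sketch can separate the two possibilities without touching the network: fake_download below just sleeps for a second to stand in for one blocking image download, and the three launch patterns are timed side by side (everything here is illustrative, not part of the crawler).

#coding=utf-8
from threading import Thread
from time import time, sleep

def fake_download():
    sleep(1)   # stand-in for one 1-second blocking download

def serial(n):
    # plain version: one download after another
    for _ in range(n):
        fake_download()

def join_each(n):
    # start a thread, then join it immediately -- still one at a time
    for _ in range(n):
        t = Thread(target=fake_download)
        t.start()
        t.join()

def join_all(n):
    # start all threads first, join afterwards -- the n waits overlap
    threads = [Thread(target=fake_download) for _ in range(n)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

for func in (serial, join_each, join_all):
    begin = time()
    func(5)
    print func.__name__, 'took %.1f s' % (time() - begin)
# expected: serial ~5 s, join_each ~5 s, join_all ~1 s. For blocking I/O
# (sleep, socket reads, urlretrieve) CPython releases the GIL, so threads
# do overlap; the GIL only serializes CPU-bound Python code.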

  The single-threaded crawler:

  

#coding=utf-8
import sys
reload(sys)
sys.setdefaultencoding('utf-8')   # Python 2 hack: avoid encode errors on utf-8 pages

import requests
import re
import os
import urllib
from time import ctime

class crawl_girls(object):
    """Downloads every image on one jandan.net/ooxx page into a folder named after the page number."""
    def __init__(self, url, pagenum):
        self.url = url
        self.pagenum = pagenum
        self.content = ""
        self.img_urls = []
        self.img_names = []

    def getContent(self):
        # fetch the page HTML; on a network error just report it and leave content empty
        try:
            self.content = requests.get(self.url).content
        except requests.exceptions.RequestException as e:
            print e

    def getImgNames(self):
        # each image post sits in <li id="comment-NNNNNN">; the id doubles as a file name
        img_names_patt = r'<li id="comment-(.+?)">'
        self.img_names = re.findall(img_names_patt, self.content)

    def getImgUrls(self):
        img_urls_patt = r'<p><img src="(.+?)"'
        self.img_urls = re.findall(img_urls_patt, self.content)

    def start_download(self):
        self.getContent()
        self.getImgNames()
        self.getImgUrls()

        # save into ./<pagenum>/, creating the directory only if it is missing
        file_dir = os.getcwd().replace('\\', '/') + '/' + str(self.pagenum) + '/'
        if not os.path.exists(file_dir):
            os.mkdir(file_dir)

        for name, url in zip(self.img_names, self.img_urls):
            ext = os.path.splitext(url)[1]       # real extension (.jpg, .gif, ...)
            file_path = file_dir + name + ext
            print 'starting at', ctime()
            urllib.urlretrieve(url, file_path)   # blocking download, one image at a time
            print 'finished at', ctime()


def main(page_start, page_end):
    for pagenum in range(page_start, page_end + 1):
        # build the URL directly; str.replace('1', ...) on a template would also
        # rewrite any other '1' that happens to appear in it
        url = 'http://jandan.net/ooxx/page-%d#comments' % pagenum
        print url
        girls = crawl_girls(url, pagenum)
        girls.start_download()

    print 'all done'

if __name__ == '__main__':
    main(905, 906)
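
  To make the two regular expressions concrete, here is what they pull out of a made-up page fragment shaped the way the patterns expect (the live markup and image hosts may differ):

#coding=utf-8
import re

sample = '''
<li id="comment-123456">
<p><img src="http://example.com/abc.jpg" /></p>
</li>
<li id="comment-123457">
<p><img src="http://example.com/def.gif" /></p>
</li>
'''

names = re.findall(r'<li id="comment-(.+?)">', sample)
urls  = re.findall(r'<p><img src="(.+?)"', sample)
print zip(names, urls)
# [('123456', 'http://example.com/abc.jpg'),
#  ('123457', 'http://example.com/def.gif')]
# note: zip pairs by position, so a <li> without an image would silently
# shift every following (name, url) pair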

  The multithreaded crawler:

  

#coding=utf-8
import sys
reload(sys)
sys.setdefaultencoding('utf-8')   # Python 2 hack: avoid encode errors on utf-8 pages

import requests
import re
import os
from threading import Thread
import urllib
from time import ctime

class crawl_girls(object):
    """Same page crawler as above, but every image download runs in its own thread."""
    def __init__(self, url, pagenum):
        self.url = url
        self.pagenum = pagenum
        self.content = ""
        self.img_urls = []
        self.img_names = []

    def getContent(self):
        try:
            self.content = requests.get(self.url).content
        except requests.exceptions.RequestException as e:
            print e

    def getImgNames(self):
        img_names_patt = r'<li id="comment-(.+?)">'
        self.img_names = re.findall(img_names_patt, self.content)

    def getImgUrls(self):
        img_urls_patt = r'<p><img src="(.+?)"'
        self.img_urls = re.findall(img_urls_patt, self.content)

    def start_thread(self):
        self.getContent()
        self.getImgNames()
        self.getImgUrls()

        file_dir = os.getcwd().replace('\\', '/') + '/' + str(self.pagenum) + '/'
        if not os.path.exists(file_dir):
            os.mkdir(file_dir)

        # Start every download thread first, then join them all afterwards.
        # Calling thread.join() right after thread.start() would wait for each
        # download before launching the next one -- that serializes everything
        # and adds thread overhead on top, which is exactly how a
        # "multithreaded" crawler can measure slower than the plain version.
        print 'starting at', ctime()
        threads = []
        for name, url in zip(self.img_names, self.img_urls):
            ext = os.path.splitext(url)[1]
            file_path = file_dir + name + ext
            print 'start download', file_path
            thread = download_threads(url, file_path)
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()
        print 'finished at', ctime()

class download_threads(Thread):
    """One thread per image; run() just blocks inside urlretrieve."""
    def __init__(self, url, path):
        Thread.__init__(self)
        self.url = url
        self.path = path

    def run(self):
        urllib.urlretrieve(self.url, self.path)

def main(page_start, page_end):
    for pagenum in range(page_start, page_end + 1):
        url = 'http://jandan.net/ooxx/page-%d#comments' % pagenum
        print url
        girls = crawl_girls(url, pagenum)
        girls.start_thread()

    print 'all done'

if __name__ == '__main__':
    main(905, 906)
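
  The work could also be split the other way: one thread per page rather than per image. A rough sketch, assuming it lives in the same file as the multithreaded crawl_girls above:

#coding=utf-8
from threading import Thread

def crawl_page(pagenum):
    url = 'http://jandan.net/ooxx/page-%d#comments' % pagenum
    crawl_girls(url, pagenum).start_thread()   # class from the listing above

# one thread per page: page fetches and their downloads overlap across pages
threads = [Thread(target=crawl_page, args=(p,)) for p in range(905, 907)]
for t in threads:
    t.start()
for t in threads:
    t.join()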

 

  
