Python-爬取校花网视频(单线程和多线程版本)

一、参考文章

    python爬虫爬取校花网视频,单线程爬取

    爬虫----爬取校花网视频,包含多线程版本

    上述两篇文章都是对校花网视频的爬取。由于时间相隔很久,校花网上的一些视频已经不存在了,因此上述文章中的代码在运行时会出现一些异常。本篇文章主要是对上述文章中的代码进行了优化和异常处理,在此做笔记记录,方便以后查阅,修改如下:

1、添加的异常处理如下红色部分代码

二、单线程版本

 1 #-*- coding=utf-8 -*-
 2 import re
 3 import requests
 4 import hashlib
 5 import time
 6 import os
 7 
 8 header = {
 9     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36',
10     'Referer':'http://www.xiaohuar.com'
11     }
12 
def get_index(url):
    """Download a list page and return its HTML.

    Args:
        url: Address of the list page to fetch.

    Returns:
        The response body as text when the server answers 200; None when
        the request fails or any other status code comes back.
    """
    try:
        # Without a timeout a stalled connection would block the crawl
        # forever; network errors are reported the same way save() does.
        response = requests.get(url, headers=header, timeout=10)
    except requests.exceptions.RequestException as e:
        print(repr(e))
        return None

    if response.status_code == 200:
        return response.text
    return None
17 
def parse_index(res):
    """Extract the detail-page links from a list page.

    Args:
        res: HTML text of a list page, or None/empty when the page could
            not be downloaded (get_index returns None in that case).

    Returns:
        A list with the first href found after each ``class="items"``
        marker, in document order; an empty list when there is no HTML.
    """
    if not res:
        # re.findall raises TypeError on None, so a failed page download
        # must yield "no links" instead of crashing the crawl.
        return []
    # re.S lets '.' match newlines, so the marker and its href may sit on
    # different lines of the HTML.
    return re.findall(r'class="items".*?href="(.*?)"', res, re.S)
21 
22 
def get_detail(urls):
    """Visit each detail page and download the video it embeds.

    Args:
        urls: Iterable of detail-page links; links that do not start with
            'http' are treated as paths on http://www.xiaohuar.com.

    A page that cannot be fetched, does not answer 200, or contains no
    ``id="media"`` element with a ``src`` is skipped; the remaining pages
    are still processed.
    """
    for url in urls:
        if not url.startswith('http'):
            url = 'http://www.xiaohuar.com%s' % url

        try:
            result = requests.get(url, headers=header, timeout=10)
        except requests.exceptions.RequestException as e:
            # One unreachable page must not abort the rest of the batch.
            print(repr(e))
            continue

        if result.status_code != 200:
            continue

        mp4_url_list = re.findall(r'id="media".*?src="(.*?)"', result.text, re.S)
        if mp4_url_list:
            # Only the first match is the player's video source.
            save(mp4_url_list[0])
33 
# Directory (under the current working directory) that save() writes the
# downloaded videos into.
path = os.getcwd() + '/video/'
# open(..., 'wb') does not create missing directories, so make sure the
# target exists before any download is written.
os.makedirs(path, exist_ok=True)
35 
def save(url):
    """Download one video and store it under ``path``.

    The file name is the MD5 of the URL plus the current time, so every
    call produces a new ``<hash>.mp4`` file.

    Args:
        url: Address of the video file.

    Network errors are printed and the function returns without raising;
    a non-200 answer is reported as a missing video.
    """
    try:
        # stream=True defers the body download so the video can be written
        # chunk by chunk instead of being held in memory as a whole.
        video = requests.get(url, headers=header, stream=True, timeout=30)
    except requests.exceptions.RequestException as e:
        print(repr(e))
        return

    if video.status_code != 200:
        video.close()
        print(f'视频不存在了:{url}')
        return

    m = hashlib.md5()
    m.update(url.encode('utf-8'))
    m.update(str(time.time()).encode('utf-8'))
    filename = r'%s.mp4' % m.hexdigest()
    filepath = path + filename
    print(filepath)
    try:
        with open(filepath, 'wb') as f:
            for chunk in video.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    except requests.exceptions.RequestException as e:
        # The connection broke mid-download: report it and drop the
        # truncated file so no unplayable video is left behind.
        print(repr(e))
        if os.path.exists(filepath):
            os.remove(filepath)
    finally:
        video.close()
54 
def main():
    """Crawl list pages 0-4 one after another and download their videos."""
    for i in range(5):
        # Fetch the HTML of list page i.
        res1 = get_index('http://www.xiaohuar.com/list-3-%s.html' % i)
        if res1 is None:
            # get_index returns None for a failed page; skip it rather
            # than handing None to the regex in parse_index.
            continue
        # Collect every detail-page link on that list page.
        res2 = parse_index(res1)
        # Download the video behind each link.
        get_detail(res2)


if __name__ == '__main__':
    main()

三、多线程版本

 1 #-*- coding=utf-8 -*-
 2 # 异步,多线程优化下载速度
 3 
 4 import requests
 5 import re
 6 import os
 7 import hashlib,time
 8 from concurrent.futures import ThreadPoolExecutor
 9 
# Shared thread pool (at most 30 worker threads) that the functions below
# submit their work to.
p = ThreadPoolExecutor(max_workers=30)
11  
def get_index(url):
    """Download a list page and return its HTML.

    Args:
        url: Address of the list page to fetch.

    Returns:
        The response body as text when the server answers 200; None when
        the request fails or any other status code comes back.
    """
    try:
        # A timeout keeps one stalled connection from tying up a pool
        # worker forever; errors are reported the same way save() does.
        response = requests.get(url, timeout=10)
    except requests.exceptions.RequestException as e:
        print(repr(e))
        return None

    if response.status_code == 200:
        return response.text
    return None
16 
def parse_index(res):
    """Turn a finished list-page download into detail-page download tasks.

    Args:
        res: The Future of a get_index call; its result is the page HTML,
            or None when the page could not be downloaded.

    Every detail link found on the page is submitted to the pool ``p`` as
    its own get_detail task. Nothing is submitted when there is no HTML.
    """
    html = res.result()
    if not html:
        # get_index yields None for a failed page; re.findall would raise
        # TypeError on it.
        return

    # re.S lets '.' match newlines, so the marker and its href may sit on
    # different lines of the HTML.
    urls = re.findall(r'class="items".*?href="(.*?)"', html, re.S)

    # One task per link: a single task per page would leave most of the
    # pool's workers idle, since only five list pages are crawled.
    for url in urls:
        p.submit(get_detail, [url])
22 
def get_detail(urls):
    """Visit each detail page and download the video it embeds.

    Args:
        urls: Iterable of detail-page links; links that do not start with
            'http' are treated as paths on http://www.xiaohuar.com.

    A page that cannot be fetched, does not answer 200, or contains no
    ``id="media"`` element with a ``src`` is skipped; the remaining pages
    are still processed.
    """
    for url in urls:
        if not url.startswith('http'):
            url = 'http://www.xiaohuar.com%s' % url

        try:
            r1 = requests.get(url, timeout=10)
        except requests.exceptions.RequestException as e:
            # One unreachable page must not abort the rest of the batch.
            print(repr(e))
            continue

        if r1.status_code != 200:
            continue

        url_list = re.findall(r'id="media".*?src="(.*?)"', r1.text, re.S)
        if url_list:
            # Only the first match is the player's video source.
            save(url_list[0])
33 
# Directory (under the current working directory) that save() writes the
# downloaded videos into.
path = os.getcwd() + '/video_mutil/'
# exist_ok=True creates the directory only when it is missing, without the
# gap between a separate existence check and the creation call.
os.makedirs(path, exist_ok=True)
37 
def save(url):
    """Download one video and store it under ``path``.

    The file name is the MD5 of the URL plus the current time, so every
    call produces a new ``<hash>.mp4`` file.

    Args:
        url: Address of the video file.

    Network errors are printed and the function returns without raising;
    a non-200 answer is reported as a missing video.
    """
    try:
        # stream=True defers the body download so the video is written
        # chunk by chunk; with many downloads running in parallel, holding
        # each complete video in memory would add up quickly.
        r2 = requests.get(url, stream=True, timeout=30)
    except requests.exceptions.RequestException as e:
        print(repr(e))
        return

    if r2.status_code != 200:
        r2.close()
        print(f'视频不存在了:{url}')
        return

    m = hashlib.md5()
    m.update(url.encode('utf-8'))
    m.update(str(time.time()).encode('utf-8'))
    filename = '%s.mp4' % m.hexdigest()
    file_path = path + filename
    try:
        with open(file_path, 'wb') as f:
            for chunk in r2.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    except requests.exceptions.RequestException as e:
        # The connection broke mid-download: report it and drop the
        # truncated file so no unplayable video is left behind.
        print(repr(e))
        if os.path.exists(file_path):
            os.remove(file_path)
    else:
        print('视频下载完成:%s' % file_path)
    finally:
        r2.close()
56 
def main():
    """Fetch list pages 0-4 in parallel and queue their video downloads.

    The list pages are downloaded concurrently on the pool ``p``. Each
    result is then handed to parse_index from the calling thread, which
    submits the download tasks. The interpreter does not exit until all
    tasks submitted to the pool have finished.
    """
    futures = [
        p.submit(get_index, 'http://www.xiaohuar.com/list-3-%s.html' % i)
        for i in range(5)
    ]

    # parse_index is deliberately NOT attached with add_done_callback:
    # main() would return at once, and a callback firing after interpreter
    # shutdown has begun cannot submit new work to the pool ("cannot
    # schedule new futures after interpreter shutdown"), so no video would
    # ever be downloaded. Calling it here blocks on each page's result and
    # guarantees every task is submitted before main() returns.
    for future in futures:
        try:
            parse_index(future)
        except Exception as e:
            # A failure on one list page must not stop the others.
            print(repr(e))


if __name__ == '__main__':
    main()

四、资源下载

    资源下载地址:Python爬取校花网视频-单线程和多线程版本

 

转载声明:本站文章无特别说明,皆为原创,版权所有,转载请注明:朝十晚八

posted @ 2018-11-25 20:30  朝十晚八  阅读(1664)  评论(0)  编辑  收藏  举报

返回顶部