用 Python 爬取校花网视频
# Sequential crawler: walks the list pages of xiaohuar.com, follows every
# detail-page link, extracts the video URL and downloads the video to disk.
import hashlib
import os
import re
import time

import requests

# Seconds to wait for a connection / for each read before giving up, so a
# single stalled server cannot hang the whole crawl.
REQUEST_TIMEOUT = 10
# Size of the pieces a video is written in; avoids holding it all in memory.
CHUNK_SIZE = 64 * 1024


def get_index(url):
    """Fetch a list page.

    Returns the page HTML as text, or None when the request fails or the
    server does not answer with HTTP 200.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print('Request failed:%s (%s)' % (url, exc))
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_index(res):
    """Return the detail-page links found in the list-page HTML ``res``.

    ``res`` may be None or empty (a failed ``get_index``); an empty list is
    returned in that case instead of passing None to the regex engine.
    """
    if not res:
        return []
    return re.findall(r'class="items".*?href="(.*?)"', res, re.S)


def get_detail(urls):
    """Visit every detail page in ``urls`` and download the video it embeds.

    Site-relative links are turned into absolute ones. Pages that cannot be
    fetched, or that contain no media element, are skipped.
    """
    for url in urls:
        if not url.startswith('http'):
            url = 'http://www.xiaohuar.com%s' % url
        try:
            r1 = requests.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print('Request failed:%s (%s)' % (url, exc))
            continue
        if r1.status_code != 200:
            continue
        url_list = re.findall(r'id="media".*?src="(.*?)"', r1.text, re.S)
        if url_list:
            # Only the first match is used as the video source.
            save(url_list[0])


def save(url):
    """Download the video at ``url`` into the download directory.

    The file name is the MD5 of the URL plus the current time, so repeated
    downloads of the same URL never overwrite each other. Failures are
    reported on stdout and the partially written file is removed.
    """
    print('Download:%s' % url)
    m = hashlib.md5()
    m.update(url.encode('utf-8'))
    m.update(str(time.time()).encode('utf-8'))
    filename = '%s.mp4' % m.hexdigest()
    file_path = r'D:\\爬虫视频\%s' % filename
    try:
        # stream=True keeps the body on the socket until it is iterated, so
        # the video is written chunk by chunk instead of buffered in memory.
        with requests.get(url, stream=True, timeout=REQUEST_TIMEOUT) as r2:
            if r2.status_code != 200:
                return
            directory = os.path.dirname(file_path)
            if directory:
                os.makedirs(directory, exist_ok=True)
            with open(file_path, 'wb') as f:
                for chunk in r2.iter_content(chunk_size=CHUNK_SIZE):
                    f.write(chunk)
    except (requests.RequestException, OSError) as exc:
        print('Download failed:%s (%s)' % (url, exc))
        try:
            # Do not leave a truncated video behind.
            os.remove(file_path)
        except OSError:
            pass  # nothing had been written yet, or it cannot be removed


def main():
    """Crawl list pages 0-4 one after another."""
    for i in range(5):
        res1 = get_index('http://www.xiaohuar.com/list-3-%s.html' % i)
        res2 = parse_index(res1)
        get_detail(res2)


if __name__ == '__main__':
    main()
基于上面的代码,改用多线程(线程池)爬取视频,以提升下载速度
# Asynchronous, multi-threaded version: a thread pool overlaps the network
# waits of the individual requests to speed up the downloads.
import hashlib
import os
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests

# Seconds to wait for a connection / for each read before giving up, so a
# single stalled server cannot block a worker thread forever.
REQUEST_TIMEOUT = 10
# Size of the pieces a video is written in; with 30 workers, buffering whole
# videos in memory would add up quickly.
CHUNK_SIZE = 64 * 1024

# Shared pool used for both the list-page and the detail-page jobs.
p = ThreadPoolExecutor(30)


def get_index(url):
    """Fetch a list page.

    Returns the page HTML as text, or None when the request fails or the
    server does not answer with HTTP 200.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print('Request failed:%s (%s)' % (url, exc))
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_index(res):
    """Parse a finished ``get_index`` future and queue its detail pages.

    ``res`` is the Future returned by ``p.submit(get_index, ...)``. One
    ``get_detail`` job is submitted to the pool per link found. Nothing is
    queued when the list page could not be fetched.
    """
    html = res.result()
    if not html:
        return
    for url in re.findall(r'class="items".*?href="(.*?)"', html, re.S):
        p.submit(get_detail, url)


def get_detail(urls):
    """Visit detail page(s) and download the video each one embeds.

    ``urls`` may be a single URL string (as submitted by ``parse_index``) or
    an iterable of URL strings. A bare string must not be iterated directly:
    that would walk over its characters and request nonsense URLs.
    """
    if isinstance(urls, str):
        urls = [urls]
    for url in urls:
        if not url.startswith('http'):
            url = 'http://www.xiaohuar.com%s' % url
        try:
            r1 = requests.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print('Request failed:%s (%s)' % (url, exc))
            continue
        if r1.status_code != 200:
            continue
        url_list = re.findall(r'id="media".*?src="(.*?)"', r1.text, re.S)
        if url_list:
            # Only the first match is used as the video source.
            save(url_list[0])


def save(url):
    """Download the video at ``url`` into the download directory.

    The file name is the MD5 of the URL plus the current time, so repeated
    downloads of the same URL never overwrite each other. Failures are
    reported on stdout (an exception raised inside a pool worker would
    otherwise vanish silently) and the partially written file is removed.
    """
    print('Download:%s' % url)
    m = hashlib.md5()
    m.update(url.encode('utf-8'))
    m.update(str(time.time()).encode('utf-8'))
    filename = '%s.mp4' % m.hexdigest()
    file_path = r'D:\\爬虫视频\%s' % filename
    try:
        # stream=True keeps the body on the socket until it is iterated, so
        # the video is written chunk by chunk instead of buffered in memory.
        with requests.get(url, stream=True, timeout=REQUEST_TIMEOUT) as r2:
            if r2.status_code != 200:
                return
            directory = os.path.dirname(file_path)
            if directory:
                os.makedirs(directory, exist_ok=True)
            with open(file_path, 'wb') as f:
                for chunk in r2.iter_content(chunk_size=CHUNK_SIZE):
                    f.write(chunk)
    except (requests.RequestException, OSError) as exc:
        print('Download failed:%s (%s)' % (url, exc))
        try:
            # Do not leave a truncated video behind.
            os.remove(file_path)
        except OSError:
            pass  # nothing had been written yet, or it cannot be removed


def main():
    """Fetch list pages 0-4 concurrently and download every video found.

    The index futures are parsed here, in the main thread, as they complete.
    Parsing them in done-callbacks would let ``main`` return first, and the
    callbacks would then try to submit jobs while the interpreter is already
    shutting down, which recent Python versions reject with RuntimeError.
    """
    index_futures = [
        p.submit(get_index, 'http://www.xiaohuar.com/list-3-%s.html' % i)
        for i in range(5)
    ]
    for future in as_completed(index_futures):
        parse_index(future)
    # Every detail job is queued by now; block until all downloads finish.
    p.shutdown(wait=True)


if __name__ == '__main__':
    main()