Web scraping
Scraping videos from xiaohuar.com
A serial version: fetch the list pages, parse out the detail-page URLs, then download each video.

```python
import requests
import re
import time, os
import hashlib

DOWNLOAD_PATH = r'F:\DOWNLOAD'

def get_page(url):
    print('GET %s' % url)
    try:
        response = requests.get(url)
        print('response:', response)  # e.g. <Response [200]>, a Response object
        if response.status_code == 200:
            return response.content  # raw bytes of the page
    except Exception:
        pass  # on any request error, fall through and return None

def parse_index(res):
    # pull the detail-page links out of the list page, e.g.
    # ['http://www.xiaohuar.com/p-3-97.html', ...]
    if res is None:
        return
    obj = re.compile('class="items.*?<a href="(.*?)"', re.S)
    detail_urls = obj.findall(res.decode('gbk'))
    for detail_url in detail_urls:
        if not detail_url.startswith('http'):
            detail_url = 'http://www.xiaohuar.com' + detail_url
        yield detail_url  # URL of each video's detail page

def parse_detail(res):
    # each detail page embeds a single video, so the regex matches at most one .mp4 link
    obj = re.compile('id="media".*?src="(.*?)"', re.S)
    res = obj.findall(res.decode('gbk'))
    if len(res) > 0:
        movie_url = res[0]
        return movie_url  # the video's .mp4 link

def save(movie_url):
    response = requests.get(movie_url, stream=False)
    if response.status_code == 200:
        # hash the URL plus a timestamp into a unique file name
        m = hashlib.md5()
        m.update(str(time.time()).encode('utf-8'))
        m.update(movie_url.encode('utf-8'))
        filepath = os.path.join(DOWNLOAD_PATH, '%s.mp4' % m.hexdigest())
        with open(filepath, 'wb') as f:
            f.write(response.content)
            f.flush()
        print('%s downloaded successfully' % filepath)

def main():
    os.makedirs(DOWNLOAD_PATH, exist_ok=True)  # make sure the download directory exists
    index_url = 'http://www.xiaohuar.com/list-3-{0}.html'
    for i in range(5):
        print('*' * 50, i)
        # fetch the list page
        index_page = get_page(index_url.format(i))
        # parse it into the detail-page URLs (a generator)
        detail_urls = parse_index(index_page)
        # crawl each video's detail page
        for detail_url in detail_urls:
            detail_page = get_page(detail_url)
            if detail_page is None:
                continue
            # extract the video's .mp4 URL
            movie_url = parse_detail(detail_page)
            print('movie_url==========', movie_url)
            if movie_url:
                # download and save the video
                save(movie_url)

if __name__ == '__main__':
    main()
```
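The extraction step is just a non-greedy regex over the raw page. A minimal sketch of what `parse_index`'s pattern returns, run against a made-up HTML fragment (the fragment is an assumption, imitating the list page's `class="items"` markup):

```python
import re

# Made-up fragment mimicking the list page (assumption: each entry sits in a
# class="items" container with a relative or absolute <a href>).
html = '''
<div class="items"><a href="/p-3-97.html">video 1</a></div>
<div class="items"><a href="http://www.xiaohuar.com/p-3-98.html">video 2</a></div>
'''

obj = re.compile('class="items.*?<a href="(.*?)"', re.S)
print(obj.findall(html))
# ['/p-3-97.html', 'http://www.xiaohuar.com/p-3-98.html']
```

Relative links like the first one are the reason for the `startswith('http')` check before yielding.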
The same crawler, rewritten as a concurrent version: every stage runs in a thread pool, with the next stage chained on as a callback.

```python
from concurrent.futures import ThreadPoolExecutor
from threading import current_thread
import requests
import re
import time
import os
import hashlib

p = ThreadPoolExecutor(50)

def get_page(url):
    print('%s GET %s' % (current_thread().name, url))
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.content
    except Exception as e:
        print(e)

def parse_index(res):
    print('%s parse index' % current_thread().name)
    res = res.result()  # callbacks receive a Future; unwrap the page bytes
    if res is None:
        return
    obj = re.compile('class="items.*?<a href="(.*?)"', re.S)
    detail_urls = obj.findall(res.decode('gbk'))
    for detail_url in detail_urls:
        if not detail_url.startswith('http'):
            detail_url = 'http://www.xiaohuar.com' + detail_url
        # fetch each detail page in the pool, chaining the next stage as a callback
        p.submit(get_page, detail_url).add_done_callback(parse_detail)

def parse_detail(res):
    print('%s parse detail' % current_thread().name)
    res = res.result()
    if res is None:
        return
    obj = re.compile('id="media".*?src="(.*?)"', re.S)
    res = obj.findall(res.decode('gbk'))
    if len(res) > 0:
        movie_url = res[0]
        print('MOVIE_URL: ', movie_url)
        with open('db.txt', 'a') as f:
            f.write('%s\n' % movie_url)
        # save(movie_url)  # the synchronous alternative
        p.submit(save, movie_url)
        print('%s download task submitted' % movie_url)

def save(movie_url):
    print('%s SAVE: %s' % (current_thread().name, movie_url))
    try:
        response = requests.get(movie_url, stream=False)
        if response.status_code == 200:
            m = hashlib.md5()
            m.update(('%s%s.mp4' % (movie_url, time.time())).encode('utf-8'))
            filename = m.hexdigest()
            with open(r'./movies/%s.mp4' % filename, 'wb') as f:
                f.write(response.content)
                f.flush()
    except Exception as e:
        print(e)

def main():
    os.makedirs('./movies', exist_ok=True)  # make sure the output directory exists
    index_url = 'http://www.xiaohuar.com/list-3-{0}.html'
    for i in range(5):
        p.submit(get_page, index_url.format(i)).add_done_callback(parse_index)

if __name__ == '__main__':
    main()
```
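The moving part here is the callback chain: `add_done_callback` hands each stage a `Future`, not the previous stage's return value, so every callback must call `.result()` first. A stripped-down sketch of the pattern (`fetch` and `handle` are placeholder names):

```python
from concurrent.futures import ThreadPoolExecutor

pool = ThreadPoolExecutor(4)

def fetch(n):
    return n * 2  # stand-in for get_page

def handle(future):
    value = future.result()  # unwrap the Future to get fetch's return value
    print('got', value)

for i in range(3):
    pool.submit(fetch, i).add_done_callback(handle)

pool.shutdown(wait=True)  # block until all submitted tasks have finished
```

One caveat of this design: an exception raised inside a callback is logged and ignored by the executor, which is why the crawler prints errors itself rather than letting them propagate.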
Accessing a website with the requests module
```python
import requests

url = 'https://www.baidu.com/s'
response = requests.get(
    url,
    # params URL-encodes the query string. A GET request carries its data in
    # the URL, so sending https://www.baidu.com/s?wd=美女 requires encoding it
    # as https://www.baidu.com/s?wd=%E7%BE%8E%E5%A5%B3 first.
    params={'wd': '美女'},
    # request headers the site may check: User-Agent, cookies, Referer
    # (a Referer check only allows requests coming from the site's own pages)
    headers={
        # identify as a desktop browser
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
    },
)
print(response.status_code)
print(response.text)
```
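To see the encoding for yourself, you can build the request without sending it; a prepared request exposes the final URL that `params` produces:

```python
import requests

prepared = requests.Request('GET', 'https://www.baidu.com/s',
                            params={'wd': '美女'}).prepare()
print(prepared.url)  # https://www.baidu.com/s?wd=%E7%BE%8E%E5%A5%B3
```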