爬取校花网的视频
import os
from urllib.parse import urljoin

from requests_html import HTMLSession

# One shared HTTP session reused by every request in this script.
session = HTMLSession()


# Example index page: http://www.xiaohuar.com/list-3-0.html
def get_index_page():
    """Yield the URLs of the six video index pages (list-3-0 .. list-3-5)."""
    for i in range(6):
        url = 'http://www.xiaohuar.com/list-3-%s.html' % i
        yield url


# Scratch snippet used while working out the index-page selector:
# url= "http://www.xiaohuar.com/list-3-0.html"
# r = session.get(url=url)
# for element in r.html.find('#images a[class="imglink"]'):
#     print(element.attrs.get('href'))


def get_detail_page(url):
    """Fetch one index page and yield the detail-page link of every entry.

    Args:
        url: URL of an index page, as produced by ``get_index_page``.

    Yields:
        The ``href`` attribute of each ``#images a[class="imglink"]``
        element (``None`` when an element has no ``href``).
    """
    r = session.get(url=url)
    for element in r.html.find('#images a[class="imglink"]'):
        yield element.attrs.get('href')


# Scratch snippet used while working out how to get the video URL and
# name from a detail page:
# url = 'http://www.xiaohuar.com/p-3-136.html'
# r = session.get(url=url)
# r.html.encoding = "gbk"
# file_name = r.html.find('title',first=True).text.replace('\\','')
# print(file_name)
#
# element = r.html.find('#media source',first=True)
# if element:
#     mp4_url = element.attrs.get('src')
# else:
#     m3u8_url = r.html.search('var vHLSurl = "{}";')[0]
#     print(m3u8_url)


def get_url_name(url):
    """Parse a detail page into the video's name, URL and type.

    Args:
        url: URL of a detail page, as produced by ``get_detail_page``.

    Returns:
        A ``(file_name, vurl, vtype)`` tuple. ``file_name`` is the page
        title with backslashes removed. ``vtype`` is ``'mp4'`` when the
        page has a ``#media source`` element (``vurl`` is its ``src``),
        otherwise ``'m3u8'`` (``vurl`` is taken from the page's
        ``var vHLSurl = "...";`` script variable).
    """
    r = session.get(url=url)
    # The page is decoded as GBK before the title is read.
    r.html.encoding = "gbk"
    file_name = r.html.find('title', first=True).text.replace('\\', '')
    print(file_name)

    element = r.html.find('#media source', first=True)
    if element:
        vurl = element.attrs.get('src')
        vtype = 'mp4'
    else:
        vurl = r.html.search('var vHLSurl = "{}";')[0]
        vtype = 'm3u8'
    return file_name, vurl, vtype


def save(file_name, vurl, vtype):
    """Download a video to disk.

    Args:
        file_name: Base name for the output (no extension).
        vurl: URL of the mp4 file or of the m3u8 playlist.
        vtype: ``'mp4'`` writes ``<file_name>.mp4``; ``'m3u8'`` delegates
            to ``save_m3u8``. Any other value does nothing.
    """
    if vtype == "mp4":
        file_name += ".mp4"
        r = session.get(url=vurl)
        with open(file_name, 'wb') as f:
            f.write(r.content)
    elif vtype == "m3u8":
        save_m3u8(file_name, vurl)


def save_m3u8(file_name, vurl):
    """Download an m3u8 playlist and the ``.ts`` segments it lists.

    The playlist is stored as ``<file_name>/playlist.m3u8`` and each
    segment is stored next to it under the segment's own base name.

    Args:
        file_name: Directory to create (if missing) and fill.
        vurl: URL of the m3u8 playlist.
    """
    os.makedirs(file_name, exist_ok=True)

    r = session.get(url=vurl)
    m3u8_path = os.path.join(file_name, 'playlist.m3u8')
    with open(m3u8_path, 'wb') as f:
        f.write(r.content)

    # Iterate over playlist *lines*. Iterating over ``r.text`` directly
    # walks single characters, so no segment would ever be matched.
    for line in r.text.splitlines():
        line = line.strip()
        # Skip blank lines and '#'-prefixed playlist tags/comments.
        if not line or line.startswith('#'):
            continue
        if not line.endswith('ts'):
            continue
        # Resolve the segment against the playlist URL so this works no
        # matter what the playlist file itself is called.
        ts_url = urljoin(vurl, line)
        # Only the base name is used locally, so an entry containing a
        # directory or a full URL still lands inside ``file_name``.
        ts_path = os.path.join(file_name, os.path.basename(line))
        r0 = session.get(url=ts_url)
        with open(ts_path, 'wb') as f:
            f.write(r0.content)


if __name__ == '__main__':
    # These nested for-loops work because the helpers are generators
    # (they use ``yield``), which keeps the pipeline compact.
    for index_page in get_index_page():
        for detail_url in get_detail_page(index_page):
            file_name, vurl, vtype = get_url_name(detail_url)
            save(file_name, vurl, vtype)
知识点补充: # print(str('电影'.encode('utf-8')).strip("b'").upper().replace('\X','%')) # 前端页面对中文参数的编码原理 视频链接以m3u8结尾的,需要我们再进一步处理,拿到里面各个片段的ts文件!
因为有些视频链接是以index结尾的,所以还需要进一步完善!
posted on 2019-08-08 22:58 michael-chang 阅读(270) 评论(0) 编辑 收藏 举报