爬取校花网的视频

from requests_html import HTMLSession
import os
session = HTMLSession()

# Example index page: http://www.xiaohuar.com/list-3-0.html
# Produce the URLs of the index (listing) pages.
def get_index_page():
    """Yield the six index-page URLs, numbered 0 through 5.

    The page number is substituted into the ``list-3-<n>.html`` pattern.
    This is a generator, so URLs are produced one at a time on demand.
    """
    template = 'http://www.xiaohuar.com/list-3-%s.html'
    yield from (template % page_no for page_no in range(6))

#获取
# url= "http://www.xiaohuar.com/list-3-0.html"
# r = session.get(url=url)
# for element in r.html.find('#images a[class="imglink"]'):
#     print(element.attrs.get('href'))


# Parse an index page and extract the detail-page URLs.
def get_detail_page(url):
    """Yield the ``href`` of every image link found on the index page.

    Args:
        url: Address of an index page, fetched with the shared ``session``.

    Yields:
        The ``href`` attribute of each element matching
        ``#images a[class="imglink"]`` (``None`` when an element has no
        ``href``, because the value is read with ``attrs.get``).
    """
    response = session.get(url=url)
    anchors = response.html.find('#images a[class="imglink"]')
    yield from (anchor.attrs.get('href') for anchor in anchors)

#测试解析详情页获取视频url,名字
# url = 'http://www.xiaohuar.com/p-3-136.html'
# r = session.get(url=url)
# r.html.encoding = "gbk"
# file_name = r.html.find('title',first=True).text.replace('\\','')
# print(file_name)
#
# element = r.html.find('#media source',first=True)
# if element:
#     mp4_url = element.attrs.get('src')
# else:
#     m3u8_url = r.html.search('var vHLSurl    = "{}";')[0]
#     print(m3u8_url)


# Parse a detail page and extract the video URL and its name.
def get_url_name(url):
    """Fetch a detail page and return ``(file_name, vurl, vtype)``.

    The page is decoded as GBK. ``file_name`` is the text of the page's
    ``<title>`` with backslashes removed; it is also printed.

    When a ``#media source`` element is present, its ``src`` attribute is
    returned with type ``'mp4'``. Otherwise the value assigned to the
    ``vHLSurl`` JavaScript variable in the page is returned with type
    ``'m3u8'``.

    Args:
        url: Address of the detail page, fetched with the shared ``session``.
    """
    response = session.get(url=url)
    page = response.html
    page.encoding = "gbk"

    title = page.find('title', first=True)
    file_name = title.text.replace('\\', '')
    print(file_name)

    source = page.find('#media source', first=True)
    if not source:
        # No direct <source> tag: fall back to the HLS playlist URL that
        # the page embeds in a script variable.
        return file_name, page.search('var vHLSurl    = "{}";')[0], 'm3u8'
    return file_name, source.attrs.get('src'), 'mp4'

# Save a video to disk.
def save(file_name, vurl, vtype):
    """Download the video at ``vurl`` according to its type.

    Args:
        file_name: Base name for the output (no extension).
        vurl: URL of the video file or of the m3u8 playlist.
        vtype: ``"mp4"`` writes the response body to ``<file_name>.mp4``;
            ``"m3u8"`` delegates to ``save_m3u8``. Any other value is
            ignored and nothing is written.
    """
    if vtype == "m3u8":
        save_m3u8(file_name, vurl)
        return
    if vtype != "mp4":
        return

    # Fetch first so that no file is created when the request fails.
    response = session.get(url=vurl)
    target = file_name + ".mp4"
    with open(target, 'wb') as out:
        out.write(response.content)

# Handle m3u8 (HLS) playlists.
def save_m3u8(file_name, vurl):
    """Download an m3u8 playlist and every ``.ts`` segment it lists.

    A directory named ``file_name`` is created (if missing). The playlist
    itself is stored there as ``playlist.m3u8`` and each segment is stored
    next to it under the segment's own file name.

    Args:
        file_name: Name of the output directory.
        vurl: URL of the m3u8 playlist, fetched with the shared ``session``.
    """
    # Local import so this function works without touching the file's
    # top-level import block.
    from urllib.parse import urljoin

    os.makedirs(file_name, exist_ok=True)

    r = session.get(url=vurl)
    m3u8_path = os.path.join(file_name, 'playlist.m3u8')
    with open(m3u8_path, 'wb') as f:
        f.write(r.content)

    # Iterate over playlist *lines*; iterating the string directly would
    # walk it character by character and never match a segment name.
    for raw_line in r.text.splitlines():
        line = raw_line.strip()
        # Skip blanks, playlist tags/comments, and anything that is not a
        # transport-stream segment.
        if not line or line.startswith('#') or not line.endswith('.ts'):
            continue
        # Resolve the segment against the playlist URL; this works whatever
        # the playlist file is called and also for absolute segment URLs.
        ts_url = urljoin(vurl, line)
        # Keep only the final path component so segments given with a
        # sub-path or as a full URL still land inside the output directory.
        ts_path = os.path.join(file_name, os.path.basename(line))
        r0 = session.get(url=ts_url)
        with open(ts_path, 'wb') as f:
            f.write(r0.content)


if __name__ == '__main__':
    # Flatten "every detail page of every index page" into one lazy stream;
    # each page is still fetched only when the loop reaches it.
    detail_urls = (
        detail_url
        for index_page in get_index_page()
        for detail_url in get_detail_page(index_page)
    )
    for detail_url in detail_urls:
        # get_url_name returns (file_name, vurl, vtype), which is exactly
        # the argument order save expects.
        save(*get_url_name(detail_url))

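#  The iteration above works this way because get_index_page and get_detail_page are generators (they use yield): each URL is produced and processed only when the loop asks for it. This style is recommended and reads cleanly.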

 

知识点补充:

# print(str('电影'.encode('utf-8')).strip("b'").upper().replace('\\X','%'))

#    前端页面对中文参数进行 URL 编码(百分号编码)的原理;标准库中等价的写法是 urllib.parse.quote('电影'),结果同样是 %E7%94%B5%E5%BD%B1



视频地址以 m3u8 结尾的,需要我们再进一步处理:解析播放列表,拿到里面各个片段的 ts 文件!

 

因为存在视频链接是以 index 结尾的,所以需要进一步完善!

 

posted on 2019-08-08 22:58  michael-chang  阅读(270)  评论(0)  编辑  收藏  举报

导航