# python 某音文件下载 — Douyin ("某音") video file downloader

import time
# from pyquery import PyQuery as pq
import commonMethod
import datetime
import requests
import re
import os
import json

# Matches the <source> tag served from v3-web.douyinvod.com and captures the
# 486-488 characters that follow the host. The capture includes the closing
# double quote of the src attribute; get_douyin_url strips it afterwards.
pattern_1 = r'<source class="" src="//v3-web.douyinvod.com/(.{486,488}) type="">'
# Disabled alternatives for other media hosts:
# pattern_2 = '<source class="" src="//v26-web.douyinvod.com/(.{486,488}) type="">'
# pattern_3 = '<source class="" src="//www.douyin.com/aweme/v1/play/(.+) type="">'

# Captures the numeric video id from a Douyin video page URL.
# Raw string: "\d" in a normal string literal is an invalid escape sequence.
pattern_4 = r'https://www.douyin.com/video/(\d+)'

def get_info_by_pattern(text,pattern):
    """Return every match of the regular expression ``pattern`` found in ``text``.

    The result is whatever ``re.findall`` yields: a list of strings, or a list
    of tuples when the pattern has more than one capture group.
    """
    return re.compile(pattern).findall(text)

def get_headdouyinvod_com():
    """Build a fresh dict of browser-like HTTP request headers.

    A new dict is returned on every call, so callers may mutate it freely.
    """
    return {
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'sec-ch-ua': '"Google Chrome";v="95", "Chromium";v="95", ";Not A Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Dest': 'document',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    }

# Resolve the direct media URL of a Douyin video page
def get_douyin_url(driver1,sharl_url):
    """Load a Douyin video page in a browser and extract its media URL.

    :param driver1: browser driver exposing ``get``, ``add_cookie``, ``refresh``
        and ``execute_script`` (presumably a Selenium WebDriver returned by
        ``commonMethod.getDriver`` -- confirm against the caller).
    :param sharl_url: full video page URL, or a bare video id, which is expanded
        to ``https://www.douyin.com/video/<id>``.
    :return: ``(media_url, video_id)`` when a ``v3-web.douyinvod.com`` source tag
        is found in the page; ``('', '')`` when none is found or when any
        exception occurs (the exception is printed, not raised).
    """
    try:
        if 'https' not in sharl_url:
            sharl_url = 'https://www.douyin.com/video/' + sharl_url
        id_matches = get_info_by_pattern(sharl_url, pattern_4)
        VID = id_matches[0] if id_matches else ''

        # Open the site root before adding cookies (presumably because the
        # driver only accepts cookies for the currently loaded domain).
        driver1.get('https://www.douyin.com')
        list_cooke = [
            {'domain': '.douyin.com', 'expiry': 1698999663, 'httpOnly': False, 'name': 'VIDEO_FILTER_MEMO_SELECT',
             'path': '/', 'secure': False, 'value': '%7B%22expireTime%22%3A1698999663897%2C%22type%22%3A1%7D'},
            # NOTE(review): the published script elided the remaining cookies
            # with a literal `...`. That placeholder was an Ellipsis object and
            # was handed to add_cookie, which aborted every call. Add real
            # session cookie dicts here instead.
        ]

        for cook in list_cooke:
            driver1.add_cookie(cook)
        time.sleep(5)
        driver1.refresh()

        driver1.get(sharl_url)
        # Fixed wait so the page can finish rendering before the DOM is read.
        time.sleep(5)
        txt = driver1.execute_script("return document.documentElement.outerHTML")
        # Turn "&amp;" into "&" so the query string in the src attribute is usable.
        txt = txt.replace('amp;', '')

        p_res = get_info_by_pattern(txt, pattern_1)
        if p_res:
            # The capture includes the closing quote of the src attribute.
            herf1 = 'https://v3-web.douyinvod.com/' + p_res[0].replace('"', '')
            print(herf1)
            return herf1,VID
    except Exception as ex:
        print('获取链接发生异常:',sharl_url,ex)

    # NOTE: earlier revisions also tried v26-web.douyinvod.com and
    # www.douyin.com/aweme/v1/play/ sources; those fallbacks are disabled.
    return '',''

# Download a Douyin video file
def dowfile_v3_web_douyinvod_com(file_url, fileName):
    """Download ``file_url`` to ``fileName``, continuing with range requests if needed.

    The first request is sent without a ``Range`` header and its body is written
    to a freshly truncated ``fileName``. If the response carries a
    ``Content-Range`` header, the remaining bytes are fetched in chunks of at
    most 128 KiB and appended in order.

    :param file_url: direct media URL to download.
    :param fileName: path of the output file; an existing file is overwritten.
    :return: ``None``. Failures are printed, never raised to the caller.
    """
    page_size = 1024 * 128
    # Keep the request headers separate from the response headers; the ranged
    # requests below must resend these, not whatever the server replied with.
    request_headers = get_headdouyinvod_com()
    strstart = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    print('开始下载:', strstart)

    try:
        response = requests.get(file_url, headers=request_headers, stream=True, timeout=30)
        if response.status_code not in (200, 206):
            response.close()
            raise RuntimeError('unexpected HTTP status %d' % response.status_code)
        print(dict(response.headers))
        # 'wb' so that re-running a download does not append to a stale file.
        with open(fileName, 'wb') as f:
            for chunk in response.iter_content(chunk_size=512):
                if chunk:
                    f.write(chunk)

        content_range = response.headers.get('Content-Range')
        if content_range:
            # Resume right after the last byte the server actually delivered.
            last_byte, total_length = _parse_content_range(content_range)
            next_start = last_byte + 1
            remaining = max(0, total_length - next_start)
            loop_count = (remaining + page_size - 1) // page_size
            for loop_index in range(1, loop_count + 1):
                range_end = min(next_start + page_size, total_length) - 1
                request_headers['Range'] = 'bytes=%d-%d' % (next_start, range_end)
                print(loop_index, loop_count)
                # The chunk is fetched fully before writing, so a retried
                # request can never leave a partial or duplicated chunk.
                data = _fetch_range(file_url, request_headers)
                with open(fileName, 'ab') as f:
                    f.write(data)
                next_start = range_end + 1
                time.sleep(1)

        strend = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print('完成下载:', strstart, strend)

    except Exception as ex:
        print('下载文件发生异常:',file_url,ex)


def _parse_content_range(content_range):
    """Return ``(last_byte, total_length)`` parsed from ``bytes a-b/total``."""
    match = re.fullmatch(r'\s*bytes\s+(\d+)-(\d+)/(\d+)\s*', content_range)
    if match is None:
        raise ValueError('unsupported Content-Range: %r' % (content_range,))
    return int(match.group(2)), int(match.group(3))


def _fetch_range(file_url, request_headers, max_attempts=3):
    """Fetch one ranged chunk into memory, retrying failed attempts."""
    last_error = None
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(file_url, headers=request_headers, timeout=30)
            # Only 206 carries the requested slice; a 200 would be the whole
            # file and appending it would corrupt the output.
            if response.status_code == 206:
                return response.content
            last_error = RuntimeError(
                'unexpected HTTP status %d for ranged request' % response.status_code)
        except requests.RequestException as ex:
            last_error = ex
        print(last_error)
        if attempt < max_attempts:
            time.sleep(10)
    raise last_error

# Download a list of Douyin videos
def down_file(sharl_url_list,strDirectory):
    '''
    Download every Douyin video in a list of page URLs.

    :param sharl_url_list: list of Douyin video page URLs, e.g.
        ['https://www.douyin.com/video/7294079788010999040','https://www.douyin.com/video/7293552737067928868']
    :param strDirectory: directory the files are stored in, e.g.
        D:/douyin_file_down/202310; it is created if it does not exist.
        Each file is saved as "<video id>.mp4".
    :return: None. Exceptions are printed rather than raised.
    '''
    try:
        # Without this, every download fails when the directory is missing.
        os.makedirs(strDirectory, exist_ok=True)
        driver1 = commonMethod.getDriver('', False)
        try:
            for sharl_url in sharl_url_list:
                file_url, VID = get_douyin_url(driver1, sharl_url)
                # An empty URL means the media link could not be resolved.
                if not file_url:
                    continue
                fileName = os.path.join(strDirectory, VID + '.mp4')
                dowfile_v3_web_douyinvod_com(file_url, fileName)
        finally:
            # Release the browser even if a download raised part-way through.
            driver1.close()
    except Exception as ex:
        print(ex)

if __name__ == "__main__":
    # Directory the downloaded videos are saved into
    strDirectory = 'D:/douyin/file/202310'
    # Video pages to download
    sharl_url_list = [
        'https://www.douyin.com/video/7293555365818453274',
        'https://www.douyin.com/video/7293552737067928868',
        'https://www.douyin.com/video/7293555206388780324',
    ]
    down_file(sharl_url_list, strDirectory)
# View Code
#
# posted on 2023-10-27 17:08  shaomine  阅读(76)  评论(0)  编辑  收藏  举报