Downloading YouTube videos with a Python crawler script

Working environment:

  • Python 2.7.13

  • pip

  • lxml, installed with pip install lxml; mainly used for XPath node lookup (the re module could be used instead; see the sketch after this list)

  • pytube, installed with pip install pytube

  • a proxy / VPN tool for reaching YouTube
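
As noted in the lxml item above, the script only uses lxml for two XPath queries, and the same extraction could be done with re. Below is a minimal sketch of both approaches; the HTML fragment is a made-up stand-in shaped like the thumbnail nodes the script queries:

# coding: utf-8
import re
from lxml import etree

# made-up fragment shaped like the related-video thumbnails the script queries
html = '<div class="thumb-wrapper"><a href="/watch?v=abc123"><span><span>4:15</span></span></a></div>'

# XPath version, as used in getUrl() in the script below
tree = etree.HTML(html)
print tree.xpath('//div[@class="thumb-wrapper"]/a/@href')             # ['/watch?v=abc123']
print tree.xpath('//div[@class="thumb-wrapper"]/a/span/span/text()')  # ['4:15']

# roughly equivalent extraction with re
print re.findall(r'class="thumb-wrapper"><a href="([^"]+)"', html)    # ['/watch?v=abc123']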

Source code:

# coding: utf-8
__author__ = "zwzhou"
__date__ = "2017-03-19"

import urllib2
from pytube import YouTube
from pprint import pprint
from lxml import etree
import sys, getopt


def getHtml(url):
    # fetch the page with a desktop browser User-Agent
    user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.13 (KHTML, like Gecko) Chrome/24.0.1284.0 Safari/537.13'
    headers = {'User-Agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    html = response.read()
    return html


def getUrl(html):
    # parse the related-video thumbnails on the page, download every video below
    # the duration threshold, then recurse into the first related video's page
    global savepath
    global maxNumber
    global timeThreshold
    global cur_count
    global videoLists
    tree = etree.HTML(html)
    urllist = tree.xpath(u'//div[@class="thumb-wrapper"]/a/@href')
    #print urllist
    urllist_time = tree.xpath(u'//div[@class="thumb-wrapper"]/a/span/span/text()')

    baseurl = r'https://www.youtube.com'
    for (item_name, item_length) in zip(urllist, urllist_time):
        #print item_name
        #print item_length
        try:
            yt = YouTube(baseurl + item_name)
        except:
            print "Something went wrong with the authorization"
            continue  # skip this video instead of reusing a stale/unbound yt

        print("video name: " + yt.filename)
        print("video time: " + item_length)
        if yt.filename in videoLists:  # the file has already been downloaded
            print "This video has been downloaded!"
        else:
            if checktime(item_length):
                video = yt.filter('mp4')[-1]  # last (highest-resolution) mp4 stream, old pytube 0.x API
                print("Now loading %s ------------>" % yt.filename)
                video.download(savepath)
                print("---------------> %s is downloaded!" % yt.filename)
                cur_count += 1
                videoLists.append(yt.filename)
                if cur_count >= maxNumber:  # reached the requested number
                    print('There are %d videos downloaded! This task is completed!' % maxNumber)
                    # TODO: if necessary, videoLists can be logged
                    sys.exit()
            else:
                print 'This video is too long; it will not be downloaded and is simply ignored!'
    if urllist:
        getUrl(getHtml(baseurl + urllist[0]))  # fetch and crawl the next page (the first related video)


def checktime(timelength):
    # duration strings look like "mm:ss"; return True if shorter than the threshold
    global timeThreshold
    strs = timelength.split(':')
    time = int(strs[0]) * 60 + int(strs[1])
    if time < timeThreshold:
        return True
    else:
        return False


def usage():
    print '''
usage: python dl_youtube.py [option] [arg]
options and args:
    -s : download path
    -t : time threshold (in seconds) of the videos to be downloaded
    -u : start url to be crawled, can be given more than once
    -n : how many videos to download before stopping, default is 10000
    -h : print this help message
'''


if __name__ == "__main__":
    start_urls = ['https://www.youtube.com/watch?v=TThzH_sJo6o']
    videoLists = []  # record downloaded file names to avoid duplicates
    # default values
    savepath = r"D://MyDownloads"
    maxNumber = 10000
    timeThreshold = 240
    cur_count = 0

    opts, args = getopt.getopt(sys.argv[1:], 'hs:t:n:u:')
    for op, value in opts:
        if op == "-s":    # download path, default D://MyDownloads
            savepath = value
        elif op == '-t':  # duration limit, default 240 s
            timeThreshold = int(value)
        elif op == "-h":  # help
            usage()
            sys.exit()
        elif op == '-n':
            maxNumber = int(value)
        elif op == '-u':  # additional start urls
            start_urls.append(value)

    for item in start_urls:
        html = getHtml(item)
        getUrl(html)
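
One practical note: reaching YouTube requires the proxy / VPN tool from the environment list. If that tool exposes a local HTTP proxy, urllib2 can be pointed at it explicitly; this is only a sketch, and 127.0.0.1:1080 is a placeholder for whatever address and port your own tool listens on. Whether pytube's own download requests also go through it depends on the pytube version, so a system-wide proxy may still be the simpler option.

import urllib2

# 127.0.0.1:1080 is a placeholder; use your own proxy tool's address and port
proxies = {'http': 'http://127.0.0.1:1080', 'https': 'http://127.0.0.1:1080'}
opener = urllib2.build_opener(urllib2.ProxyHandler(proxies))
urllib2.install_opener(opener)  # after this, the urllib2.urlopen() calls in getHtml() go through the proxy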

Usage:

python dl_youtube.py -n 10 -s D://MyDownloads -t 600 -u https://www.youtube.com/watch?v=TThzH_sJo6o

Starting from the page https://www.youtube.com/watch?v=TThzH_sJo6o, this crawls related videos and downloads 10 videos shorter than 10 minutes (600 seconds) into the D://MyDownloads folder.
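
To make the -t threshold concrete, the snippet below mirrors the arithmetic checktime() applies to the durations scraped from the thumbnails; the duration strings are made-up examples:

# with -t 600, timeThreshold = 600 seconds; same arithmetic as checktime()
timeThreshold = 600
for duration in ["5:30", "12:05"]:            # made-up "mm:ss" duration strings
    m, s = duration.split(':')
    seconds = int(m) * 60 + int(s)
    print duration, seconds < timeThreshold   # 5:30 -> True (downloaded), 12:05 -> False (skipped)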

posted @ 2017-03-19 17:00  一只有恒心的小菜鸟