python爬虫--爬取cctv连续剧
# encoding=utf-8
import requests
from bs4 import BeautifulSoup
import re
import os
from aria2rpc import rpc_addUri


class Cntv():
    """Scrape a CCTV TV-series page, resolve each episode's video API link,
    and queue the per-chapter download URLs as aria2 tasks."""

    def openUrl(self, url):
        """
        Open a web site with a desktop-browser User-Agent.
        :param url: Web site to request
        :return: requests.Response object
        """
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
        }
        # BUG FIX: requests.get()'s second positional parameter is `params`,
        # not headers — the original sent the UA dict as query parameters
        # instead of as an HTTP header. Pass headers= explicitly.
        response = requests.get(url, headers=header)
        return response

    def getEachEpisodeUrl(self):
        """
        Get the page address of each episode of the TV play.
        :return: list of episode page urls
        """
        urls = []
        url = "http://tv.cctv.com/2014/07/07/VIDA1404730290373811.shtml"
        response = self.openUrl(url)
        html = response.content.decode('utf-8')
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.select(".text_mod h3")
        print(title[0].text)
        episodes = soup.select('.img a')
        # Each episode is rendered as three '.img a' anchors; stepping by 3
        # from index 1 picks one representative anchor per episode.
        for each in range(1, len(episodes), 3):
            print(episodes[each]['title'], "link:" + episodes[each]['href'])
            urls.append(episodes[each]['href'])
        print("Get Each Episode Url Come Over !!!")
        return urls

    def getEachDLUrl(self):
        """
        Convert each episode page into a getHttpVideoInfo.do API link.
        :return: list of API links (one per episode)
        """
        urls = self.getEachEpisodeUrl()
        links = []
        for num, url in enumerate(urls):
            response = self.openUrl(url)
            html = response.text
            # The episode page embeds its video id as: guid = "<pid>";
            match = re.search(r'guid = "(\w+?)";', html)
            pid = match.group(1)
            link = "http://vdn.apps.cntv.cn/api/getHttpVideoInfo.do?pid=%s&tz=%s&from=%s&url=%s&idl=%s&idlr=%s&modifyed=%s" % (pid, '-8', '000news', url, '32', '32', 'false')
            links.append(link)
            print("获取第%d集" % (num))
        return links

    def getDLList(self):
        """
        Get the download addresses for each episode of the TV play.
        :return: list of per-episode lists of chapter download urls
        """
        links = self.getEachDLUrl()
        # links = ["http://vdn.apps.cntv.cn/api/getHttpVideoInfo.do?pid=59381a0e55404cf5b101f7d3bcad2da8&tz=-8&from=000news&url=http://tv.cctv.com/2014/07/15/VIDE1405435161521590.shtml&idl=32&idlr=32&modifyed=false"]
        dl_urls = []
        for link in links:
            dl_url = []
            response = self.openUrl(link)
            # 'chapters4' holds the highest-quality chapter urls returned
            # by the API (see the alternatives noted below _add_aria2_task).
            dl_list = response.json()['video']['chapters4']
            for each in range(len(dl_list)):
                downloadurl = dl_list[each]['url']
                dl_url.append(downloadurl)
                print(downloadurl)
            dl_urls.append(dl_url)
        return dl_urls

    def _add_aria2_task(self, url, name):
        """
        Queue one download in aria2 via its JSON-RPC interface.
        :param url: download url
        :param name: download file name (aria2 'out' option)
        :return: rpc result, or None on failure
        """
        try:
            result = rpc_addUri(url, {'out': name})
            return result
        except Exception as e:
            # Best-effort: report the failure and keep queueing the rest.
            print(e)
            return None

    # Alternative quality levels exposed by the API:
    # response.json()['video']['lowChapters'][0]['url']
    # response.json()['video']['chapters4'][0]['url']
    """
    def dlTv(self):

        dl_urls_list = self.getDLList()
        if os.path.exists("tv_list") == False:
            os.mkdir("tv_list")
        os.chdir("tv_list")
        for dl_urls in dl_urls_list:
            for dl_url in dl_urls:
                print("download" + dl_url)
                # response = self.openUrl(dl_url)
                # with open("first.mp4",'ab') as tl:
                #     tl.write(response.content)
                print("-"*20)
    """


if __name__ == "__main__":
    cm = Cntv()
    # cm.getUrl()
    # cm.openUrl()

    lists = cm.getDLList()
    # Renamed loop variable (was `list`) to avoid shadowing the builtin.
    for num, chapter_urls in enumerate(lists):
        for i, url in enumerate(chapter_urls):
            cm._add_aria2_task(url, str(num + 1) + '_' + str(i + 1) + '.mp4')
分类:
python
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· .NET Core 中如何实现缓存的预热?
· 从 HTTP 原因短语缺失研究 HTTP/2 和 HTTP/3 的设计差异
· AI与.NET技术实操系列:向量存储与相似性搜索在 .NET 中的实现
· 基于Microsoft.Extensions.AI核心库实现RAG应用
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· 阿里巴巴 QwQ-32B真的超越了 DeepSeek R-1吗?
· 【译】Visual Studio 中新的强大生产力特性
· 10年+ .NET Coder 心语 ── 封装的思维:从隐藏、稳定开始理解其本质意义
· 【设计模式】告别冗长if-else语句:使用策略模式优化代码结构