# press.mu spider
#!/usr/bin/env python
# encoding: utf-8
"""Scraper for press.mu.

Walks every tag category, paginates through each category's search
results, and downloads every item's video playlist (.m3u8) and poster
image (.jpg) into ./pressimg/.
"""
import requests
from random import choice
from lxml import html
from urllib.parse import urljoin, quote
import os
import time

# title -> category URL (level 1) and item title -> item URL (level 2)
NAMEURLDIC = {}
NAMEURLDIC_L2 = {}

# Pool of well-known crawler user agents; one is chosen per run.
ualist = [
    "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
    "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
    "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
    "Mozilla/5.0 (compatible; Yahoo! Slurp/3.0; http://help.yahoo.com/help/us/ysearch/slurp)",
]
ua = choice(ualist)
# BUG FIX: the HTTP header name is "User-Agent" (hyphen). The original
# "User_Agent" key is not a real header and is ignored by servers.
header = {"User-Agent": ua}

mailurl = "https://press.mu"
url = "https://press.mu/tag"
searc_url = "https://press.mu/search/{}?p={}"


def getpage(url):
    """GET *url* and return the Response, or None when the request fails.

    The encoding is corrected from the body (apparent_encoding) because
    the site does not reliably declare a charset in its headers.
    """
    try:
        req = requests.get(url=url, headers=header, stream=True)
        req.encoding = req.apparent_encoding
        return req
    except requests.RequestException:
        # Narrowed from a bare `except: pass`; a failed fetch yields None
        # so callers can skip the page instead of crashing mid-crawl.
        return None


def parse(url):
    """Fetch *url* and return the parsed lxml root, or None on failure.

    Returns None both when the HTTP request fails and when the body is
    empty (the original crashed with AttributeError on a failed fetch).
    """
    req = getpage(url)
    if req is None:
        return None
    source = req.text
    if source:
        return html.fromstring(source)
    return None


def buff(url):
    """Fetch *url* for a binary download; returns the Response or None."""
    # Removed the dead local `buff = None` that shadowed this function.
    return getpage(url)


def save_file(title, url, type="m3u8"):
    """Download *url* into ./pressimg/<title>.<type>.

    Skips the write (instead of crashing) when the download failed.
    """
    # exist_ok avoids the check-then-mkdir race of the original.
    os.makedirs("pressimg", exist_ok=True)
    resp = buff(url)
    if resp is None:
        return
    with open(f'./pressimg/{title}.{type}', "wb") as fs:
        fs.write(resp.content)


def _crawl():
    """Crawl every tag category and download each item's video + poster."""
    root = parse(url)
    if root is None:
        return

    # Level 1: collect every tag category (title -> absolute URL).
    for tag in root.xpath("//section[@id='tag']/ul/li/a"):
        title = tag.xpath("./text()")[0]
        href = urljoin(mailurl, tag.xpath("./@href")[0])
        NAMEURLDIC.setdefault(title, href)

    for k, v in NAMEURLDIC.items():
        # 第一页 (first page of this category)
        root = parse(v)
        if root is None:
            continue
        # 视频件数 (number of videos in this category)
        v_count = root.xpath("//p[@id='hit']/strong/text()")[0]
        # The pager's last entry is "next"; the one before it holds the
        # max page number. Fall back to a single page when there is no
        # pager (the original raised IndexError here).
        pages = root.xpath("//nav[@id='pager']/ul/li[last()-1]/a/text()")
        v_max_page_num = int(pages[0]) if pages else 1
        print(f'当前分类为{k}:,视频件数为:{v_count}')

        for item in range(1, v_max_page_num + 1):
            print(f"获取第{item}页")
            if item > 1:
                # BUG FIX: the original formatted the search URL with the
                # stale `title` left over from the level-1 tag loop; the
                # current category name is `k`.
                root = parse(searc_url.format(quote(k.strip()), item))
                if root is None:
                    continue

            # Level 2: every item on this results page.
            for level2 in root.xpath("//section[@class='items']//h2/a"):
                title_level2 = level2.xpath("./text()")[0]
                href_level2 = urljoin(mailurl, level2.xpath("./@href")[0])
                NAMEURLDIC_L2.setdefault(title_level2, href_level2)
                print(title_level2, href_level2)

                root2 = parse(href_level2)
                if root2 is None:
                    continue
                videourl = root2.xpath("//div[@id='player']//video/source/@src")[0]
                # Poster attribute is scheme-relative on this site.
                imgurl = "https:" + root2.xpath("//div[@id='player']//video/@poster")[0]
                print("videourl", videourl)
                print("imgurl", imgurl)
                save_file(title_level2, videourl)
                save_file(title_level2, imgurl, "jpg")
                print("开始下载", f"{title_level2}.jpg")


if __name__ == "__main__":
    _crawl()