[Course Chapter Update] New Crawler Source Code for the 猫影 (Maoying) Project
Background
I have an introductory Flask course on imooc (慕课): click here to view the course. The video site the course originally used for its examples is no longer in service, so to make it easier to follow along I have found a new video source. A solemn note: this code is for learning and demonstration purposes only. Please use it responsibly and do not put any load on the source site.
Sample Code
Create a new Python file named movie2.py with the following code:
# -*- coding: utf-8 -*-
from application import app, db
import requests, os, time, hashlib, json
from bs4 import BeautifulSoup
from common.libs.DataHelper import getCurrentTime
from urllib.parse import urlparse
from common.models.movie import Movie
import logging
from flask.logging import default_handler

'''
Example commands (act is either "list" or "parse"):
python manager.py runjob -m movie2 -a list
python manager.py runjob -m movie2 -a parse
'''

class JobTask():
    def __init__(self):
        ## Run the job in debug mode
        app.config['DEBUG'] = True
        logging_format = logging.Formatter(
            '%(levelname)s %(asctime)s %(filename)s:%(funcName)s L%(lineno)s %(message)s')
        default_handler.setFormatter(logging_format)
        self.source = "2345movie"
        self.url = {
            "num": 3,
            "url": "https://dianying.2345.com/list/-------#d#.html",
            "path": "/tmp/%s/" % (self.source)
        }

    '''
    Step 1: fetch the list-page HTML, parse it for detail-page URLs and other info,
            then fetch each detail page by its URL.
    Step 2: parse the detail-page HTML.
    '''
    def run(self, params):
        act = params['act']
        self.date = getCurrentTime(frm="%Y%m%d")
        if act == "list":
            self.getList()
            self.parseInfo()
        elif act == "parse":
            self.parseInfo()

    '''
    Fetch the list pages
    '''
    def getList(self):
        config = self.url
        path_root = config['path'] + self.date
        path_list = path_root + "/list"
        path_info = path_root + "/info"
        path_json = path_root + "/json"
        path_vid = path_root + "/vid"
        self.makeSuredirs(path_root)
        self.makeSuredirs(path_list)
        self.makeSuredirs(path_info)
        self.makeSuredirs(path_json)
        self.makeSuredirs(path_vid)
        pages = range(1, config['num'] + 1)
        for idx in pages:
            tmp_path = path_list + "/" + str(idx)
            tmp_url = config['url'].replace("#d#", str(idx))
            app.logger.info("get list : " + tmp_url)
            if os.path.exists(tmp_path):
                continue
            tmp_content = self.getHttpContent(tmp_url)
            self.saveContent(tmp_path, tmp_content)
            time.sleep(0.3)

        for idx in os.listdir(path_list):
            tmp_content = self.getContent(path_list + "/" + str(idx))
            items_data = self.parseList(tmp_content)
            if not items_data:
                continue
            for item in items_data:
                app.logger.info("----------------")
                app.logger.info(item)
                tmp_json_path = path_json + "/" + item['hash']
                tmp_info_path = path_info + "/" + item['hash']
                tmp_vid_path = path_vid + "/" + item['hash']
                if not os.path.exists(tmp_json_path):
                    self.saveContent(tmp_json_path, json.dumps(item, ensure_ascii=False))
                if not os.path.exists(tmp_info_path):
                    tmp_content = self.getHttpContent(item['url'])
                    self.saveContent(tmp_info_path, tmp_content)
                if not os.path.exists(tmp_vid_path):
                    tmp_content = self.getHttpContent(item['vid_url'])
                    self.saveContent(tmp_vid_path, tmp_content)
                time.sleep(0.3)

    def parseList(self, content):
        data = []
        config = self.url
        url_info = urlparse(config['url'])
        url_domain = url_info[0] + "://" + url_info[1]
        tmp_soup = BeautifulSoup(str(content), "html.parser")
        tmp_list = tmp_soup.select("div#contentList ul li")
        for tmp_item in tmp_list:
            tmp_target = tmp_item.select("div.li-pic a.aPlayBtn")
            tmp_name = tmp_target[0]['title']
            tmp_href = tmp_target[0]['href']
            if "https:" not in tmp_href and "//" in tmp_href:
                tmp_href = "https:%s" % (tmp_href)
            ## The download URL is no longer exposed on the list page,
            ## so we leave it empty here and scrape it from the detail page
            tmp_vid_url = ""
            tmp_data = {
                "name": tmp_name,
                "url": tmp_href,
                "vid_url": tmp_vid_url,
                "hash": hashlib.md5(tmp_href.encode("utf-8")).hexdigest()
            }
            data.append(tmp_data)
        return data

    '''
    Parse the detail pages
    '''
    def parseInfo(self):
        config = self.url
        path_root = config['path'] + self.date
        path_info = path_root + "/info"
        path_json = path_root + "/json"
        path_vid = path_root + "/vid"
        for filename in os.listdir(path_info):
            tmp_json_path = path_json + "/" + filename
            tmp_info_path = path_info + "/" + filename
            tmp_vid_path = path_vid + "/" + filename
            tmp_data = json.loads(self.getContent(tmp_json_path))
            app.logger.info(tmp_info_path)
            tmp_content = self.getContent(tmp_info_path)
            tmp_soup = BeautifulSoup(tmp_content, "html.parser")
            try:
                ## The page carries no publish date, so default to today
                tmp_pub_date = self.date
                tmp_desc = tmp_soup.select("div.txtIntroCon div.wholeTxt ul.newIntro li.extend .pHide")[0].getText()
                tmp_classify = tmp_soup.select("div.txtIntroCon div.wholeTxt ul.txtList li.li_3 div.emTit-l")[2].getText()
                tmp_actor = tmp_soup.select("div.txtIntroCon div.wholeTxt ul.txtList li.liActor div.emTit-l")[1].getText()
                tmp_pic_list = tmp_soup.select("div.posterPlaceholder div.pic img")
                tmp_pics = []
                for tmp_pic in tmp_pic_list:
                    tmp_pics.append("https:" + tmp_pic['src'])

                # Get the download link directly from the current page
                # tmp_download_content = self.getContent(tmp_vid_path)
                # tmp_vid_soup = BeautifulSoup(tmp_download_content, "html.parser")
                tmp_download_list = tmp_soup.select("div.txtIntroCon div.series div.series-con div.series-con-i a")
                tmp_magnet_url = ""
                if tmp_download_list:
                    tmp_magnet_url = tmp_download_list[0]['href']

                tmp_data['pub_date'] = tmp_pub_date
                tmp_data['desc'] = tmp_desc.strip()
                tmp_data['classify'] = tmp_classify.strip()
                tmp_data['actor'] = tmp_actor.strip()
                tmp_data['magnet_url'] = tmp_magnet_url
                tmp_data['source'] = self.source
                tmp_data['created_time'] = tmp_data['updated_time'] = getCurrentTime()
                if tmp_pics:
                    tmp_data['cover_pic'] = tmp_pics[0]
                    tmp_data['pics'] = json.dumps(tmp_pics)

                ## Skip movies that are already in the database
                tmp_movie_info = Movie.query.filter_by(hash=tmp_data['hash']).first()
                if tmp_movie_info:
                    continue
                tmp_model_movie = Movie(**tmp_data)
                db.session.add(tmp_model_movie)
                db.session.commit()
            except Exception as e:
                app.logger.info(e)
                continue
        return True

    def getHttpContent(self, url):
        try:
            headers = {
                'Content-Type': 'text/html;charset=utf-8',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36',
                'Referer': "https://dianying.2345.com/list/",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
            }
            r = requests.get(url, headers=headers)
            if r.status_code != 200:
                return None
            return r.text
        except Exception:
            return None

    def saveContent(self, path, content):
        if content:
            with open(path, mode="w+", encoding="utf-8") as f:
                if type(content) != str:
                    content = content.decode("utf-8")
                f.write(content)
                f.flush()

    def getContent(self, path):
        if os.path.exists(path):
            with open(path, "r") as f:
                return f.read()
        return ''

    def makeSuredirs(self, path):
        if not os.path.exists(path):
            os.makedirs(path)
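The job imports Movie from common.models.movie, which ships with the course project and is not listed above. For readers rebuilding the project from scratch, here is a minimal sketch of a compatible Flask-SQLAlchemy model. The column names are inferred from the keys that parseInfo() assigns; the types and lengths are my assumptions, not the course's exact definition.

# A minimal sketch of a compatible Movie model, assuming Flask-SQLAlchemy.
# Column names mirror the keys set in parseInfo(); the real model in the
# course project may differ.
from application import db

class Movie(db.Model):
    __tablename__ = 'movie'
    id = db.Column(db.Integer, primary_key=True, autoincrement=True)
    hash = db.Column(db.String(32), unique=True)  # md5 of the detail-page URL
    name = db.Column(db.String(200))
    url = db.Column(db.String(500))               # detail-page URL
    vid_url = db.Column(db.String(500))           # download-page URL (may be empty)
    pub_date = db.Column(db.String(20))
    desc = db.Column(db.Text)
    classify = db.Column(db.String(100))
    actor = db.Column(db.String(500))
    magnet_url = db.Column(db.String(1000))
    source = db.Column(db.String(50))
    cover_pic = db.Column(db.String(500))
    pics = db.Column(db.Text)                     # JSON-encoded list of image URLs
    # Stored as formatted strings, matching what getCurrentTime() returns in the job
    created_time = db.Column(db.String(19))
    updated_time = db.Column(db.String(19))

With the model and the project's manager.py runjob command in place, run python manager.py runjob -m movie2 -a list to fetch and cache the pages and write the parsed movies to the database; run it again with -a parse to re-parse the cached HTML without re-downloading anything.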
Tags: flask, linux