[Course Chapter Update] New Crawler Source Code for the Maoying (猫影) Project

Background

I have an introductory Flask course on imooc (慕课): click here to view the course. The video site the course originally scraped is no longer in service, so I have found a new video source to keep the lessons easy to follow. Let me state clearly: this code is for learning and demonstration only. Please use it responsibly and do not put any load on the source website.


Example code

Create a new Python file named movie2.py with the code below. It assumes the scaffolding from the course project (the application package, common.libs.DataHelper, and common.models.movie) is already in place.

# -*- coding: utf-8 -*-
from application import app, db
import requests, os, time, hashlib, json
from bs4 import BeautifulSoup
from common.libs.DataHelper import getCurrentTime
from common.models.movie import Movie
import logging
from flask.logging import default_handler
'''
Example commands (act is either "list" or "parse"):
    python manager.py runjob -m movie2 -a list
    python manager.py runjob -m movie2 -a parse
'''
class JobTask():
    def __init__(self):
        ## run this job with debug mode enabled
        app.config['DEBUG'] = True
        logging_format = logging.Formatter(
            '%(levelname)s %(asctime)s %(filename)s:%(funcName)s L%(lineno)s %(message)s')
        default_handler.setFormatter(logging_format)
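        # crawl configuration: self.source tags the rows we write to the db;
        # in self.url, "num" is how many list pages to fetch, "#d#" in "url"
        # is substituted with the page number, and "path" is the local cache root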
        self.source = "2345movie"
        self.url = {
            "num" : 3,
            "url" : "https://dianying.2345.com/list/-------#d#.html",
            "path" : "/tmp/%s/" %( self.source )
        }

    '''
    Step 1: fetch the list-page html, parse out each entry's detail url and basic
            info, then fetch the detail html for every entry
    Step 2: parse the detail html
    '''
    def run(self,params):
        act = params['act']
        self.date = getCurrentTime( frm = "%Y%m%d")
        if act == "list":
            self.getList()
            self.parseInfo()
        elif act == "parse":
            self.parseInfo()

    '''
    Fetch the list pages (and the detail pages behind them)
    '''
    def getList(self):
        config = self.url
        path_root = config['path'] + self.date
        path_list = path_root + "/list"
        path_info = path_root + "/info"
        path_json = path_root + "/json"
        path_vid = path_root + "/vid"
        self.makeSuredirs( path_root )
        self.makeSuredirs( path_list )
        self.makeSuredirs( path_info )
        self.makeSuredirs( path_json )
        self.makeSuredirs( path_vid )

        pages = range( 1,config['num'] + 1 )
        for idx in pages:
            tmp_path = path_list + "/" + str( idx )
            tmp_url = config['url'].replace("#d#",str( idx ) )
            app.logger.info( "get list : " +  tmp_url )
            if os.path.exists( tmp_path ):
                continue

            tmp_content = self.getHttpContent( tmp_url )
            self.saveContent( tmp_path,tmp_content )
            time.sleep(0.3)  # throttle so we do not stress the source site

        for idx in os.listdir( path_list ):
            tmp_content = self.getContent( path_list + "/" + str( idx ) )
            items_data = self.parseList( tmp_content )
            if not items_data:
                continue

            for item in items_data:
                app.logger.info("----------------")
                app.logger.info( item )
                tmp_json_path = path_json + "/" + item['hash']
                tmp_info_path = path_info + "/" + item['hash']
                tmp_vid_path = path_vid + "/" + item['hash']
                if not os.path.exists( tmp_json_path ):
                    self.saveContent( tmp_json_path, json.dumps( item,ensure_ascii=False ) )

                if not os.path.exists(tmp_info_path):
                    tmp_content = self.getHttpContent( item['url'] )
                    self.saveContent(  tmp_info_path,tmp_content )

                # vid_url is empty for this source (see parseList); only fetch
                # the download page when a url is actually present
                if item['vid_url'] and not os.path.exists( tmp_vid_path ):
                    tmp_content = self.getHttpContent( item['vid_url'] )
                    self.saveContent( tmp_vid_path, tmp_content )

                time.sleep( 0.3 )


    '''
    Parse one list page: pull out each entry's title and detail url, plus an
    md5 hash of the url that serves as the cache filename
    '''
    def parseList(self,content):
        data = []

        tmp_soup = BeautifulSoup( str(content),"html.parser" )
        tmp_list = tmp_soup.select( "div#contentList ul li" )
        for tmp_item in tmp_list:
            tmp_target = tmp_item.select("div.li-pic a.aPlayBtn")
            tmp_name = tmp_target[0]['title']
            tmp_href = tmp_target[0]['href']
            # list pages use protocol-relative links ("//..."); make them absolute
            if tmp_href.startswith("//"):
                tmp_href = "https:%s" %( tmp_href )
            tmp_vid_url = "" ## the download url is no longer exposed here, so we fetch it from the detail page instead
            tmp_data = {
                "name" : tmp_name,
                "url" : tmp_href,
                "vid_url" : tmp_vid_url,
                "hash" :  hashlib.md5( tmp_href.encode("utf-8") ).hexdigest()
            }
            data.append( tmp_data )

        return data

    '''
    Parse the detail-page info and write each movie to the database
    '''
    def parseInfo(self):
        config = self.url
        path_root = config['path'] + self.date
        path_info = path_root + "/info"
        path_json = path_root + "/json"
        path_vid = path_root + "/vid"
        for filename in os.listdir(  path_info ):
            tmp_json_path = path_json + "/" + filename
            tmp_info_path = path_info + "/" + filename
            tmp_vid_path = path_vid + "/" + filename
            tmp_data = json.loads( self.getContent( tmp_json_path) )
            app.logger.info( tmp_info_path )
            tmp_content = self.getContent( tmp_info_path )
            tmp_soup = BeautifulSoup( tmp_content,"html.parser")
            try:
                ## the page exposes no publish date, so just use today's date
                tmp_pub_date = self.date
                tmp_desc = tmp_soup.select( "div.txtIntroCon div.wholeTxt ul.newIntro li.extend .pHide" )[0].getText()
                tmp_classify = tmp_soup.select( "div.txtIntroCon div.wholeTxt ul.txtList li.li_3 div.emTit-l" )[2].getText()
                tmp_actor = tmp_soup.select( "div.txtIntroCon div.wholeTxt ul.txtList li.liActor div.emTit-l" )[1].getText()
                tmp_pic_list = tmp_soup.select("div.posterPlaceholder div.pic img ")
                tmp_pics = []
                for tmp_pic in tmp_pic_list:
                    tmp_pics.append(  "https:" +  tmp_pic['src'] )

                # the download link is scraped straight from the current page,
                # so the cached vid page is no longer needed:
                #tmp_download_content = self.getContent( tmp_vid_path )
                #tmp_vid_soup = BeautifulSoup( tmp_download_content, "html.parser" )
                tmp_download_list = tmp_soup.select( "div.txtIntroCon div.series div.series-con div.series-con-i a" )
                tmp_magnet_url = ""
                if tmp_download_list:
                    tmp_magnet_url = tmp_download_list[0]['href']


                tmp_data['pub_date'] = tmp_pub_date
                tmp_data['desc'] = tmp_desc.strip()
                tmp_data['classify'] = tmp_classify.strip()
                tmp_data['actor'] = tmp_actor.strip()
                tmp_data['magnet_url'] = tmp_magnet_url
                tmp_data['source'] = self.source
                tmp_data['created_time'] = tmp_data['updated_time'] = getCurrentTime()
                if tmp_pics:
                    tmp_data['cover_pic'] = tmp_pics[0]
                    tmp_data['pics'] = json.dumps( tmp_pics )

                # skip movies that are already stored (deduplicated by hash)
                tmp_movie_info = Movie.query.filter_by( hash = tmp_data['hash'] ).first()
                if tmp_movie_info:
                    continue

                tmp_model_movie = Movie( **tmp_data )
                db.session.add( tmp_model_movie )
                db.session.commit()
            except Exception as e:
                # log the parse failure for this entry and move on to the next
                app.logger.error( e )
                continue
        return True

    '''
    Fetch a url with browser-like headers; returns the html text,
    or None on a non-200 response or any request error
    '''
    def getHttpContent(self,url):
        try:
            headers = {
                'Content-Type': 'text/html;charset=utf-8',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36',
                'Referer': "https://dianying.2345.com/list/",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
            }
            r = requests.get(url, headers=headers, timeout=10)
            if r.status_code != 200:
                return None

            return r.text

        except Exception:
            return None

    def saveContent(self,path,content):
        if content:
            with open( path, mode="w", encoding="utf-8" ) as f:
                # http responses may come back as bytes; normalize to str
                if not isinstance( content, str ):
                    content = content.decode("utf-8")

                f.write( content )

    def getContent(self,path):
        if os.path.exists( path ):
            # read with an explicit encoding so behaviour does not depend on the locale
            with open( path, "r", encoding="utf-8" ) as f:
                return f.read()

        return ''

    def makeSuredirs(self,path):
        if not os.path.exists( path ):
            os.makedirs( path )
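
The code above leans on two pieces of the course project's scaffolding: the getCurrentTime helper and the Movie model. If you are following along without the course codebase, here is a minimal sketch of compatible stand-ins. These are assumptions for illustration only, not the course's actual code; the field lengths and table name are guesses based on how the crawler uses them.

# common/libs/DataHelper.py -- hypothetical stand-in
import datetime

def getCurrentTime(frm="%Y-%m-%d %H:%M:%S"):
    # format the current local time; the crawler also calls this with frm="%Y%m%d"
    return datetime.datetime.now().strftime(frm)

# common/models/movie.py -- hypothetical stand-in
from application import db

class Movie(db.Model):
    __tablename__ = 'movie'
    id = db.Column(db.Integer, primary_key=True, autoincrement=True)
    hash = db.Column(db.String(32), unique=True)    # md5 of the detail url, used for dedup
    name = db.Column(db.String(200), default='')
    url = db.Column(db.String(500), default='')     # detail page url
    vid_url = db.Column(db.String(500), default='')
    magnet_url = db.Column(db.String(1000), default='')
    cover_pic = db.Column(db.String(500), default='')
    pics = db.Column(db.Text)                       # JSON-encoded list of poster urls
    desc = db.Column(db.Text)
    classify = db.Column(db.String(100), default='')
    actor = db.Column(db.String(200), default='')
    pub_date = db.Column(db.String(20), default='')
    source = db.Column(db.String(20), default='')
    created_time = db.Column(db.String(19), default='')
    updated_time = db.Column(db.String(19), default='')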



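Running the job

The crawler plugs into the course project's manager.py CLI. Running act=list downloads and caches the list pages, detail pages, and parsed json under /tmp/2345movie/<date>/ and then parses them; act=parse only re-parses pages that are already cached on disk:

python manager.py runjob -m movie2 -a list
python manager.py runjob -m movie2 -a parse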