Python使用pyppeteer搭建网页截图api

因为跨语言需要,打算把pyppeteer、图片压缩、aws S3封装成一个api来调用。

首先自然是要安装依赖 pip3 install pillow boto3 pyppeteer

运行一次脚本,pyppeteer会自动下载最新的浏览器到本地

2023.2.2 支持使用API重启chrome了

请求用GET/POST均可,GET将参数放进query,POST将参数放进消息体,使用application/x-www-form-urlencoded

# 使用GET(示例端口为9871;脚本默认监听9087,启动时请加 -p 9871 或按实际端口修改)
curl -i "http://127.0.0.1:9871/?token=xry1029&uri=https://www.baidu.com"
# 使用POST
curl -X POST -i "http://127.0.0.1:9871" --data "token=xry1029" --data "uri=https://www.baidu.com"

# 重启chrome
curl -i "http://127.0.0.1:9871/?token=xry1029&restart=1"

切记uri需要带上协议前缀(http:// 或 https://),不需要的功能就注释掉吧...

#!/usr/bin/python3.8
# date: 2023.2.2
# httpd import
import http.server as BaseHTTPServer
import socketserver as SocketServer
import urllib.parse as urlparse
import threading
import re
import argparse
import json
# chrome import
from pyppeteer import launch, connect
import asyncio
import traceback
import time
import sys
# 图片压缩 import
import os
from PIL import Image
from PIL import ImageFile
# aws S3 import
import os
import boto3
import base64


def compress_image(outfile, savefile, kb=1536, quality=75, k=0.75):  # the "tx" server apparently only accepts images up to ~1.5 MB
    """Shrink ``outfile`` until it is at most ``kb`` KiB, then move it to ``savefile``.

    The image is repeatedly scaled down by the factor ``k`` (both width and
    height) and re-saved in place until the file is small enough.  If the
    file is already small enough it is simply renamed.

    :param outfile: path of the image to compress (it is overwritten and
        finally renamed, so it no longer exists afterwards)
    :param savefile: destination path of the result
    :param kb: target size in KiB
    :param quality: ``quality`` value handed to ``Image.save`` on every pass
    :param k: scale factor applied to width and height on every pass
    :return: ``savefile``
    """
    o_size = os.path.getsize(outfile) // 1024  # getsize() returns bytes; convert to KiB
    if o_size <= kb:
        os.rename(outfile, savefile)
        print('[compress] 无需压缩')
        return savefile

    ImageFile.LOAD_TRUNCATED_IMAGES = True  # tolerate truncated image data instead of raising

    while o_size > kb:
        # Close the source handle before the same path is overwritten below.
        with Image.open(outfile) as im:
            x, y = im.size
            # Never let a dimension collapse to 0, which resize() rejects.
            new_size = (max(1, int(x * k)), max(1, int(y * k)))
            if new_size[0] >= x and new_size[1] >= y:
                # The image cannot get any smaller (1x1 image or k >= 1):
                # stop instead of looping forever.
                break
            out = im.resize(new_size, Image.Resampling.LANCZOS)
        try:
            out.save(outfile, quality=quality)
        except Exception as e:
            # Best effort: keep whatever is on disk and carry on with the rename.
            print(e)
            break
        o_size = os.path.getsize(outfile) // 1024
    os.rename(outfile, savefile)
    print('[compress] 压缩完成')
    return savefile


async def launchchrome():
    """Launch Chrome and publish its DevTools WebSocket endpoint.

    The endpoint is stored in the module-level ``WsToken`` so that other
    coroutines can attach to the browser with ``connect``.  The function
    then disconnects from the browser; ``autoClose=False`` is passed to
    ``launch`` (presumably so the Chrome process outlives this call).
    """
    global WsToken
    chrome_flags = [
        '--disable-infobars',
        '--no-sandbox',
        '--process-per-tab',
        '--disable-setuid-sandbox',
    ]
    chrome = await launch(
        headless=False,
        autoClose=False,
        userDataDir='/home/user/user-data/',
        ignoreHTTPSErrors=True,
        args=chrome_flags,
    )
    WsToken = chrome.wsEndpoint
    print(f"[browser] 浏览器启动完成!Ws: {WsToken!s}")
    await chrome.disconnect()


# Launch Chrome once at import time on a dedicated event loop, so that
# WsToken is already set by launchchrome() before the HTTP server is created.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(launchchrome())
# The loop is only needed for the launch; later calls create their own loops.
loop.close()

async def launchchromenew():
    """Launch a replacement Chrome and update ``WsToken`` with its endpoint.

    Uses the same launch options as ``launchchrome`` but additionally turns
    off pyppeteer's SIGINT/SIGTERM/SIGHUP handlers.  This coroutine is run
    from the HTTP request handler rather than from the main start-up code,
    where (per the note at the call site) those handlers cannot be registered.
    """
    global WsToken
    chrome_flags = [
        '--disable-infobars',
        '--no-sandbox',
        '--process-per-tab',
        '--disable-setuid-sandbox',
    ]
    chrome = await launch(
        headless=False,
        autoClose=False,
        userDataDir='/home/user/user-data/',
        ignoreHTTPSErrors=True,
        handleSIGINT=False,
        handleSIGTERM=False,
        handleSIGHUP=False,
        args=chrome_flags,
    )
    WsToken = chrome.wsEndpoint
    print(f'[browser] 浏览器重启完成!Ws: {WsToken!s}')
    await chrome.disconnect()


async def screenshot(Width, Height, Uri, Output, JSexec, SleepTime, FullPage):
    """Render ``Uri`` in the shared Chrome and save a screenshot to ``Output``.

    :param Width: viewport width in pixels
    :param Height: viewport height in pixels
    :param Uri: address to navigate to
    :param Output: final image path; the raw capture is first written to
        ``Output + '-cache.png'`` and then handed to ``compress_image``
    :param JSexec: JavaScript source evaluated in the page after navigation
    :param SleepTime: seconds to wait between running the script and capturing
    :param FullPage: the string ``"true"`` requests a full-page capture; any
        other value captures only the viewport

    Exceptions raised by pyppeteer (for example a navigation timeout) or by
    ``compress_image`` propagate to the caller.  The tab is closed and the
    connection is released in every case, so a failed request does not leave
    an orphaned tab behind in the long-running browser.
    """
    # Attach to the already running browser.
    browser = await connect(browserWSEndpoint=WsToken)
    try:
        page = await browser.newPage()
        try:
            await page.setViewport(viewport={'width': Width, 'height': Height})
            # JavaScript stays enabled; with enabled=False pages are not rendered properly.
            await page.setJavaScriptEnabled(enabled=True)
            # Navigation timeout of 15 s; larger values reportedly caused unexplained failures.
            page.setDefaultNavigationTimeout(15000)
            # Hide the webdriver flag from the page.
            await page.evaluateOnNewDocument('Object.defineProperty(navigator, "webdriver", {get: () => false})')
            await page.goto(Uri)
            await page.evaluate(JSexec)
            # Non-blocking wait: time.sleep() would stall the event loop that
            # also services the browser connection.
            await asyncio.sleep(SleepTime)
            cache_path = Output + '-cache.png'
            if FullPage == "true":
                await page.screenshot({'path': cache_path, 'fullPage': True})
            else:
                await page.screenshot({'path': cache_path})
            # Shrink the capture if needed and move it to its final path.
            compress_image(cache_path, Output)
            print('[browser] 生成图片完成')
        finally:
            # Always close the tab so tabs do not pile up in the browser.
            await page.close()
    finally:
        await browser.disconnect()

async def StopChrome():
    """Close the shared Chrome instance and clear ``WsToken``.

    Does nothing when ``WsToken`` is already ``None`` (for example after a
    relaunch that failed): there is no browser to attach to, and raising here
    would stop a later restart request from ever reaching the relaunch step.

    Errors from connecting to or closing the browser propagate; ``WsToken``
    is only cleared after the browser has been closed successfully.
    """
    global WsToken
    if WsToken is None:
        return
    # Attach to the running browser.
    browser = await connect(browserWSEndpoint=WsToken)
    # Shut the browser down, then release the connection.
    await browser.close()
    await browser.disconnect()
    # The endpoint is no longer valid.
    WsToken = None

def upload_files(path_local, path_s3):
    """Upload one local file to S3, raising if the upload fails.

    Per the original note, uploading again under the same key overwrites
    the existing object.

    :param path_local: path of the local file
    :param path_s3: destination key in the bucket
    :raises Exception: when ``upload_single_file`` reports failure
    """
    print('[aws S3] Start upload files.')

    succeeded = upload_single_file(path_local, path_s3)
    if succeeded:
        print('[aws S3] Upload files successful.')
        return

    raise Exception('[aws S3] Upload files failed.')
 
 
def upload_single_file(src_local_path, dest_s3_path):
    """Upload a single file to the configured bucket.

    Callers are expected to go through ``upload_files`` instead of calling
    this directly.

    :param src_local_path: path of the local file to read
    :param dest_s3_path: destination key in the bucket
    :return: ``True`` on success; ``False`` if opening or uploading the file
        raised (the error is printed, not propagated)
    """
    uploaded = True
    try:
        with open(src_local_path, 'rb') as fh:
            s3.upload_fileobj(fh, BUCKET_NAME, dest_s3_path)
    except Exception as err:
        print(f'[aws S3] Upload data failed. | src: {src_local_path} | dest: {dest_s3_path} | Exception: {err}')
        uploaded = False
    return uploaded

# S3 client setup
BUCKET_NAME = ""  # name of the target bucket

# aws_access_key_id and aws_secret_access_key, stored base64-encoded.
# NOTE(review): base64 is an encoding, not encryption; anyone who can read
# this file can recover the keys.
S3_AKI = b''
S3_SAK = b''
# Decode to str and strip any single quotes left in the decoded value.
CN_S3_AKI = base64.b64decode(S3_AKI).decode('utf-8').replace("'", "")
CN_S3_SAK = base64.b64decode(S3_SAK).decode('utf-8').replace("'", "")
CN_REGION_NAME = ''  # region name (the domain prefix)
ENDPOINT_URL = ''  # endpoint URL; use exactly the one the provider gives you
# Create the session and the S3 client used by upload_single_file().
s3_session = boto3.Session(
    aws_access_key_id=CN_S3_AKI,
    aws_secret_access_key=CN_S3_SAK,
    region_name=CN_REGION_NAME,
)
s3 = s3_session.client("s3", endpoint_url=ENDPOINT_URL)
print('[aws S3] 初始化完成')

class apiHandler(BaseHTTPServer.BaseHTTPRequestHandler):
    """HTTP handler for the screenshot / Chrome-restart API.

    Parameters are read from the query string (GET) or from a URL-encoded
    body (POST):

    * ``token``     access token (required)
    * ``uri``       page to capture, default ``https://www.baidu.com``
    * ``path``      output file name below the output directory, default ``test.jpg``
    * ``w``, ``h``  viewport size in pixels, default 1920 x 1080
    * ``fullpage``  ``"true"`` for a full-page capture
    * ``jsexec``    JavaScript evaluated in the page after navigation
    * ``sleeptime`` seconds to wait before capturing, default 0
    * ``restart``   ``"1"`` restarts Chrome instead of taking a screenshot

    The status line and headers (``200``, ``text/plain``) are sent before any
    work is done, so failures are reported in the body only, prefixed with
    the error marker.  This is kept as-is for existing clients.
    """

    # Replace this value with your own token to control access.
    _TOKEN = 'xry1029'
    # Directory screenshots are written to before being uploaded.
    _OUTPUT_DIR = "/home/user/www/"
    # Prefix of every error response body.
    _ERROR_PREFIX = '服务器错误: '

    def do_GET(self):
        """Serve a GET request; parameters come from the query string."""
        # Split at the FIRST '?': an unescaped '?' inside the `uri` value must
        # not swallow the parameters that precede it.
        path, _, query = self.path.partition('?')
        self._response(path, query)

    def do_POST(self):
        """Serve a POST request; parameters come from the URL-encoded body."""
        # A missing, malformed or negative Content-Length is treated as an
        # empty body instead of crashing (or blocking on read(-1)).
        try:
            length = max(0, int(self.headers.get('content-length') or 0))
        except ValueError:
            length = 0
        args = self.rfile.read(length).decode("utf-8")
        self._response(self.path, args)

    def _response(self, path, args):
        """Send the response for a request whose raw parameter string is ``args``.

        :param path: request path (unused, kept for the existing signature)
        :param args: URL-encoded parameter string, may be empty or ``None``
        """
        self.send_response(200)
        self.send_header('Content-type','text/plain')
        self.end_headers()
        # Keep only the first value of every parameter.
        if args:
            params = {k: v[0] for k, v in urlparse.parse_qs(args).items()}
        else:
            params = {}
        try:
            response_str = self._dispatch(params)
        except Exception as e:
            # Every failure (bad token, bad numbers, browser or upload
            # errors) becomes a body the client can read.
            response_str = self._ERROR_PREFIX + str(e)
        self.wfile.write(response_str.encode("UTF-8"))

    def _dispatch(self, params):
        """Check the token, run the requested action and return the response text."""
        if params.get("token", "null") != self._TOKEN:
            print("[httpd] 无token访问...")
            raise Exception('Bad Token')

        if params.get("restart", "0") == "1":
            self._run_async(StopChrome())
            # Runs outside the main start-up code, hence the launcher variant
            # that does not register signal handlers.
            self._run_async(launchchromenew())
            # Existing clients receive this text for a successful restart.
            return self._ERROR_PREFIX + '重启完成'

        uri = params.get("uri", "https://www.baidu.com")
        rel_path = params.get("path", "test.jpg")
        width = int(params.get("w", "1920"))
        height = int(params.get("h", "1080"))
        full_page = params.get("fullpage", "false")
        js_exec = params.get("jsexec", "void(0);")
        sleep_time = int(params.get("sleeptime", "0"))

        local_path = self._resolve_output_path(rel_path)
        self._run_async(screenshot(width, height, uri, local_path, js_exec, sleep_time, full_page))
        # Upload to S3 so the image can be referenced (e.g. from markdown).
        upload_files(local_path, 'data/' + rel_path)
        return "<你的s3域名>" + rel_path

    def _resolve_output_path(self, rel_path):
        """Return the local file path for ``rel_path``, rejecting paths that escape the output directory."""
        base = os.path.normpath(self._OUTPUT_DIR)
        target = os.path.normpath(self._OUTPUT_DIR + rel_path)
        if not target.startswith(base + os.sep):
            raise ValueError('Bad Path')
        return self._OUTPUT_DIR + rel_path

    @staticmethod
    def _run_async(coro):
        """Run ``coro`` to completion on a fresh event loop and always close that loop."""
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            return loop.run_until_complete(coro)
        finally:
            loop.close()

class ThreadedServer(SocketServer.ThreadingMixIn, BaseHTTPServer.HTTPServer):
    """HTTP server combined with ``ThreadingMixIn``, carrying a shared lock."""

    def __init__(self, *server_args, **server_kwargs):
        """Forward all arguments to ``HTTPServer`` and create ``screen_lock``."""
        super().__init__(*server_args, **server_kwargs)
        # Lock meant for serialising work; nothing in this file acquires it yet.
        self.screen_lock = threading.Lock()



if __name__ == "__main__":
    # Command-line options: listen address and port.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-ip', '--address',
        required=False,
        default='127.0.0.1',
        help='IP address to listen. Default is 127.0.0.1',
    )
    parser.add_argument('-p', '--port', type=int, default=9087, help='port to bind')
    args = parser.parse_args()

    # Set up the server.
    server = ThreadedServer((args.address, args.port), apiHandler)

    print(f'[httpd] Server is Ready. {args.address}:{args.port}')

    # handle_request() processes one request per call; keep going until Ctrl-C.
    try:
        while True:
            server.handle_request()
    except KeyboardInterrupt:
        pass

    print("[main] Control-C hit: Exiting server...")
    # Shut Chrome down on a fresh event loop before exiting.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    loop.run_until_complete(StopChrome())
    loop.close()

用apipost之类的可以很方便的调试api

大概1.5s一张图,还是不太稳定..希望有大佬能帮忙提提意见

posted @ 2023-01-18 11:00  星如雨yu  阅读(593)  评论(0编辑  收藏  举报