When building a platform on top of Flink, we need Flink's own restful API for two things: listing a job's checkpoints for display, and verifying that a checkpoint still exists before starting a job from it.
For the restful API, see the official documentation: https://ci.apache.org/projects/flink/flink-docs-release-1.9/monitoring/rest_api.html
A few points deserve special attention:
1. In a platform, the platform's task ID and Flink's job ID (jobid) are not the same concept, so they have to be linked by job name: fetch all jobs via '/jobs/overview' and match on the name field.
2. Checkpoints and savepoints share the same underlying representation; the only distinction in the API response is the 'is_savepoint' field.
3. For the existence check, Flink paths come in several formats (file:///, hdfs:///, plus Linux vs. Windows differences). Copying the relevant code out of the flink-core source would cost too much, so I added a dependency on '<artifactId>flink-core</artifactId>' instead. This is a fairly heavy dependency, but I expect the platform will need it later anyway.
4. The validity check corresponds to AbstractFsCheckpointStorage.resolveCheckpointPointer() in the Flink source, but I have not been able to write it yet. As far as I can tell, the executionJobVertex used in that validation is only built after the executionGraph is constructed, whereas my check only has the checkpointPointer path as input and should not involve those later steps. I believe that part only comes into play when a job is actually started from a checkpoint, so for now I will wait until the checkpoint-restore code is written and use it as a reference.
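Until the full check is ported, the structural part of point 4 can at least be sketched with the JDK alone. The helper below is a hypothetical sketch (class and method names are my own, not Flink's): it parses the pointer's scheme, and for local paths checks that a checkpoint directory contains the _metadata file that Flink writes on completion. It deliberately does not handle hdfs:// and the other schemes from point 3; those still need flink-core's FileSystem abstraction.

```java
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

// Hypothetical helper: a structural pre-check of a checkpoint/savepoint pointer
// using only the JDK. Covers local paths; hdfs:// etc. still need flink-core.
public class CheckpointPointerPreCheck {

    /** Resolves a pointer to its _metadata file, or throws if the pointer is unusable. */
    public static Path resolveMetadataFile(String checkpointPointer) throws IOException {
        if (checkpointPointer == null || checkpointPointer.trim().isEmpty()) {
            throw new IllegalArgumentException("checkpointPointer must not be empty");
        }
        // parse the scheme; a pointer without a scheme is treated as a local path
        final URI uri;
        try {
            uri = new URI(checkpointPointer);
        } catch (URISyntaxException e) {
            throw new IOException("'" + checkpointPointer + "' is not a valid URI", e);
        }
        String scheme = uri.getScheme();
        if (scheme != null && !scheme.equals("file")) {
            // hdfs://, s3://, ... would need the flink-core FileSystem here
            throw new IOException("Scheme '" + scheme + "' is not supported by this local pre-check");
        }
        Path path = (scheme == null) ? Paths.get(checkpointPointer) : Paths.get(uri);
        if (!Files.exists(path)) {
            throw new FileNotFoundException("No checkpoint/savepoint at '" + checkpointPointer + "'");
        }
        // a completed checkpoint directory must contain the _metadata file
        if (Files.isDirectory(path)) {
            Path metadata = path.resolve("_metadata");
            if (!Files.exists(metadata)) {
                throw new FileNotFoundException("No _metadata file in '" + checkpointPointer + "'");
            }
            return metadata;
        }
        return path; // the pointer already names the metadata file itself
    }
}
```

This only proves the pointer is structurally usable; whether its contents match the job being started is exactly the part that resolveCheckpointPointer() and the restore path handle inside Flink.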
The full code is as follows:
package com.xx.monitor.metric.example;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.xx.common.http.Request;
import com.xx.common.http.Response;
import com.google.common.base.Preconditions;
import org.apache.flink.core.fs.FileStatus;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;

import java.io.FileNotFoundException;
import java.io.IOException;

/**
 * Description:
 * 1. list the checkpoints of a flink job
 * 2. list the savepoints of a flink job
 * 3. validate a checkpoint/savepoint path
 * @author: wulei
 * Version: 1.0
 * Create Date Time: 2020/2/25 11:24 AM.
 * Update Date Time:
 */
public class CheckPointDemoTest {

    /** checkpoint type flag */
    public static final String CHECKPOINT_TYPE = "checkpoint";
    /** savepoint type flag */
    public static final String SAVEPOINT_TYPE = "savepoint";
    /** flink restful address */
    public static final String ADDRESS_PREFIX = "http://192.168.90.212:8081";
    /** flink restful api */
    public static final String JOBS_OVERVIEW_SUFFIX = "/jobs/overview";
    public static final String JOBS_CHECKPOINTS_SUFFIX = "/jobs/%s/checkpoints";

    public static void main(String[] args) {
        // list demo
        checkpointList("Socket Window WordCount");
        savepointList("Socket Window WordCount");
        // validate demo
        try {
            validate("checkpointPointer");
        } catch (IOException e) {
            // process IOException
        }
    }

    /**
     * 1. existence check of the checkpoint/savepoint path
     * 2. validity check
     * @param checkpointPointer
     * @throws IOException
     */
    public static void validate(String checkpointPointer) throws IOException {
        // 1. existence check
        Preconditions.checkNotNull(checkpointPointer, "checkpointPointer cannot be null");
        Preconditions.checkArgument(!checkpointPointer.isEmpty(), "checkpointPointer cannot be empty");

        // check if the pointer is in fact a valid file path
        final Path path;
        try {
            path = new Path(checkpointPointer);
        } catch (Exception e) {
            throw new IOException("Checkpoint/savepoint path '" + checkpointPointer + "' is not a valid file URI. "
                    + "Either the pointer path is invalid, or the checkpoint was created by a different state backend.");
        }

        // check if the file system can be accessed
        final FileSystem fs;
        try {
            fs = path.getFileSystem();
        } catch (IOException e) {
            throw new IOException("Cannot access file system for checkpoint/savepoint path '"
                    + checkpointPointer + "'.", e);
        }

        final FileStatus status;
        try {
            status = fs.getFileStatus(path);
        } catch (FileNotFoundException e) {
            throw new FileNotFoundException("Cannot find checkpoint or savepoint "
                    + "file/directory '" + checkpointPointer + "' on file system '"
                    + fs.getUri().getScheme() + "'.");
        }

        // if we are here, the file / directory exists
        final Path checkpointDir;
        final FileStatus metadataFileStatus;

        // if this is a directory, we need to find the meta data file
        if (status.isDir()) {
            checkpointDir = status.getPath();
            final Path metadataFilePath = new Path(path, "_metadata");
            try {
                metadataFileStatus = fs.getFileStatus(metadataFilePath);
            } catch (FileNotFoundException e) {
                throw new FileNotFoundException("Cannot find meta data file '_metadata' in directory '"
                        + path + "'. Please try to load the checkpoint/savepoint "
                        + "directly from the metadata file instead of the directory.");
            }
        }

        // 2. validity check (see AbstractFsCheckpointStorage.resolveCheckpointPointer(), not implemented yet)
    }

    public static void checkpointList(String jobName) {
        points(CHECKPOINT_TYPE, jobName, ADDRESS_PREFIX);
    }

    public static void savepointList(String jobName) {
        points(SAVEPOINT_TYPE, jobName, ADDRESS_PREFIX);
    }

    /**
     * Flink exposes its own monitoring data through a restful api.
     * Two points are key for platform integration:
     * 1. linking the flink jobid to the platform taskId (taskInstanceId);
     * 2.
     * http://192.168.90.212:8081/jobs/631fea77b842918e2e33070e45c03477/checkpoints
     * @param pointType
     * @param jobName
     * @param address
     */
    private static void points(String pointType, String jobName, String address) {
        // get all jobs, then look up the job id by job name
        Response<JSONObject> response = new Request(address + JOBS_OVERVIEW_SUFFIX).GET();
        String jid = null;
        if (response.isSuccess()) {
            JSONArray jobArray = response.getContent().getJSONArray("jobs");
            for (int i = 0; i < jobArray.size(); i++) {
                JSONObject jobJson = jobArray.getJSONObject(i);
                String name = jobJson.getString("name");
                if (null != name && name.equals(jobName)) {
                    jid = jobJson.getString("jid");
                    break;
                }
            }
        }
        if (jid == null) {
            return;
        }
        Response<JSONObject> responseCheckPoints =
                new Request(address + String.format(JOBS_CHECKPOINTS_SUFFIX, jid)).GET();
        // note: check the checkpoints response here, not the overview response
        if (responseCheckPoints.isSuccess()) {
            JSONArray historyArray = responseCheckPoints.getContent().getJSONArray("history");
            for (int i = 0; i < historyArray.size(); i++) {
                JSONObject pointJson = historyArray.getJSONObject(i);
                // getBooleanValue avoids a NullPointerException on a missing field
                boolean isSavepoint = pointJson.getBooleanValue("is_savepoint");
                if (pointType.equals(CHECKPOINT_TYPE) && !isSavepoint) {
                    // checkpoint information; this demo only reads the first (most recent) match
                    String id = pointJson.getString("id");
                    String status = pointJson.getString("status");
                    String triggerTimestamp = pointJson.getString("trigger_timestamp");
                    String endToEndDuration = pointJson.getString("end_to_end_duration");
                    String numAcknowledgedSubtasks = pointJson.getString("num_acknowledged_subtasks");
                    break;
                } else if (pointType.equals(SAVEPOINT_TYPE) && isSavepoint) {
                    // savepoint information
                }
            }
        }
    }
}
<!-- Flink dependencies -->
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-core</artifactId>
    <version>${flink.version}</version>
</dependency>
These are my initial thoughts. There is a catch, though: since all the job information above comes through the restful API, once the Flink cluster itself is down we can no longer query checkpoint information, and therefore can no longer restore a job from its checkpoint/savepoint. So the checkpoint/savepoint paths still need to be persisted by the platform. Concretely, give every run its own unique checkpoint/savepoint path, so that each instance ID maps one-to-one to a checkpoint/savepoint location, e.g. /checkpoints/<taskid>/<instanceid>/<jobid>/chk-n.
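To make that one-to-one mapping concrete, a small helper could derive the per-run checkpoint root handed to Flink and recover the IDs from a persisted path. This is a hypothetical sketch of the layout above (the class, method names, and root directory are my own choices): Flink itself appends <jobid>/chk-n under the configured checkpoint directory, so persisting (taskId, instanceId) is enough to locate the data later.

```java
// Hypothetical helper for the /checkpoints/<taskid>/<instanceid> layout.
// Flink appends <jobid>/chk-n underneath the configured directory at runtime.
public class CheckpointPathLayout {

    private static final String CHECKPOINT_ROOT = "/checkpoints";

    /** directory configured as the checkpoint dir for one run of a platform task */
    public static String checkpointDirFor(String taskId, String instanceId) {
        return CHECKPOINT_ROOT + "/" + taskId + "/" + instanceId;
    }

    /** recovers {taskId, instanceId} from any path under the checkpoint root */
    public static String[] parse(String checkpointPath) {
        if (!checkpointPath.startsWith(CHECKPOINT_ROOT + "/")) {
            throw new IllegalArgumentException("not under " + CHECKPOINT_ROOT + ": " + checkpointPath);
        }
        String[] parts = checkpointPath.substring(CHECKPOINT_ROOT.length() + 1).split("/");
        if (parts.length < 2) {
            throw new IllegalArgumentException("expected <taskid>/<instanceid>: " + checkpointPath);
        }
        return new String[]{parts[0], parts[1]};
    }
}
```

With this convention the platform can rebuild the restore path from its own database even when the Flink cluster, and hence the restful API, is unreachable.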