Building a Benchmark Tool with the ONNX Runtime C++ API
This post is part of a series:
1. nn-Meter: building a CNN inference latency predictor
2. nn-Meter: a custom ONNX Runtime backend
3. Building a benchmark tool with the ONNX Runtime C++ API (this post)
4. Building a C++ benchmark with Paddle Lite

The code is available at https://github.com/SheepHuan/UnifiedModelBenchmark/tree/onnxruntime_android
1 Environment Setup
1.1 Dependencies
- google gflags
- microsoft onnxruntime
1.2 Building
1.2.1 Building gflags
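This subsection is empty in the original post. As a reference, below is a minimal sketch of cross-compiling gflags as a static library with the Android NDK's CMake toolchain; the NDK path, ABI, and API level are assumptions and should be adjusted for your environment.

```bash
# Cross-compile gflags for Android (NDK path, ABI and API level are placeholders)
git clone https://github.com/gflags/gflags.git
cd gflags && mkdir build && cd build
cmake .. \
  -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
  -DANDROID_ABI=arm64-v8a \
  -DANDROID_PLATFORM=android-29 \
  -DBUILD_SHARED_LIBS=OFF \
  -DCMAKE_INSTALL_PREFIX=$(pwd)/install
make -j$(nproc) && make install
```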
1.2.2 Building onnxruntime
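Likewise empty in the original; here is a sketch of onnxruntime's own build.sh invocation for an Android build with the NNAPI execution provider enabled. The SDK/NDK paths, ABI, and API level are placeholders.

```bash
# Build the onnxruntime shared library for Android with the NNAPI EP
# (SDK/NDK paths, ABI and API level are assumptions; adjust for your setup)
git clone --recursive https://github.com/microsoft/onnxruntime.git
cd onnxruntime
./build.sh --config Release --parallel \
  --android \
  --android_sdk_path $ANDROID_SDK \
  --android_ndk_path $ANDROID_NDK \
  --android_abi arm64-v8a \
  --android_api 29 \
  --use_nnapi \
  --build_shared_lib
```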
2 Code Implementation
2.1 Key Features
- Run inference with ORT's CPU and NNAPI backends.
- Automatically read a CV model's input/output nodes and construct the input tensors.
- Compute the mean and standard deviation of the end-to-end model latency, as nn-Meter requires.
- Set the number of inference threads.
- (TODO) Run inference for language and speech models.
Some parts of the implementation are not yet optimal and will be improved later.
2.2 Implementation
#include "core/session/onnxruntime_cxx_api.h" #include "core/session/onnxruntime_c_api.h" #ifdef ANDROID_PLATFORM #include "providers/nnapi/nnapi_provider_factory.h" #endif #include <chrono> #include <iostream> #include <sstream> #include <string> #include <cstdio> #include <vector> #include "mutils/log.hpp" #include <gflags/gflags.h> #include <cmath> DEFINE_string(graph, "", "onnx model path"); DEFINE_int32(warmup_runs, 3, "warmup_runs"); DEFINE_int32(num_runs, 10, "num_runs"); DEFINE_int32(num_threads, 3, "num_threads"); // DEFINE_bool(use_nnapi, false, "use nnapi"); DEFINE_bool(enable_op_profiling, false, "enable_op_profiling"); DEFINE_string(prefix, "", "result"); void calc_std_deviation(std::vector<double> arr, int size,double& latency_avg ,double& latency_std) { double sum = 0.0, mean, stddev = 0.0; // double min_val,max_val; for(int i=0; i<size; ++i) { sum += arr[i]; } mean = sum/size; for(int i=0; i<size; ++i) { stddev += pow(arr[i] - mean, 2); } latency_avg = mean; latency_std = sqrt(stddev/size); // return sqrt(stddev/size); } int run(Ort::Session &session) { std::vector<Ort::AllocatedStringPtr> ptrs; std::vector<Ort::Value> input_tensors; std::vector<std::string> input_names; std::vector<std::string> output_names; size_t input_count = session.GetInputCount(); size_t output_count = session.GetOutputCount(); auto mem_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); Ort::AllocatorWithDefaultOptions allocator; for (size_t i = 0; i < input_count; i++) { std::vector<int64_t> input_dim = session.GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape(); int data_type = session.GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetElementType(); session.GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetElementType(); std::cout << "input dim: "; size_t size = 1; for (size_t j = 0; j < input_dim.size(); j++) { if (input_dim[j] == -1) { input_dim[j] = 1; } std::cout << input_dim[j] << " "; size *= input_dim[j]; } std::cout << std::endl; // 获取输入 Ort::AllocatedStringPtr input_name_ptr = session.GetInputNameAllocated(i, allocator); std::string input_name = std::string(input_name_ptr.get()); input_names.push_back(input_name); std::cout << "input_name: " << input_names[i] << std::endl; float *float32_data = (float *)malloc(sizeof(float) * size); input_tensors.push_back( Ort::Value::CreateTensor<float>( mem_info, float32_data, size, input_dim.data(), input_dim.size())); } for (size_t i = 0; i < output_count; i++) { Ort::AllocatedStringPtr output_name_ptr = session.GetOutputNameAllocated(i, allocator); std::string output_name = std::string(output_name_ptr.get()); output_names.push_back(output_name); std::cout << "output_name: " << output_names[i] << std::endl; } int warmup_rounds = FLAGS_warmup_runs; int run_rounds = FLAGS_num_runs; double warmup_time = 0; for (int i = 0; i < warmup_rounds; i++) { std::vector<const char *> input_names_ptr; std::vector<const char *> output_names_ptr; for (size_t i = 0; i < input_count; i++) { input_names_ptr.push_back(input_names[i].c_str()); } for (size_t i = 0; i < output_count; i++) { output_names_ptr.push_back(output_names[i].c_str()); } std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now(); auto output_tensors = session.Run(Ort::RunOptions{nullptr}, input_names_ptr.data(), input_tensors.data(), input_count, output_names_ptr.data(), output_count); std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now(); auto time_span = 
std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1); warmup_time = warmup_time + time_span.count(); } double latency_avg = 0,latency_std=0; std::vector<double> latency_per_rounds; for (int i = 0; i < run_rounds; i++) { std::vector<const char *> input_names_ptr; std::vector<const char *> output_names_ptr; for (size_t i = 0; i < input_count; i++) { input_names_ptr.push_back(input_names[i].c_str()); } for (size_t i = 0; i < output_count; i++) { output_names_ptr.push_back(output_names[i].c_str()); } std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now(); auto output_tensors = session.Run(Ort::RunOptions{nullptr}, input_names_ptr.data(), input_tensors.data(), input_count, output_names_ptr.data(), output_count); std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now(); auto time_span = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1); // run_time = run_time + time_span.count(); latency_per_rounds.push_back(time_span.count()); } calc_std_deviation(latency_per_rounds,latency_per_rounds.size(),latency_avg,latency_std); printf("warmup: %d rounds, avg time: %f ms\nrun: %d rounds, avg time: %f +- %f ms\n",warmup_rounds,warmup_time*1.0/warmup_rounds,run_rounds,latency_avg,latency_std); return 0; } int main(int argc, char **argv) { // 解析命令行参数 gflags::ParseCommandLineFlags(&argc, &argv, true); std::string model_path = FLAGS_graph; bool enable_op_profiling = FLAGS_enable_op_profiling; std::string result_prefix = FLAGS_prefix; Ort::Env env = Ort::Env{ORT_LOGGING_LEVEL_ERROR, "Default"}; Ort::SessionOptions session_options; if (enable_op_profiling) session_options.EnableProfiling(result_prefix.c_str()); Ort::Session session{env, model_path.c_str(), session_options}; // CPU run(session); return 0; }
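Note that main() above creates the session with default options, so only the CPU execution provider runs and FLAGS_num_threads is never applied, even though both are listed in 2.1. A minimal sketch of how they could be wired in, assuming the standard Ort::SessionOptions API and the nnapi_provider_factory.h header already included above:

```cpp
Ort::SessionOptions session_options;
// Apply the --num_threads flag to intra-op parallelism.
session_options.SetIntraOpNumThreads(FLAGS_num_threads);
#ifdef ANDROID_PLATFORM
// Append the NNAPI execution provider; ops it does not support fall back to CPU.
uint32_t nnapi_flags = 0;
Ort::ThrowOnError(
    OrtSessionOptionsAppendExecutionProvider_Nnapi(session_options, nnapi_flags));
#endif
Ort::Session session{env, model_path.c_str(), session_options};
```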
3 Testing
```bash
# Benchmark an ONNX model
# Set the environment variable so that libonnxruntime.so and its dependencies can be found
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/data/local/tmp/hcp/libs"
# Run with the desired flags
./main --graph="/mnt/sdcard/ort_models/FasterRCNN-12.onnx" --warmup_runs 3 --num_runs 10
```
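When testing on an Android device, the cross-compiled binary, the onnxruntime shared library, and the model first have to be pushed to the phone. A sketch using the paths from the command above (the exact file locations are assumptions):

```bash
# Push the benchmark binary, the ORT shared library and a model to the device
adb push main /data/local/tmp/hcp/
adb push libonnxruntime.so /data/local/tmp/hcp/libs/
adb push FasterRCNN-12.onnx /mnt/sdcard/ort_models/
adb shell chmod +x /data/local/tmp/hcp/main
# Open a shell on the device and run the commands above
adb shell
```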
Author: SheepHuan
Original post: https://www.cnblogs.com/sheephuan/p/17430937.html
License: This work is licensed under the Creative Commons Attribution-NonCommercial-NoDerivs 2.5 China Mainland License.