Building a Benchmark Tool with the ONNX Runtime C++ API
This post is part of a series:
1. nn-Meter: building a CNN inference latency predictor
2. nn-Meter: a custom ONNX Runtime backend
3. Building a benchmark tool with the ONNX Runtime C++ API (this post)
4. Building a C++ benchmark with Paddle Lite

The code is available at https://github.com/SheepHuan/UnifiedModelBenchmark/tree/onnxruntime_android
1 Environment Setup
1.1 Dependencies
- google gflags
- microsoft onnxruntime
1.2 Building
1.2.1 Building gflags
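This subsection is empty in the original post. As a reference, below is a minimal sketch of cross-compiling gflags as a static library with the Android NDK's CMake toolchain; the NDK path, ABI, and API level are assumptions and should be adjusted for your environment.

```bash
# Cross-compile gflags for Android (NDK path, ABI and API level are placeholders)
git clone https://github.com/gflags/gflags.git
cd gflags && mkdir build && cd build
cmake .. \
  -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
  -DANDROID_ABI=arm64-v8a \
  -DANDROID_PLATFORM=android-29 \
  -DBUILD_SHARED_LIBS=OFF \
  -DCMAKE_INSTALL_PREFIX=$(pwd)/install
make -j$(nproc) && make install
```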
1.2.2 Building onnxruntime
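Likewise empty in the original; here is a sketch of onnxruntime's own build.sh invocation for an Android build with the NNAPI execution provider enabled. The SDK/NDK paths, ABI, and API level are placeholders.

```bash
# Build the onnxruntime shared library for Android with the NNAPI EP
# (SDK/NDK paths, ABI and API level are assumptions; adjust for your setup)
git clone --recursive https://github.com/microsoft/onnxruntime.git
cd onnxruntime
./build.sh --config Release --parallel \
  --android \
  --android_sdk_path $ANDROID_SDK \
  --android_ndk_path $ANDROID_NDK \
  --android_abi arm64-v8a \
  --android_api 29 \
  --use_nnapi \
  --build_shared_lib
```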
2 Code Implementation
2.1 Key Features
- Run inference with ORT's CPU and NNAPI backends.
- Automatically read a CV model's input/output nodes and construct the input tensors.
- Compute the mean and standard deviation of the end-to-end model latency, as nn-Meter requires.
- Set the number of inference threads.
- (TODO) Run inference for language and speech models.
Some parts of the implementation are not yet optimal and will be improved later.
2.2 Implementation
#include "core/session/onnxruntime_cxx_api.h" #include "core/session/onnxruntime_c_api.h" #ifdef ANDROID_PLATFORM #include "providers/nnapi/nnapi_provider_factory.h" #endif #include <chrono> #include <iostream> #include <sstream> #include <string> #include <cstdio> #include <vector> #include "mutils/log.hpp" #include <gflags/gflags.h> #include <cmath> DEFINE_string(graph, "", "onnx model path"); DEFINE_int32(warmup_runs, 3, "warmup_runs"); DEFINE_int32(num_runs, 10, "num_runs"); DEFINE_int32(num_threads, 3, "num_threads"); // DEFINE_bool(use_nnapi, false, "use nnapi"); DEFINE_bool(enable_op_profiling, false, "enable_op_profiling"); DEFINE_string(prefix, "", "result"); void calc_std_deviation(std::vector<double> arr, int size,double& latency_avg ,double& latency_std) { double sum = 0.0, mean, stddev = 0.0; // double min_val,max_val; for(int i=0; i<size; ++i) { sum += arr[i]; } mean = sum/size; for(int i=0; i<size; ++i) { stddev += pow(arr[i] - mean, 2); } latency_avg = mean; latency_std = sqrt(stddev/size); // return sqrt(stddev/size); } int run(Ort::Session &session) { std::vector<Ort::AllocatedStringPtr> ptrs; std::vector<Ort::Value> input_tensors; std::vector<std::string> input_names; std::vector<std::string> output_names; size_t input_count = session.GetInputCount(); size_t output_count = session.GetOutputCount(); auto mem_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); Ort::AllocatorWithDefaultOptions allocator; for (size_t i = 0; i < input_count; i++) { std::vector<int64_t> input_dim = session.GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape(); int data_type = session.GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetElementType(); session.GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetElementType(); std::cout << "input dim: "; size_t size = 1; for (size_t j = 0; j < input_dim.size(); j++) { if (input_dim[j] == -1) { input_dim[j] = 1; } std::cout << input_dim[j] << " "; size *= input_dim[j]; } std::cout << std::endl; // 获取输入 Ort::AllocatedStringPtr input_name_ptr = session.GetInputNameAllocated(i, allocator); std::string input_name = std::string(input_name_ptr.get()); input_names.push_back(input_name); std::cout << "input_name: " << input_names[i] << std::endl; float *float32_data = (float *)malloc(sizeof(float) * size); input_tensors.push_back( Ort::Value::CreateTensor<float>( mem_info, float32_data, size, input_dim.data(), input_dim.size())); } for (size_t i = 0; i < output_count; i++) { Ort::AllocatedStringPtr output_name_ptr = session.GetOutputNameAllocated(i, allocator); std::string output_name = std::string(output_name_ptr.get()); output_names.push_back(output_name); std::cout << "output_name: " << output_names[i] << std::endl; } int warmup_rounds = FLAGS_warmup_runs; int run_rounds = FLAGS_num_runs; double warmup_time = 0; for (int i = 0; i < warmup_rounds; i++) { std::vector<const char *> input_names_ptr; std::vector<const char *> output_names_ptr; for (size_t i = 0; i < input_count; i++) { input_names_ptr.push_back(input_names[i].c_str()); } for (size_t i = 0; i < output_count; i++) { output_names_ptr.push_back(output_names[i].c_str()); } std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now(); auto output_tensors = session.Run(Ort::RunOptions{nullptr}, input_names_ptr.data(), input_tensors.data(), input_count, output_names_ptr.data(), output_count); std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now(); auto time_span = 
std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1); warmup_time = warmup_time + time_span.count(); } double latency_avg = 0,latency_std=0; std::vector<double> latency_per_rounds; for (int i = 0; i < run_rounds; i++) { std::vector<const char *> input_names_ptr; std::vector<const char *> output_names_ptr; for (size_t i = 0; i < input_count; i++) { input_names_ptr.push_back(input_names[i].c_str()); } for (size_t i = 0; i < output_count; i++) { output_names_ptr.push_back(output_names[i].c_str()); } std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now(); auto output_tensors = session.Run(Ort::RunOptions{nullptr}, input_names_ptr.data(), input_tensors.data(), input_count, output_names_ptr.data(), output_count); std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now(); auto time_span = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1); // run_time = run_time + time_span.count(); latency_per_rounds.push_back(time_span.count()); } calc_std_deviation(latency_per_rounds,latency_per_rounds.size(),latency_avg,latency_std); printf("warmup: %d rounds, avg time: %f ms\nrun: %d rounds, avg time: %f +- %f ms\n",warmup_rounds,warmup_time*1.0/warmup_rounds,run_rounds,latency_avg,latency_std); return 0; } int main(int argc, char **argv) { // 解析命令行参数 gflags::ParseCommandLineFlags(&argc, &argv, true); std::string model_path = FLAGS_graph; bool enable_op_profiling = FLAGS_enable_op_profiling; std::string result_prefix = FLAGS_prefix; Ort::Env env = Ort::Env{ORT_LOGGING_LEVEL_ERROR, "Default"}; Ort::SessionOptions session_options; if (enable_op_profiling) session_options.EnableProfiling(result_prefix.c_str()); Ort::Session session{env, model_path.c_str(), session_options}; // CPU run(session); return 0; }
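Note that main() above creates the session with default options, so only the CPU execution provider runs and FLAGS_num_threads is never applied, even though both are listed in 2.1. A minimal sketch of how they could be wired in, assuming the standard Ort::SessionOptions API and the nnapi_provider_factory.h header already included above:

```cpp
Ort::SessionOptions session_options;
// Apply the --num_threads flag to intra-op parallelism.
session_options.SetIntraOpNumThreads(FLAGS_num_threads);
#ifdef ANDROID_PLATFORM
// Append the NNAPI execution provider; ops it does not support fall back to CPU.
uint32_t nnapi_flags = 0;
Ort::ThrowOnError(
    OrtSessionOptionsAppendExecutionProvider_Nnapi(session_options, nnapi_flags));
#endif
Ort::Session session{env, model_path.c_str(), session_options};
```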
3 Testing
```bash
# Benchmark an ONNX model
# Set the environment variable so that libonnxruntime.so and its dependencies can be found
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/data/local/tmp/hcp/libs"
# Run with the desired flags
./main --graph="/mnt/sdcard/ort_models/FasterRCNN-12.onnx" --warmup_runs 3 --num_runs 10
```
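When testing on an Android device, the cross-compiled binary, the onnxruntime shared library, and the model first have to be pushed to the phone. A sketch using the paths from the command above (the exact file locations are assumptions):

```bash
# Push the benchmark binary, the ORT shared library and a model to the device
adb push main /data/local/tmp/hcp/
adb push libonnxruntime.so /data/local/tmp/hcp/libs/
adb push FasterRCNN-12.onnx /mnt/sdcard/ort_models/
adb shell chmod +x /data/local/tmp/hcp/main
# Open a shell on the device and run the commands above
adb shell
```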
Author: SheepHuan
Original post: https://www.cnblogs.com/sheephuan/p/17430937.html
License: This work is licensed under the Creative Commons Attribution-NonCommercial-NoDerivs 2.5 China Mainland License.