TensorRT: Installation and Usage

Contents

  • Building from source
  • Verifying the installation
  • Aside: environment variables on Ubuntu
  • TensorRT's core interfaces
  • Accelerating inference with the TensorRT Python API
  • Accelerating inference with the TensorRT C++ API

1 Building from source

1.1 Download the source code and library files

Only part of TensorRT is open source; the most central components are closed source. The open part is the TensorRT repository on GitHub, and the closed part is the official TensorRT library distribution. Since the CUDA Runtime API on my machine is version 10.0, TensorRT 7.0 is used as the example here.

  • Download the open-source code
git clone -b release/7.0 https://github.com/nvidia/TensorRT TensorRT
cd TensorRT
# initialize and update every submodule in the repository
git submodule update --init --recursive
export TRT_SOURCE=`pwd`
  • Download the closed-source TensorRT library files
cd ~/Downloads
# cuda-10.0 below refers to the CUDA Runtime API version, which is installed by the CUDA Toolkit installer
tar -xvzf TensorRT-7.0.0.11.Ubuntu-16.04.x86_64-gnu.cuda-10.0.cudnn7.6.tar.gz
export TRT_RELEASE=`pwd`/TensorRT-7.0.0.11
# LD_LIBRARY_PATH is the search path for shared libraries; PATH is the search path for executables
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$TRT_RELEASE/lib

The PDF files in the doc directory of the TensorRT-7.0.0.11 release are excellent material for learning TensorRT, especially TensorRT-Developer-Guide.pdf.

1.2 Set up the build environment

The idea here is to build a Docker image for compilation and then compile inside a container, which avoids most environment configuration on the host.

  • Check your Linux distribution and CUDA versions, then adjust docker/ubuntu.Dockerfile accordingly
# show distribution information
lsb_release -a
# show the CUDA Driver API version
nvidia-smi
# show the CUDA Runtime API version
cat /usr/local/cuda/version.txt
# show the cuDNN version
cat /usr/local/cuda/include/cudnn.h | grep CUDNN_MAJOR -A 2
  • Build the compilation image from the Dockerfile
docker build -f docker/ubuntu.Dockerfile --build-arg UBUNTU_VERSION=16.04 --build-arg CUDA_VERSION=10.0 --tag=tensorrt-ubuntu .
  • Run the build container
# if you need root inside the container, add -u 0 to docker run or docker exec
docker run -u 0 -v $TRT_RELEASE:/tensorrt -v $TRT_SOURCE:/workspace/TensorRT -it --name build_container tensorrt-ubuntu:latest

1.3 Build

cd $TRT_SOURCE
mkdir build && cd build
cmake .. -DTRT_LIB_DIR=$TRT_RELEASE/lib -DTRT_BIN_DIR=`pwd`/out -DCUDA_VERSION=10.0
make -j$(nproc)

2 Verifying the installation

These steps are carried out outside the container; the container is only used for building.

2.1 Run the sampleMNIST sample

  • Configure the shared-library search path
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/path/to/TensorRT-7.0.0.11/lib:/usr/local/cuda/lib64
  • Download the test images and run the sample
cd /path/to/TensorRT-7.0.0.11/data/mnist
python download_pgms.py
cd /path/to/TensorRT-7.0.0.11/bin
./sample_mnist

2.2 Install and test the Python API

cd /path/to/TensorRT-7.0.0.11/python
pip install <the tensorrt .whl matching your Python version>

Then start a Python interpreter and check whether import tensorrt succeeds.
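A minimal sanity check (the version string assumes the 7.0.0.11 wheel installed above):

import tensorrt as trt

print(trt.__version__)                  # expect 7.0.0.11 for this release
# constructing a Logger and a Builder exercises the underlying libnvinfer libraries
logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
print('TensorRT Python API looks OK')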

3 Aside: environment variables on Ubuntu

The steps above set quite a few environment variables, but they were all set with export in the current shell, so they are lost as soon as a new shell is opened. It is much more convenient to make the important ones permanent so that every new shell loads them automatically.
Shells can be classified as interactive or non-interactive, and as login or non-login.

  • interactive non-login shell: only loads the environment variables in ~/.bashrc
  • interactive login shell
    First loads /etc/profile (which applies to all users), then looks for ~/.bash_profile, ~/.bash_login and ~/.profile in that order and loads only the first one it finds. My machine has only ~/.profile, whose contents are shown below; note that it sources ~/.bashrc:
# if running bash
if [ -n "$BASH_VERSION" ]; then
    # include .bashrc if it exists
    if [ -f "$HOME/.bashrc" ]; then
        . "$HOME/.bashrc"
    fi
fi

# set PATH so it includes user's private bin directories
PATH="$HOME/bin:$HOME/.local/bin:$PATH"

To keep interactive login shells and interactive non-login shells consistent, .bash_profile usually sources .bashrc.

  • non-interactive shell: inherits its environment variables from the shell that created it.

Reference: Difference Between .bashrc, .bash_profile, and .profile

4 TensorRT's core interfaces

  • Parser: includes the Caffe parser, the UFF parser and the ONNX parser. A parser's main job is to create a Network Definition from a trained network.
  • Network Definition: provides methods for specifying the definition of a network. The description below is from the TensorRT Developer Guide:

The Network Definition interface provides methods for the application to specify the definition of a network. Input and output tensors can be specified, layers can be added, and there is an interface for configuring each supported layer type. As well as layer types, such as convolutional and recurrent layers, a Plugin layer type allows the application to implement functionality not natively supported by TensorRT. For more information about the Network Definition, see Network Definition API.

According to this description, the custom layers we write are invoked through the Network Definition interface.
There are two ways to create a TensorRT network: 1) use one of the parser interfaces provided by TensorRT; 2) build the network directly with the Network Definition interface (for example, https://github.com/wang-xinyu/tensorrtx takes this approach, which avoids having to write custom layers yourself). A minimal sketch of the second approach is given at the end of this section.

  • Builder: the maximum batch size and maximum workspace configured on the Builder are two parameters with a large influence on how well TensorRT can optimize the network; in general, the maximum workspace should be set as large as possible.

  • Engine: an Engine can only be created from a Network Definition. Once you have an Engine you can run inference with it directly, or serialize it for later use, since going from a Network Definition to an Engine is relatively time-consuming.

For the specific classes and functions involved, consult the documentation.
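As an illustration of the second approach mentioned above, the sketch below builds a tiny network directly with the TensorRT 7 Python Network Definition API instead of a parser. The layer sizes and the random weights are invented for this example; only the API calls themselves come from TensorRT.

import numpy as np
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def build_toy_engine():
    # Implicit-batch network (TensorRT 7 style): the input shape omits the batch dimension.
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network:
        data = network.add_input(name='data', dtype=trt.float32, shape=(3, 224, 224))
        # Random weights stand in for real trained parameters.
        w = np.random.randn(10, 3 * 224 * 224).astype(np.float32)
        b = np.zeros(10, dtype=np.float32)
        fc = network.add_fully_connected(input=data, num_outputs=10, kernel=w, bias=b)
        relu = network.add_activation(input=fc.get_output(0), type=trt.ActivationType.RELU)
        network.mark_output(tensor=relu.get_output(0))
        builder.max_batch_size = 1
        builder.max_workspace_size = 1 << 30  # as noted above, make the workspace generous
        return builder.build_cuda_engine(network)

The resulting engine is used exactly like the parser-built engines in the next section: create an execution context, allocate buffers and launch inference.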

5 Accelerating inference with the TensorRT Python API

  • Install pycuda
# configure environment variables
export CUDA_PATH=/usr/local/cuda
export PATH=$PATH:/usr/local/cuda/bin
# pick a pycuda version appropriate for your CUDA and Python setup
pip install pycuda==2018.1.1
  • Run inference with the Python API
import pycuda.autoinit
import numpy as np
import pycuda.driver as cuda
import tensorrt as trt
import torch
import os
import time
from PIL import Image
import cv2
import torchvision

filename = 'test.jpg'
max_batch_size = 1
onnx_model_path = '../models/resnet50.onnx'
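# NOTE: CUDA_VISIBLE_DEVICES only takes effect if it is set before the first CUDA context
# is created; pycuda.autoinit (imported above) has already created one, so this assignment
# should really be moved above the imports or be set in the shell instead.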
os.environ["CUDA_VISIBLE_DVICES"] = "4"

TRT_LOGGER = trt.Logger()  # This logger is required to build an engine


def get_img_np_nchw(filename):
    image = cv2.imread(filename)
    image_cv = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image_cv = cv2.resize(image_cv, (224, 224))
    miu = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    img_np = np.array(image_cv, dtype=float) / 255.
    r = (img_np[:, :, 0] - miu[0]) / std[0]
    g = (img_np[:, :, 1] - miu[1]) / std[1]
    b = (img_np[:, :, 2] - miu[2]) / std[2]
    img_np_t = np.array([r, g, b])
    img_np_nchw = np.expand_dims(img_np_t, axis=0)
    return img_np_nchw

class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        """Within this context, host_mom means the cpu memory and device means the GPU memory
        """
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream


def get_engine(max_batch_size=1, onnx_file_path="", engine_file_path="", \
               fp16_mode=False, int8_mode=False, save_engine=False,
               ):
    """Attempts to load a serialized engine if available, otherwise builds a new TensorRT engine and saves it."""

    def build_engine(max_batch_size, save_engine):
        """Takes an ONNX file and creates a TensorRT engine to run inference with"""
        with trt.Builder(TRT_LOGGER) as builder, \
                builder.create_network(1) as network, \
                trt.OnnxParser(network, TRT_LOGGER) as parser:

            builder.max_workspace_size = 1 << 30  # Your workspace size
            builder.max_batch_size = max_batch_size
            # pdb.set_trace()
            builder.fp16_mode = fp16_mode  # Default: False
            builder.int8_mode = int8_mode  # Default: False
            if int8_mode:
                # To be updated
                raise NotImplementedError

            # Parse model file
            if not os.path.exists(onnx_file_path):
                quit('ONNX file {} not found'.format(onnx_file_path))

            print('Loading ONNX file from path {}...'.format(onnx_file_path))
            with open(onnx_file_path, 'rb') as model:
                print('Beginning ONNX file parsing')
                if not parser.parse(model.read()):
                    print('ERROR: failed to parse the ONNX file:')
                    for error_idx in range(parser.num_errors):
                        print(parser.get_error(error_idx))
                    return None

            print('Completed parsing of ONNX file')
            print('Building an engine from file {}; this may take a while...'.format(onnx_file_path))

            engine = builder.build_cuda_engine(network)
            print("Completed creating Engine")

            if save_engine:
                with open(engine_file_path, "wb") as f:
                    f.write(engine.serialize())
            return engine

    if os.path.exists(engine_file_path):
        # If a serialized engine exists, load it instead of building a new one.
        print("Reading engine from file {}".format(engine_file_path))
        with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())
    else:
        return build_engine(max_batch_size, save_engine)

def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer data from CPU to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
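    # Note: execute_async(batch_size=...) targets implicit-batch engines; for an engine built
    # from an explicit-batch network (as above), execute_async_v2(bindings=bindings,
    # stream_handle=stream.handle) is the preferred call.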
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]


def postprocess_the_outputs(h_outputs, shape_of_output):
    h_outputs = h_outputs.reshape(*shape_of_output)
    return h_outputs



img_np_nchw = get_img_np_nchw(filename)
img_np_nchw = img_np_nchw.astype(dtype=np.float32)

# These two modes depend on the hardware
fp16_mode = False
int8_mode = False
trt_engine_path = './model_fp16_{}_int8_{}.trt'.format(fp16_mode, int8_mode)
# Build an engine
engine = get_engine(max_batch_size, onnx_model_path, trt_engine_path, fp16_mode, int8_mode)
# Create the context for this engine
context = engine.create_execution_context()
# Allocate buffers for input and output
inputs, outputs, bindings, stream = allocate_buffers(engine)  # paired host/device buffers, device binding pointers and a CUDA stream

# Do inference
shape_of_output = (max_batch_size, 1000)
# Load data to the buffer
inputs[0].host = img_np_nchw.reshape(-1)

# inputs[1].host = ... for multiple inputs
t1 = time.time()
trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream) # numpy data
t2 = time.time()
feat = postprocess_the_outputs(trt_outputs[0], shape_of_output)

print('TensorRT ok')

model = torchvision.models.resnet50(pretrained=True).cuda()
resnet_model = model.eval()

input_for_torch = torch.from_numpy(img_np_nchw).cuda()
t3 = time.time()
feat_2 = resnet_model(input_for_torch)
t4 = time.time()
feat_2 = feat_2.cpu().data.numpy()
print('Pytorch ok!')


mse = np.mean((feat - feat_2)**2)
print("Inference time with the TensorRT engine: {}".format(t2-t1))
print("Inference time with the PyTorch model: {}".format(t4-t3))
print('MSE Error = {}'.format(mse))

print('All completed!')
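The script above assumes that ../models/resnet50.onnx already exists. If it still needs to be produced, a minimal export sketch with torch.onnx.export could look like the following (the output path, input size and opset version are assumptions, not values from the original post); naming the graph tensors input/output also matches what the C++ sample in the next section expects:

import torch
import torchvision

model = torchvision.models.resnet50(pretrained=True).eval()
dummy_input = torch.randn(1, 3, 224, 224)  # NCHW, matching get_img_np_nchw above
torch.onnx.export(model, dummy_input, '../models/resnet50.onnx',
                  input_names=['input'], output_names=['output'],
                  opset_version=9)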

References:
1 《如何使用TensorRT对训练好的PyTorch模型进行加速?》 (How to accelerate a trained PyTorch model with TensorRT?)
2 github: PyTorch_ONNX_TensorRT

6 Accelerating inference with the TensorRT C++ API

  • Write the source files
    cpp_inference.cpp
#include <algorithm>
#include <assert.h>
#include <cmath>
#include <cuda_runtime_api.h>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <sys/stat.h>
#include <time.h>
#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "NvOnnxParser.h"
#include "argsParser.h"
#include "logger.h"
#include "common.h"
#include "image.hpp"
#define DebugP(x) std::cout << "Line" << __LINE__ << "  " << #x << "=" << x << std::endl


using namespace nvinfer1;

static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int INPUT_C = 3;
static const int OUTPUT_SIZE = 1000;

const char* INPUT_BLOB_NAME = "input";
const char* OUTPUT_BLOB_NAME = "output";
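// NOTE: these blob names must match the input/output tensor names the ONNX model was
// exported with (e.g. the input_names/output_names passed to torch.onnx.export).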

const std::string gSampleName = "TensorRT.sample_onnx_image";


samplesCommon::Args gArgs;


bool onnxToTRTModel(const std::string& modelFile, // name of the onnx model
                    unsigned int maxBatchSize,    // batch size - NB must be at least as large as the batch we want to run with
                    IHostMemory*& trtModelStream) // output buffer for the TensorRT model
{
    // create the builder
    IBuilder* builder = createInferBuilder(gLogger.getTRTLogger());
    assert(builder != nullptr);
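    // The ONNX parser in TensorRT 7 only supports explicit-batch networks,
    // so the network is created with the kEXPLICIT_BATCH flag set.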

    const auto explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    std::cout << "explicitBatch is: " << explicitBatch << std::endl;
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(explicitBatch);

    auto parser = nvonnxparser::createParser(*network, gLogger.getTRTLogger());

    //Optional - uncomment below lines to view network layer information
    //config->setPrintLayerInfo(true);
    //parser->reportParsingInfo();

    if ( !parser->parseFromFile( locateFile(modelFile, gArgs.dataDirs).c_str(), static_cast<int>(gLogger.getReportableSeverity()) ) )
    {
        gLogError << "Failure while parsing ONNX file" << std::endl;
        return false;
    }
    
    // Build the engine
    builder->setMaxBatchSize(maxBatchSize);
    //builder->setMaxWorkspaceSize(1 << 20);
    builder->setMaxWorkspaceSize(10 << 20);
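    // 10 << 20 bytes = 10 MiB of scratch workspace; increase this if building the engine
    // fails or TensorRT has to fall back to slower tactics for larger models.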
    builder->setFp16Mode(gArgs.runInFp16);
    builder->setInt8Mode(gArgs.runInInt8);

    if (gArgs.runInInt8)
    {
        samplesCommon::setAllTensorScales(network, 127.0f, 127.0f);
    }
    
    // samplesCommon::enableDLA(builder, gArgs.useDLACore);
    
    ICudaEngine* engine = builder->buildCudaEngine(*network);
    assert(engine);

    // we can destroy the parser
    parser->destroy();

    // serialize the engine, then close everything down
    trtModelStream = engine->serialize();
    engine->destroy();
    network->destroy();
    builder->destroy();

    return true;
}

void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();
    // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
    // of these, but in this case we know that there is exactly one input and one output.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // note that indices are guaranteed to be less than IEngine::getNbBindings()
    
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    
    DebugP(inputIndex); DebugP(outputIndex);
    // create GPU buffers and a stream
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));

    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA the input to the GPU,  execute the batch asynchronously, and DMA it back:
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);

    // release the stream and the buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

//!
//! \brief This function prints the help information for running this sample
//!
void printHelpInfo()
{
    std::cout << "Usage: ./sample_onnx_mnist [-h or --help] [-d or --datadir=<path to data directory>] [--useDLACore=<int>]\n";
    std::cout << "--help          Display help information\n";
    std::cout << "--datadir       Specify path to a data directory, overriding the default. This option can be used multiple times to add multiple directories. If no data directories are given, the default is to use (data/samples/mnist/, data/mnist/)" << std::endl;
    std::cout << "--useDLACore=N  Specify a DLA engine for layers that support DLA. Value can range from 0 to n-1, where n is the number of DLA engines on the platform." << std::endl;
    std::cout << "--int8          Run in Int8 mode.\n";
    std::cout << "--fp16          Run in FP16 mode." << std::endl;
}

int main(int argc, char** argv)
{
    bool argsOK = samplesCommon::parseArgs(gArgs, argc, argv);
    if (gArgs.help)
    {
        printHelpInfo();
        return EXIT_SUCCESS;
    }
    if (!argsOK)
    {
        gLogError << "Invalid arguments" << std::endl;
        printHelpInfo();
        return EXIT_FAILURE;
    }
    if (gArgs.dataDirs.empty())
    {
        gArgs.dataDirs = std::vector<std::string>{"data/"};
    }

    auto sampleTest = gLogger.defineTest(gSampleName, argc, const_cast<const char**>(argv));

    gLogger.reportTestStart(sampleTest);

    // create a TensorRT model from the onnx model and serialize it to a stream
    IHostMemory* trtModelStream{nullptr};

    if (!onnxToTRTModel("resnet50.onnx", 1, trtModelStream))
        gLogger.reportFail(sampleTest);

    assert(trtModelStream != nullptr);
    std::cout << "Successfully parsed ONNX file!!!!" << std::endl;
    
    
    std::cout << "Start reading the input image!!!!" << std::endl;
    
    cv::Mat image = cv::imread(locateFile("test.jpg", gArgs.dataDirs), cv::IMREAD_COLOR);
    if (image.empty()) {
        std::cout << "The input image is empty!!! Please check....."<<std::endl;
    }
    DebugP(image.size());
    cv::cvtColor(image, image, cv::COLOR_BGR2RGB);

    cv::Mat dst = cv::Mat::zeros(INPUT_H, INPUT_W, CV_32FC3);
    cv::resize(image, dst, dst.size());
    DebugP(dst.size());

    float* data = normal(dst); 

    // deserialize the engine
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    if (gArgs.useDLACore >= 0)
    {
        runtime->setDLACore(gArgs.useDLACore);
    }

    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream->data(), trtModelStream->size(), nullptr);
    assert(engine != nullptr);
    trtModelStream->destroy();
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    
    float prob[OUTPUT_SIZE];
    typedef std::chrono::high_resolution_clock Time;
    typedef std::chrono::duration<double, std::ratio<1, 1000>> ms;
    typedef std::chrono::duration<float> fsec;
    double total = 0.0;

    // run inference and cout time
    auto t0 = Time::now();
    doInference(*context, data, prob, 1);
    auto t1 = Time::now();
    fsec fs = t1 - t0;
    ms d = std::chrono::duration_cast<ms>(fs);
    total += d.count();
    // destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();
    
    std::cout << std::endl << "Running time of one image is:" << total << "ms" << std::endl;
   
    gLogInfo << "Output:\n";
    for (int i = 0; i < OUTPUT_SIZE; i++)
    {
        gLogInfo << prob[i] << " ";
    }
    gLogInfo << std::endl;

    return gLogger.reportTest(sampleTest, true);
}

image.hpp

#pragma once
typedef struct {
    int w;
    int h;
    int c;
    float *data;
} image;
float* normal(cv::Mat img);

image.cpp

#include <opencv2/opencv.hpp>
#include "image.hpp"

static const float kMean[3] = { 0.485f, 0.456f, 0.406f };
static const float kStdDev[3] = { 0.229f, 0.224f, 0.225f };
static const int map_[7][3] = { {0,0,0} ,
                {128,0,0},
                {0,128,0},
                {0,0,128},
                {128,128,0},
                {128,0,128},
                {0,128,0}};


float* normal(cv::Mat img) {
    //cv::Mat image(img.rows, img.cols, CV_32FC3);
    float * data;
    data = (float*)calloc(img.rows*img.cols * 3, sizeof(float));

    for (int c = 0; c < 3; ++c)
    {
        
        for (int i = 0; i < img.rows; ++i)
        { // get a pointer to the first pixel of row i
            cv::Vec3b *p1 = img.ptr<cv::Vec3b>(i);
            //cv::Vec3b *p2 = image.ptr<cv::Vec3b>(i);
            for (int j = 0; j < img.cols; ++j)
            {
                data[c * img.cols * img.rows + i * img.cols + j] = (p1[j][c] / 255.0f - kMean[c]) / kStdDev[c];
       
            }
        }
        
    }
    return data;
}
  • Build and run the program
    CMakeLists.txt
cmake_minimum_required(VERSION 3.3)

project(cpp_inference)

add_compile_options(-std=c++11)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

find_package(CUDA REQUIRED)
find_package(OpenCV REQUIRED)

# add non-standard header search paths
include_directories(/home/TaipKang/workspace/TensorRT/experiments /home/TaipKang/workspace/TensorRT/include /home/TaipKang/TensorRT/samples/common)
include_directories(/usr/local/cuda/include /usr/include/x86_64-linux-gnu/)
# add non-standard shared-library search paths
link_directories(/usr/local/cuda/lib64 /usr/lib/x86_64-linux-gnu/ /home/TaipKang/workspace/TensorRT-7.0.0.11/lib)

# build the image shared library
add_library(image SHARED image.cpp)
set_target_properties(image PROPERTIES VERSION 1.1 SOVERSION 1)
link_directories(/home/TaipKang/workspace/TensorRT/experiments/build)

add_executable(cpp_inference ./cpp_inference.cpp)
target_link_libraries(cpp_inference nvinfer)
target_link_libraries(cpp_inference cudart)
target_link_libraries(cpp_inference nvonnxparser)
target_link_libraries(cpp_inference nvparsers)
target_link_libraries(cpp_inference ${OpenCV_LIBS})
target_link_libraries(cpp_inference image)

References:
1 《如何使用TensorRT对训练好的PyTorch模型进行加速?》 (How to accelerate a trained PyTorch model with TensorRT?)
