TensorRT生成INT8校准文件

需要的资源

模型
训练模型时用的500~1000张图片

模型这里用的是onnx模型。

预处理

本文参考的例子是TensorRT的sampleSSD项目。

预处理需要将前述500张图片转成生成校准文件时需要的数据读取形式。

首先满足预处理代码中的convert命令，安装imagemagick

apt-get install imagemagick

预处理代码如下：

#
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import numpy as np
import sys
import os
import glob
import shutil
import struct
from random import shuffle

try:
    from PIL import Image
except ImportError as err:
    raise ImportError("""ERROR: Failed to import module ({})
Please make sure you have Pillow installed.
For installation instructions, see:
http://pillow.readthedocs.io/en/stable/installation.html""".format(err))

height = 640
width = 640
NUM_BATCHES = 0
NUM_PER_BATCH = 1
NUM_CALIBRATION_IMAGES = 500


HOME_DIR = '/home/cmhu/data/500images'

CALIBRATION_DATASET_LOC = HOME_DIR + '/images/*.jpg'


# images to test
imgs = []
print("Location of dataset = " + CALIBRATION_DATASET_LOC)
imgs = glob.glob(CALIBRATION_DATASET_LOC)
shuffle(imgs)

imgs = imgs[:NUM_CALIBRATION_IMAGES]
NUM_BATCHES = NUM_CALIBRATION_IMAGES // NUM_PER_BATCH + (NUM_CALIBRATION_IMAGES % NUM_PER_BATCH > 0)

print("Total number of images = " + str(len(imgs)))
print("NUM_PER_BATCH = " + str(NUM_PER_BATCH))
print("NUM_BATCHES = " + str(NUM_BATCHES))

# output
outDir  = HOME_DIR + "/batches"

if os.path.exists(outDir):
    os.system("rm " + outDir +"/*")

# prepare output
if not os.path.exists(outDir):
    os.makedirs(outDir)

for i in range(NUM_CALIBRATION_IMAGES):
    os.system("convert "+imgs[i]+" -resize "+str(height)+"x"+str(width)+"! "+outDir+"/"+str(i)+".ppm")

CALIBRATION_DATASET_LOC = outDir + '/*.ppm'
imgs = glob.glob(CALIBRATION_DATASET_LOC)

# load image, switch to BGR, subtract mean, and make dims C x H x W for Caffe
img = 0
for i in range(NUM_BATCHES):
    batchfile = outDir + "/batch_calibration" + str(i) + ".batch"
    batchlistfile = outDir + "/batch_calibration" + str(i) + ".list"
    batchlist = open(batchlistfile,'a')
    batch = np.zeros(shape=(NUM_PER_BATCH, 3, height, width), dtype = np.float32)
    for j in range(NUM_PER_BATCH):
        batchlist.write(os.path.basename(imgs[img]) + '\n')
        im = Image.open(imgs[img]).resize((width,height), Image.NEAREST)
        in_ = np.array(im, dtype=np.float32, order='C')
        in_ = in_[:,:,::-1]
        in_-= np.array((104.0, 117.0, 123.0))
        in_ = in_.transpose((2,0,1))
        batch[j] = in_
        img += 1

    # save
    batch.tofile(batchfile)
    batchlist.close()

    # Prepend batch shape information
    ba = bytearray(struct.pack("4i", batch.shape[0], batch.shape[1], batch.shape[2], batch.shape[3]))

    with open(batchfile, 'rb+') as f:
        content = f.read()
        f.seek(0,0)
        f.write(ba)
        f.write(content)

# os.system("rm " + outDir +"/*.ppm")

其中height 和 width 与模型的输入宽高一致。

其他注意修改 CALIBRATION_DATASET_LOC ，outDir

预处理得到的结果示例如下

生成校准文件

生成校准文件只需 buildEngine 成功，无需 infer

这里给出一个基于完整onnx模型推理的代码修改的生成INT8校准文件的示例代码，然后针对代码做说明：

#include "buffers.h"
#include "common.h"
#include "logger.h"

#include "parserOnnxConfig.h"
#include "NvInfer.h"

#include <algorithm>
#include <cmath>
#include <cuda_runtime_api.h>
#include <fstream>
#include <iostream>
#include <sstream>


#include <assert.h>
#include <stdio.h>
#include <vector>

#include <dirent.h>

#include "EntropyCalibrator.h"

using namespace std;

class VPTBatchStream : public IBatchStream
{
public:
    VPTBatchStream(
        int batchSize, int maxBatches, std::string prefix, std::string suffix, std::vector<std::string> directories)
        : mBatchSize(batchSize)
        , mMaxBatches(maxBatches)
        , mPrefix(prefix)
        , mSuffix(suffix)
        , mDataDir(directories)
    {
        string filepath = locateFile(mPrefix + std::string("0") + mSuffix, mDataDir);
        cout << filepath << endl;
        FILE* file = fopen(filepath.c_str(), "rb");
        assert(file != nullptr);
        int d[4];
        size_t readSize = fread(d, sizeof(int), 4, file);
        assert(readSize == 4);
        mDims.nbDims = 4;  // The number of dimensions.
        mDims.d[0] = d[0]; // Batch Size
        mDims.d[1] = d[1]; // Channels
        mDims.d[2] = d[2]; // Height
        mDims.d[3] = d[3]; // Width
        
        assert(mDims.d[0] > 0 && mDims.d[1] > 0 && mDims.d[2] > 0 && mDims.d[3] > 0);
        fclose(file);

        mImageSize = mDims.d[1] * mDims.d[2] * mDims.d[3];
        mBatch.resize(mBatchSize * mImageSize, 0);
        mLabels.resize(mBatchSize, 0);
        mFileBatch.resize(mDims.d[0] * mImageSize, 0);
        mFileLabels.resize(mDims.d[0], 0);
        reset(0);

        cout <<  mDims.d[1] <<" " << mDims.d[2] <<" " << mDims.d[3] << endl;
    }

    VPTBatchStream(int batchSize, int maxBatches, std::string prefix, std::vector<std::string> directories)
        : VPTBatchStream(batchSize, maxBatches, prefix, ".batch", directories)
    {
    }

    VPTBatchStream(
        int batchSize, int maxBatches, nvinfer1::Dims dims, std::string listFile, std::vector<std::string> directories)
        : mBatchSize(batchSize)
        , mMaxBatches(maxBatches)
        , mDims(dims)
        , mListFile(listFile)
        , mDataDir(directories)
    {
        mImageSize = mDims.d[1] * mDims.d[2] * mDims.d[3];
        mBatch.resize(mBatchSize * mImageSize, 0);
        mLabels.resize(mBatchSize, 0);
        mFileBatch.resize(mDims.d[0] * mImageSize, 0);
        mFileLabels.resize(mDims.d[0], 0);
        reset(0);
    }

    // Resets data members
    void reset(int firstBatch) override
    {
        mBatchCount = 0;
        mFileCount = 0;
        mFileBatchPos = mDims.d[0];
        skip(firstBatch);
    }

    // Advance to next batch and return true, or return false if there is no batch left.
    bool next() override
    {
        if (mBatchCount == mMaxBatches)
        {
            return false;
        }

        for (int csize = 1, batchPos = 0; batchPos < mBatchSize; batchPos += csize, mFileBatchPos += csize)
        {
            assert(mFileBatchPos > 0 && mFileBatchPos <= mDims.d[0]);
            if (mFileBatchPos == mDims.d[0] && !update())
            {
                return false;
            }

            // copy the smaller of: elements left to fulfill the request, or elements left in the file buffer.
            csize = std::min(mBatchSize - batchPos, mDims.d[0] - mFileBatchPos);
            std::copy_n(
                getFileBatch() + mFileBatchPos * mImageSize, csize * mImageSize, getBatch() + batchPos * mImageSize);
            std::copy_n(getFileLabels() + mFileBatchPos, csize, getLabels() + batchPos);
        }
        mBatchCount++;
        return true;
    }

    // Skips the batches
    void skip(int skipCount) override
    {
        if (mBatchSize >= mDims.d[0] && mBatchSize % mDims.d[0] == 0 && mFileBatchPos == mDims.d[0])
        {
            mFileCount += skipCount * mBatchSize / mDims.d[0];
            return;
        }

        int x = mBatchCount;
        for (int i = 0; i < skipCount; i++)
        {
            next();
        }
        mBatchCount = x;
    }

    float* getBatch() override
    {
        return mBatch.data();
    }

    float* getLabels() override
    {
        return mLabels.data();
    }

    int getBatchesRead() const override
    {
        return mBatchCount;
    }

    int getBatchSize() const override
    {
        return mBatchSize;
    }

    nvinfer1::Dims getDims() const override
    {
        return mDims;
    }

private:
    float* getFileBatch()
    {
        return mFileBatch.data();
    }

    float* getFileLabels()
    {
        return mFileLabels.data();
    }

    bool update()
    {
        if (mListFile.empty())
        {
            std::string inputFileName = locateFile(mPrefix + std::to_string(mFileCount++) + mSuffix, mDataDir);
            FILE* file = fopen(inputFileName.c_str(), "rb");
            if (!file)
            {
                return false;
            }

            int d[4];
            size_t readSize = fread(d, sizeof(int), 4, file);
            assert(readSize == 4);
            assert(mDims.d[0] == d[0] && mDims.d[1] == d[1] && mDims.d[2] == d[2] && mDims.d[3] == d[3]);
            size_t readInputCount = fread(getFileBatch(), sizeof(float), mDims.d[0] * mImageSize, file);
            assert(readInputCount == size_t(mDims.d[0] * mImageSize));
            size_t readLabelCount = fread(getFileLabels(), sizeof(float), mDims.d[0], file);
            assert(readLabelCount == 0 || readLabelCount == size_t(mDims.d[0]));

            fclose(file);
        }
        else
        {
            std::vector<std::string> fNames;
            std::ifstream file(locateFile(mListFile, mDataDir), std::ios::binary);
            if (!file)
            {
                return false;
            }

            sample::gLogInfo << "Batch #" << mFileCount << std::endl;
            file.seekg(((mBatchCount * mBatchSize)) * 7);

            for (int i = 1; i <= mBatchSize; i++)
            {
                std::string sName;
                std::getline(file, sName);
                sName = sName + ".ppm";
                sample::gLogInfo << "Calibrating with file " << sName << std::endl;
                fNames.emplace_back(sName);
            }

            mFileCount++;

            const int imageC = 3;
            const int imageH = 640;
            const int imageW = 640;
            std::vector<samplesCommon::PPM<imageC, imageH, imageW>> ppms(fNames.size());
            for (uint32_t i = 0; i < fNames.size(); ++i)
            {
                readPPMFile(locateFile(fNames[i], mDataDir), ppms[i]);
            }

            std::vector<float> data(samplesCommon::volume(mDims));
            const float scale = 2.0 / 255.0;
            const float bias = 1.0;
            long int volChl = mDims.d[2] * mDims.d[3];

            // Normalize input data
            for (int i = 0, volImg = mDims.d[1] * mDims.d[2] * mDims.d[3]; i < mBatchSize; ++i)
            {
                for (int c = 0; c < mDims.d[1]; ++c)
                {
                    for (int j = 0; j < volChl; ++j)
                    {
                        data[i * volImg + c * volChl + j] = scale * float(ppms[i].buffer[j * mDims.d[1] + c]) - bias;
                    }
                }
            }

            std::copy_n(data.data(), mDims.d[0] * mImageSize, getFileBatch());
        }

        mFileBatchPos = 0;
        return true;
    }

    int mBatchSize{0};
    int mMaxBatches{0};
    int mBatchCount{0};
    int mFileCount{0};
    int mFileBatchPos{0};
    int mImageSize{0};
    std::vector<float> mBatch;         //!< Data for the batch
    std::vector<float> mLabels;        //!< Labels for the batch
    std::vector<float> mFileBatch;     //!< List of image files
    std::vector<float> mFileLabels;    //!< List of label files
    std::string mPrefix;               //!< Batch file name prefix
    std::string mSuffix;               //!< Batch file name suffix
    nvinfer1::Dims mDims;              //!< Input dimensions
    std::string mListFile;             //!< File name of the list of image names
    std::vector<std::string> mDataDir; //!< Directories where the files can be found
};

struct InferDeleter
{
    template <typename T>
    void operator()(T* obj) const
    {
        if (obj)
        {
            obj->destroy();
        }
    }
};

template <typename T>
using SampleUniquePtr = std::unique_ptr<T, samplesCommon::InferDeleter>;

static auto StreamDeleter = [](cudaStream_t* pStream)
    {
        if (pStream)
        {
            cudaStreamDestroy(*pStream);
            delete pStream;
        }
    };

inline std::unique_ptr<cudaStream_t, decltype(StreamDeleter)> makeCudaStream()
{
    std::unique_ptr<cudaStream_t, decltype(StreamDeleter)> pStream(new cudaStream_t, StreamDeleter);
    if (cudaStreamCreateWithFlags(pStream.get(), cudaStreamNonBlocking) != cudaSuccess)
    {
        pStream.reset(nullptr);
    }

    return pStream;
}

struct CalibratorParams
{
    int32_t batchSize{1};              //!< Number of inputs in a batch
    bool int8{false};                  //!< Allow runnning the network in Int8 mode.
    bool fp16{false};                  //!< Allow running the network in FP16 mode.
    std::vector<std::string> inputTensorNames;
    std::vector<std::string> outputTensorNames;
    std::string onnxFileName;
    std::string batchDatDir;
    std::string networkName;            // 用来标记生成的校准文件的，如networkName 为 best, 则生成的校准文件名为 CalibrationTablebest
};

bool buildEngine(CalibratorParams mParams) {

    // build
    auto builder = SampleUniquePtr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger()));
    if (!builder)
    {
        return false;
    }

    const auto explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    auto network = SampleUniquePtr<nvinfer1::INetworkDefinition>(builder->createNetworkV2(explicitBatch));
    if (!network)
    {
        return false;
    }

    auto config = SampleUniquePtr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
    if (!config)
    {
        return false;
    }

    auto parser = SampleUniquePtr<nvonnxparser::IParser>(nvonnxparser::createParser(*network, sample::gLogger.getTRTLogger()));
    if (!parser)
    {
        return false;
    }

    /************** constructNetwork **************/
    auto parsed = parser->parseFromFile(mParams.onnxFileName.c_str(),static_cast<int>(sample::gLogger.getReportableSeverity()));
    if (!parsed)
    {
        return false;
    }

    int32_t nb_inputs = network->getNbInputs();
    if (nb_inputs > 1)
    {
        cout << "warnning: nb_inputs > 1" << endl;
    }
    else if (nb_inputs <= 0)
    {
        cout << "error: nb_inputs < 0" << endl;
        return false;
    }

    Dims dim = network->getInput(0)->getDimensions();

    // 添加IOptimization Profile，指定维度范围
    // 解决  Error Code 4: Internal Error (Network has dynamic or shape inputs, but no optimization profile has been defined.)
    IOptimizationProfile* profile = builder->createOptimizationProfile();
    profile->setDimensions(mParams.inputTensorNames[0].c_str(), OptProfileSelector::kMIN, Dims4(1, dim.d[1], dim.d[2], dim.d[3]));
    profile->setDimensions(mParams.inputTensorNames[0].c_str(), OptProfileSelector::kOPT, Dims4(4, dim.d[1], dim.d[2], dim.d[3]));
    profile->setDimensions(mParams.inputTensorNames[0].c_str(), OptProfileSelector::kMAX, Dims4(16, dim.d[1], dim.d[2], dim.d[3]));
    int32_t index = config->addOptimizationProfile(profile);
    if (index < 0)
    {
        cout << "addOptimizationProfile failed !" << endl;
        return false;
    }
    cout << "profile index : " << index << endl;

    if (mParams.fp16)
    {
        config->setFlag(BuilderFlag::kFP16);
    }

    std::unique_ptr<IInt8Calibrator> calibrator;
    if (mParams.int8)
    {
        config->setFlag(BuilderFlag::kINT8);

        std::vector<std::string> dataDirs;
        dataDirs.push_back(mParams.batchDatDir);
        
        VPTBatchStream calibrationStream(1, 50, "/batch_calibration", dataDirs);
        calibrator.reset(new Int8EntropyCalibrator2<VPTBatchStream>(calibrationStream, 0, mParams.networkName.c_str(), mParams.inputTensorNames[0].c_str()));

        config->setInt8Calibrator(calibrator.get());
    }

    /***************************/

    // CUDA stream used for profiling by the builder.
    auto profileStream = makeCudaStream();
    if (!profileStream)
    {
        return false;
    }
    config->setProfileStream(*profileStream);

    auto mEngine = SampleUniquePtr<nvinfer1::ICudaEngine>(builder->buildEngineWithConfig(*network, *config));
    if (!mEngine)
    {
        return false;
    }

    return true;
}

int main() {
    cudaSetDevice(3);

    CalibratorParams mParams;
    mParams.onnxFileName = "/mnt/data/cmhu/bky/pytorch_10cls_part_newdata_20210715_yolov5m_hw640/weights/best.onnx";
    mParams.batchDatDir = "/mnt/data/cmhu/bky/500images/batches/";
    mParams.inputTensorNames.push_back("images");
    mParams.outputTensorNames.push_back("output");
    mParams.networkName = "best";
    mParams.int8 = true;
    mParams.fp16 = false;

    if (!buildEngine(mParams))
    {
        cout << "build engine failed !!!" << endl;
        return -1;
    }

    cout << "build engin succeed !!" << endl;
    
    return 0;
}

代码主要是两部分：VPTBatchStream 和 buildEngine。VPTBatchStream 按指定的接口方式读取batch数据，buildEngine构建推理引擎，在构建推理引擎的时候生成INT8校准文件，核心代码如下：

std::unique_ptr<IInt8Calibrator> calibrator;
if (mParams.int8)
{
    config->setFlag(BuilderFlag::kINT8);

    std::vector<std::string> dataDirs;
    dataDirs.push_back(mParams.batchDatDir);
    
    VPTBatchStream calibrationStream(1, 50, "/batch_calibration", dataDirs);
    calibrator.reset(new Int8EntropyCalibrator2<VPTBatchStream>(calibrationStream, 0, mParams.networkName.c_str(), mParams.inputTensorNames[0].c_str()));

    config->setInt8Calibrator(calibrator.get());
}

其中注意 calibrator 不能提前释放，否则会在构建推理引擎的时候报错，无法生成校准文件。

生成的校准文件名与设置的 networkName 参数有关，见 networkName 参数的注释。

生成的校准文件示例：

posted @ 2022-10-18 07:57 yeren2046 阅读(881) 评论(6) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

力耘的博客

学而时习之,不亦说乎? 有朋自远方来,不亦乐乎? 人不知而不愠,不亦君子乎?

TensorRT生成INT8校准文件

需要的资源

预处理

生成校准文件

公告