After a deep learning model has been trained, the next step is to deploy it to different devices for testing. Conversion between devices is generally done through the intermediate format ONNX, so that one model can be reused across platforms. Starting from a model that has already been exported to ONNX, this article walks through the entire pipeline of converting ONNX to a TensorRT engine and running inference with it.
1. Serializing ONNX into a TensorRT Engine
The overall workflow of serializing an ONNX model into a TRT engine can be summarized by the figure below.
When developing with the C++ API, the headers NvInfer.h and NvOnnxParser.h need to be included. The C++ API is exposed through interface classes whose names start with I, such as ILogger and IBuilder.
#include "NvInfer.h"
#include "NvOnnxParser.h"
using namespace nvonnxparser;
using namespace nvinfer1;
1.1 Creating the builder
Before creating the builder, an ILogger instance is needed; there are two ways to obtain one:
1. Include logging.h from the tensorrtx project and use the Logger class it provides
#include "logging.h"
static Logger gLogger;
IBuilder* builder = createInferBuilder(gLogger);
2. Derive from ILogger yourself and instantiate the implementation
class Logger : public ILogger
{
    void log(Severity severity, const char* msg) noexcept override
    {
        // only print messages at warning severity or above
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
} logger;
IBuilder* builder = createInferBuilder(logger);   // pass the logger instance defined above
1.2 Creating the network
After creating the builder, a network definition is needed for model optimization:
const auto explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
INetworkDefinition* network = builder->createNetworkV2(explicitBatch); // the ONNX parser requires an explicit-batch network; 0U selects the legacy implicit-batch mode
1.3 Creating the parser
Create the ONNX parser to populate the network definition, read the model file, and report any parsing errors.
IParser* parser = createParser(*network, gLogger);
bool parsed = parser->parseFromFile(onnx_path, static_cast<int32_t>(ILogger::Severity::kWARNING));
for (int32_t i = 0; i < parser->getNbErrors(); ++i)
{
    std::cout << parser->getError(i)->desc() << std::endl;
}
if (parsed)
    std::cout << "successfully parsed the onnx model" << std::endl;
1.4 Setting the necessary build parameters
IBuilderConfig* config = builder->createBuilderConfig();
builder->setMaxBatchSize(maxBatchSize);   // only meaningful for implicit-batch networks
config->setMaxWorkspaceSize(1 << 20);     // 1 MiB of scratch space; raise this (e.g. 1 << 30) for larger models
// Optimization profile describing the allowed range of the dynamic batch dimension
auto profile = builder->createOptimizationProfile();
auto input_tensor = network->getInput(0);
auto input_dims = input_tensor->getDimensions();
input_dims.d[0] = 1;                      // minimum and optimal batch size: 1
profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kMIN, input_dims);
profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kOPT, input_dims);
input_dims.d[0] = batchSize;              // maximum batch size
profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kMAX, input_dims);
config->addOptimizationProfile(profile);
#ifdef USE_FP16
config->setFlag(BuilderFlag::kFP16);      // enable FP16 kernels where the hardware supports them
#endif
#ifdef USE_INT8
config->setFlag(BuilderFlag::kINT8);      // INT8 additionally requires calibration data or explicit dynamic ranges
#endif
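Note that setMaxBatchSize and setMaxWorkspaceSize are deprecated in newer TensorRT releases (the former only applies to implicit-batch networks). On TensorRT 8.4 and later the workspace is configured as a memory-pool limit instead; a sketch of the equivalent call:
// TensorRT 8.4+: express the scratch-space budget as a memory pool limit (here 1 GiB)
config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 1ULL << 30);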
1.5 Building the engine and serializing it
ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
assert(engine != nullptr);
IHostMemory* modelStream = engine->serialize();   // serialized engine plan held in host memory
assert(modelStream != nullptr);
std::ofstream p(engine_path, std::ios::binary);
if (!p)
{
    std::cerr << "could not open plan output file" << std::endl;
    return -1;
}
p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
modelStream->destroy();
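On TensorRT 8 and later, buildEngineWithConfig and destroy() are deprecated and the builder can emit the serialized plan directly; a minimal sketch of the newer path:
// TensorRT 8+: build the serialized plan without materializing an ICudaEngine first
IHostMemory* plan = builder->buildSerializedNetwork(*network, *config);
assert(plan != nullptr);
std::ofstream out(engine_path, std::ios::binary);
out.write(reinterpret_cast<const char*>(plan->data()), plan->size());
delete plan;   // TensorRT 8 interfaces may be released with delete instead of destroy()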
2. Loading the serialized TensorRT engine and running inference
Once the ONNX model has been converted to an engine and serialized, the build and optimization work no longer has to be repeated. As shown in the figure below, the whole inference process starts from reading the serialized engine.
2.1 Deserializing the engine
Read the serialized engine file and store its contents in trtModelStream.
char* trtModelStream{ nullptr };
size_t size{ 0 };
std::ifstream file(engine_path, std::ios::binary);
if (file.good()) {
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    trtModelStream = new char[size];
    assert(trtModelStream);
    file.read(trtModelStream, size);
    file.close();
}
2.2 Creating the runtime
Create the runtime from the logger:
IRuntime* runtime = createInferRuntime(gLogger);
assert(runtime != nullptr);
2.3 Creating the engine
Deserialize trtModelStream with the runtime to create the engine:
ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
assert(engine != nullptr);
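In TensorRT 8 the third plugin-factory argument was removed, so the same call becomes:
ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);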
2.4 Creating the execution context
IExecutionContext* context = engine->createExecutionContext();
assert(context != nullptr);
delete[] trtModelStream;   // the host copy of the plan is no longer needed
runtime->destroy();        // some samples destroy the runtime here; newer TensorRT versions expect it to outlive the engine, so this can also be deferred until after inference
2.5 Preprocessing + forward inference + postprocessing
Preprocessing
The loop below converts each OpenCV BGR image (assumed to be already resized to input_w x input_h) into planar RGB floats normalized to [0, 1], laid out in NCHW order.
int ImgCount = InputImage.size();
float* input_data = (float*)malloc(ImgCount * 3 * input_h * input_w * sizeof(float));   // one NCHW slot per image
for (int b = 0; b < ImgCount; b++) {
    cv::Mat img = InputImage.at(b);
    int i = 0;
    for (int row = 0; row < input_h; ++row) {
        uchar* uc_pixel = img.data + row * img.step;
        for (int col = 0; col < input_w; ++col) {
            // split BGR into planar RGB and normalize to [0, 1]
            input_data[b * 3 * input_h * input_w + i] = (float)uc_pixel[2] / 255.0;
            input_data[b * 3 * input_h * input_w + i + input_h * input_w] = (float)uc_pixel[1] / 255.0;
            input_data[b * 3 * input_h * input_w + i + 2 * input_h * input_w] = (float)uc_pixel[0] / 255.0;
            uc_pixel += 3;
            ++i;
        }
    }
}
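As an alternative to the manual loop, OpenCV's dnn module can build the same planar NCHW blob in one call; a sketch, assuming the images have already been resized to input_w x input_h and that <opencv2/dnn.hpp> is available:
// scale by 1/255, keep the target size, no mean subtraction, swap BGR -> RGB
cv::Mat blob = cv::dnn::blobFromImages(InputImage, 1.0 / 255.0, cv::Size(input_w, input_h), cv::Scalar(), true, false);
memcpy(input_data, blob.ptr<float>(), ImgCount * 3 * input_h * input_w * sizeof(float));   // NCHW floats, ready for the engine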
Forward inference
The doInference routine below copies the input batch to the device, enqueues the inference on a CUDA stream, and copies the result back to the host.
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();
    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    //assert(engine.getNbBindings() == 2);
    void* buffers[2];
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * input_h * input_w * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * output_size * sizeof(float)));
    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * input_h * input_w * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * output_size * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    // Release stream and buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}
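A hypothetical call site for the function above, feeding the prob buffer that the postprocessing step decodes (buffer sizes follow the same assumptions already used inside doInference):
float* prob = (float*)malloc(batchSize * output_size * sizeof(float));   // host-side output buffer
doInference(*context, input_data, prob, batchSize);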
Postprocessing
Taking LPRNet as an example: prob holds an 18 x 68 score matrix (18 sequence positions x 68 character classes). Greedy decoding picks the highest-scoring class at each position, repeated characters and the blank label (index 67) are then removed in CTC style, and the remaining indices are mapped to characters.
std::vector<int> preds;
// Greedy decode: for each of the 18 positions pick the class with the highest score
for (int i = 0; i < 18; i++) {
    int maxj = 0;
    for (int j = 0; j < 68; j++) {
        if (prob[i + 18 * j] > prob[i + 18 * maxj]) maxj = j;
    }
    preds.push_back(maxj);
}
// CTC-style collapse: drop repeated characters and the blank label (index 68 - 1)
int pre_c = preds[0];
std::vector<int> no_repeat_blank_label;
for (auto c : preds) {
    if (c == pre_c || c == 68 - 1) {
        if (c == 68 - 1) pre_c = c;
        continue;
    }
    no_repeat_blank_label.push_back(c);
    pre_c = c;
}
// Map the remaining indices to characters in the alphabet
std::string str;
for (auto v : no_repeat_blank_label) {
    str += alphabet[v];
}
This concludes the walkthrough of building a TensorRT engine from ONNX with the TensorRT C++ API and running inference with it. Essentially every ONNX-to-TRT inference pipeline follows the steps described above; this post is kept as a record of the process.
–END–