Running inference on an ONNX model with onnxruntime in C++ takes far longer than it does with onnxruntime in Python: my model's inference time is 0.5 s in C++ versus 0.07 s in Python. After enabling CUDA acceleration in C++ it got even slower, with the inference time rising to 0.8 s. Is there a problem with my code?
The C++ toolchain is Visual Studio 2019.
The onnxruntime version is 1.9 (GPU build).
#include <assert.h>
#include <vector>
#include <onnxruntime_cxx_api.h>
#include <opencv2/opencv.hpp>
#include <cuda_provider_factory.h>
#include <opencv2/imgproc/types_c.h>
#include <time.h>
using namespace std;
using namespace cv;
int main(int argc, char* argv[]) {
    Mat img = imread("C:/Users/dell/Desktop/CYH/SuperGlue/SuperGlue_load/3_color.png", 0);
    img.convertTo(img, CV_32FC1);
    img = img / 255.0;

    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "test");
    Ort::SessionOptions session_options;
    session_options.SetIntraOpNumThreads(1);
    OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0);
    session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);

#ifdef _WIN32
    const wchar_t* model_path = L"C:\\Users\\dell\\PycharmProjects\\cyh_torch\\SuperGlue\\Onnx\\static_SuperPoint.onnx";
#else
    const char* model_path = "C:\\Users\\dell\\PycharmProjects\\cyh_torch\\SuperGlue\\Onnx\\static_SuperPoint.onnx";
#endif

    Ort::Session session(env, model_path, session_options);

    // print model input layer (node names, types, shape etc.)
    Ort::AllocatorWithDefaultOptions allocator;
    // print number of model input nodes
    size_t num_input_nodes = session.GetInputCount();

    std::vector<const char*> input_node_names = { "image" };
    std::vector<const char*> output_node_names = { "scores", "keypoints", "descriptors" };
    std::vector<int64_t> input_node_dims = { 1, 1, 480, 640 };

    // copy the normalized grayscale image into the flat input buffer
    size_t input_tensor_size = 480 * 640;
    std::vector<float> input_tensor_values(input_tensor_size);
    for (int i = 0; i < img.rows; i++) {
        for (int j = 0; j < img.cols; j++) {
            input_tensor_values[i * img.cols + j] = img.ptr<float>(i)[j];
        }
    }

    // create input tensor object from data values
    auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
    Ort::Value input_tensor = Ort::Value::CreateTensor<float>(memory_info, input_tensor_values.data(), input_tensor_size, input_node_dims.data(), input_node_dims.size());
    assert(input_tensor.IsTensor());

    std::vector<Ort::Value> ort_inputs;
    ort_inputs.push_back(std::move(input_tensor));

    // score model & input tensor, get back output tensor
    clock_t start = clock();
    auto output_tensors = session.Run(Ort::RunOptions{ nullptr }, input_node_names.data(), ort_inputs.data(), ort_inputs.size(), output_node_names.data(), 3);
    clock_t end = clock();
    cout << double(end - start) / CLOCKS_PER_SEC << endl;

    // Get pointer to output tensor float values
    float* floatarr = output_tensors[0].GetTensorMutableData<float>();
    //float* floatarr_mask = output_tensors[1].GetTensorMutableData<float>();
    printf("Done!\n");
    return 0;
}
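The printed number above comes from a single Run() call. In case it matters for judging the comparison, here is a minimal sketch of how I could time the steady-state latency instead, with one warm-up Run() discarded and the remaining calls averaged via std::chrono. It assumes the same session, ort_inputs, input_node_names, and output_node_names objects built in the code above; the helper name time_steady_state_runs is just for illustration.

#include <chrono>
#include <vector>
#include <onnxruntime_cxx_api.h>

// Sketch: average Run() latency after a warm-up call, assuming the
// session and input/output vectors from the listing above.
double time_steady_state_runs(Ort::Session& session,
                              const std::vector<const char*>& input_node_names,
                              std::vector<Ort::Value>& ort_inputs,
                              const std::vector<const char*>& output_node_names,
                              int num_runs = 10) {
    // Warm-up run: not measured.
    session.Run(Ort::RunOptions{ nullptr }, input_node_names.data(), ort_inputs.data(),
                ort_inputs.size(), output_node_names.data(), output_node_names.size());

    auto t0 = std::chrono::steady_clock::now();
    for (int i = 0; i < num_runs; i++) {
        // Same inputs every iteration; outputs are discarded at end of scope.
        auto outputs = session.Run(Ort::RunOptions{ nullptr }, input_node_names.data(),
                                   ort_inputs.data(), ort_inputs.size(),
                                   output_node_names.data(), output_node_names.size());
    }
    auto t1 = std::chrono::steady_clock::now();

    // Average seconds per Run() call.
    return std::chrono::duration<double>(t1 - t0).count() / num_runs;
}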