Why does onnxruntime inference take longer in C++ than in Python?

Running inference on an ONNX model with onnxruntime in C++ takes much longer than running the same model with onnxruntime in Python: the inference time is about 0.5 s in C++ versus 0.07 s in Python. After enabling CUDA acceleration in C++ it got even slower, with the inference time rising to about 0.8 s. Is there a problem with my code? A timing sketch that averages several runs is appended after the code below.

The C++ environment is Visual Studio 2019.

The onnxruntime version is 1.9 (GPU package).

#include <assert.h>
#include <vector>
#include <onnxruntime_cxx_api.h>
#include <opencv2/opencv.hpp>
#include <cuda_provider_factory.h>
#include <opencv2/imgproc/types_c.h>
#include <time.h>

using namespace std;
using namespace cv;

int main(int argc, char* argv[]) {

    Mat img = imread("C:/Users/dell/Desktop/CYH/SuperGlue/SuperGlue_load/3_color.png", 0);
    img.convertTo(img, CV_32FC1);
    img = img / 255.0;

    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "test");
    Ort::SessionOptions session_options;
    session_options.SetIntraOpNumThreads(1);
    OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0);
    session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);

#ifdef _WIN32
    const wchar_t* model_path = L"C:\\Users\\dell\\PycharmProjects\\cyh_torch\\SuperGlue\\Onnx\\static_SuperPoint.onnx";
#else
    const char* model_path = "C:\\Users\\dell\\PycharmProjects\\cyh_torch\\SuperGlue\\Onnx\\static_SuperPoint.onnx";
#endif

    Ort::Session session(env, model_path, session_options);
    // allocator for querying model input/output metadata (node names, types, shapes)
    Ort::AllocatorWithDefaultOptions allocator;

    // number of model input nodes (not used further here)
    size_t num_input_nodes = session.GetInputCount();
    std::vector<const char*> input_node_names = { "image" };
    std::vector<const char*> output_node_names = { "scores", "keypoints", "descriptors" };

    std::vector<int64_t> input_node_dims = { 1, 1, 480, 640 };
    size_t input_tensor_size = 480 * 640;
    std::vector<float> input_tensor_values(input_tensor_size);
    for (int i = 0; i < img.rows; i++) {
        for (int j = 0; j < img.cols; j++) {
            input_tensor_values[i * img.cols + j] = img.ptr<float>(i)[j];
        }
    }

    // create input tensor object from data values
    auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
    Ort::Value input_tensor = Ort::Value::CreateTensor<float>(memory_info, input_tensor_values.data(), input_tensor_size, input_node_dims.data(), input_node_dims.size());
    assert(input_tensor.IsTensor());

    std::vector<Ort::Value> ort_inputs;
    ort_inputs.push_back(std::move(input_tensor));
    // score model & input tensor, get back output tensor
    clock_t start = clock();
    auto output_tensors = session.Run(Ort::RunOptions{ nullptr }, input_node_names.data(), ort_inputs.data(), ort_inputs.size(), output_node_names.data(), 3);
    clock_t end = clock();
    cout << double(end - start) / CLOCKS_PER_SEC << endl;

    // Get pointer to output tensor float values
    float* floatarr = output_tensors[0].GetTensorMutableData<float>();

    //float* floatarr_mask = output_tensors[1].GetTensorMutableData<float>();

    printf("Done!\n");
    return 0;
}
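
For reference, a minimal timing sketch that skips the first Run() as a warm-up and averages several calls with std::chrono, assuming the same session, input_node_names, ort_inputs and output_node_names as in the code above; the helper name average_run_ms and the iteration count are arbitrary placeholders, not part of the original script.

#include <chrono>
#include <vector>
#include <onnxruntime_cxx_api.h>

// Sketch only: assumes a session and input/output name vectors set up as in the code above.
double average_run_ms(Ort::Session& session,
                      const std::vector<const char*>& input_names,
                      std::vector<Ort::Value>& inputs,
                      const std::vector<const char*>& output_names,
                      int iterations = 20) {
    // Warm-up call: the first Run() may include provider/graph initialization work.
    session.Run(Ort::RunOptions{ nullptr }, input_names.data(), inputs.data(),
                inputs.size(), output_names.data(), output_names.size());

    double total_ms = 0.0;
    for (int i = 0; i < iterations; ++i) {
        auto t0 = std::chrono::steady_clock::now();
        // Outputs are kept alive for the duration of the measured call.
        auto outputs = session.Run(Ort::RunOptions{ nullptr }, input_names.data(),
                                   inputs.data(), inputs.size(),
                                   output_names.data(), output_names.size());
        auto t1 = std::chrono::steady_clock::now();
        total_ms += std::chrono::duration<double, std::milli>(t1 - t0).count();
    }
    return total_ms / iterations;
}

Called in main() after ort_inputs is filled, e.g. cout << average_run_ms(session, input_node_names, ort_inputs, output_node_names) << " ms" << endl;, this reports a per-call average in milliseconds instead of the cost of a single cold call.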