Add YOLOv8 pose estimation support to ONNXRuntime C++ example (#23786)

Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com>
This commit is contained in:
chendao12138 2026-03-05 23:27:10 +08:00 committed by GitHub
parent 7bd3b2ce19
commit 1ab234ed14
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 244 additions and 20 deletions

View file

@ -59,7 +59,7 @@ char* YOLO_V8::PreProcess(cv::Mat& iImg, std::vector<int> iImgSize, cv::Mat& oIm
switch (modelType)
{
case YOLO_DETECT_V8:
case YOLO_POSE:
case YOLO_POSE_V8:
case YOLO_DETECT_V8_HALF:
case YOLO_POSE_V8_HALF://LetterBox
{
@ -82,7 +82,7 @@ char* YOLO_V8::PreProcess(cv::Mat& iImg, std::vector<int> iImgSize, cv::Mat& oIm
int m = min(h, w);
int top = (h - m) / 2;
int left = (w - m) / 2;
cv::resize(oImg(cv::Rect(left, top, m, m)), oImg, cv::Size(iImgSize.at(1), iImgSize.at(0)));
cv::resize(oImg(cv::Rect(left, top, m, m)), oImg, cv::Size(iImgSize.at(0), iImgSize.at(1)));
break;
}
}
@ -90,8 +90,8 @@ char* YOLO_V8::PreProcess(cv::Mat& iImg, std::vector<int> iImgSize, cv::Mat& oIm
}
char* YOLO_V8::CreateSession(DL_INIT_PARAM& iParams) {
char* Ret = RET_OK;
const char* YOLO_V8::CreateSession(DL_INIT_PARAM& iParams) {
const char* Ret = RET_OK;
std::regex pattern("[\u4e00-\u9fa5]");
bool result = std::regex_search(iParams.modelPath, pattern);
if (result)
@ -103,6 +103,7 @@ char* YOLO_V8::CreateSession(DL_INIT_PARAM& iParams) {
try
{
rectConfidenceThreshold = iParams.rectConfidenceThreshold;
pointScoresThreshold = iParams.pointScoresThreshold;//change
iouThreshold = iParams.iouThreshold;
imgSize = iParams.imgSize;
modelType = iParams.modelType;
@ -136,15 +137,15 @@ char* YOLO_V8::CreateSession(DL_INIT_PARAM& iParams) {
{
Ort::AllocatedStringPtr input_node_name = session->GetInputNameAllocated(i, allocator);
char* temp_buf = new char[50];
strcpy(temp_buf, input_node_name.get());
strcpy_s(temp_buf, 50, input_node_name.get());
inputNodeNames.push_back(temp_buf);
}
size_t OutputNodesNum = session->GetOutputCount();
for (size_t i = 0; i < OutputNodesNum; i++)
{
Ort::AllocatedStringPtr output_node_name = session->GetOutputNameAllocated(i, allocator);
char* temp_buf = new char[10];
strcpy(temp_buf, output_node_name.get());
char* temp_buf = new char[50];
strcpy_s(temp_buf, 50, output_node_name.get());
outputNodeNames.push_back(temp_buf);
}
options = Ort::RunOptions{ nullptr };
@ -157,10 +158,11 @@ char* YOLO_V8::CreateSession(DL_INIT_PARAM& iParams) {
const char* str2 = e.what();
std::string result = std::string(str1) + std::string(str2);
char* merged = new char[result.length() + 1];
std::strcpy(merged, result.c_str());
strcpy_s(merged, result.length() + 1, result.c_str());
std::cout << merged << std::endl;
delete[] merged;
return "[YOLO_V8]:Create session failed.";
Ret = "[YOLO_V8]:Create session failed.";
return Ret;
}
}
@ -322,6 +324,104 @@ char* YOLO_V8::TensorProcess(clock_t& starttime_1, cv::Mat& iImg, N& blob, std::
}
break;
}
case YOLO_POSE_V8://change
case YOLO_POSE_V8_HALF:
{
int signalResultNum = outputNodeDims[1];//56: "4+1+3*17"
int strideNum = outputNodeDims[2];//8400
//std::vector<int> class_ids;
std::vector<float> confidences;
std::vector<cv::Rect> boxes;
std::vector<cv::Point2f> keyPoints;
std::vector<float> keyPointsScores;
cv::Mat rawData;
if (modelType == YOLO_POSE_V8)
{
// FP32
rawData = cv::Mat(signalResultNum, strideNum, CV_32F, output);
}
else
{
// FP16
rawData = cv::Mat(signalResultNum, strideNum, CV_16F, output);
rawData.convertTo(rawData, CV_32F);
}
// Note:
// ultralytics adds a transpose operator to the output of the yolov8 model, which gives yolov8/v5/v7 the same output shape
// https://github.com/ultralytics/assets/releases/download/v8.4.0/yolov8n-pose.pt
rawData = rawData.t();
float* pdata = (float*)rawData.data;
for (int i = 0; i < strideNum; ++i)
{
float* classesScores = pdata + 4;
if (pdata[4] > rectConfidenceThreshold)
{
cv::Mat kpts = cv::Mat(kpts_num, 3, CV_32FC1, pdata + 5);//keypoints & scores
confidences.push_back(pdata[4]);
float x = pdata[0];
float y = pdata[1];
float w = pdata[2];
float h = pdata[3];
int left = int((x - 0.5 * w) * resizeScales);
int top = int((y - 0.5 * h) * resizeScales);
int width = int(w * resizeScales);
int height = int(h * resizeScales);
boxes.push_back(cv::Rect(left, top, width, height));
for (int kp = 0; kp < kpts.rows; ++kp) {
float x = kpts.at<float>(kp, 0) * resizeScales; // x
float y = kpts.at<float>(kp, 1) * resizeScales; // y
float score = kpts.at<float>(kp, 2); // score
keyPoints.push_back(cv::Point2f(x, y));
keyPointsScores.push_back(kpts.at<float>(kp, 2));
}
}
pdata += signalResultNum;
}
std::vector<int> nmsResult;
cv::dnn::NMSBoxes(boxes, confidences, rectConfidenceThreshold, iouThreshold, nmsResult);
for (int i = 0; i < nmsResult.size(); i++)
{
int idx = nmsResult[i];
DL_RESULT result;
result.classId = 0;
result.confidence = confidences[idx];
result.box = boxes[idx];
result.keyPoints.clear();
result.keyPointsScore.clear();
for (int j = 0; j < this->classes.size() - 1; j++)
{
if (keyPointsScores[idx * (this->classes.size() - 1) + j] > pointScoresThreshold)
{
result.keyPoints.push_back(keyPoints[idx * (this->classes.size() - 1) + j]);
result.keyPointsScore.push_back(keyPointsScores[idx * (this->classes.size() - 1) + j]);
}
}
oResult.push_back(result);
}
#ifdef benchmark
clock_t starttime_4 = clock();
double pre_process_time = (double)(starttime_2 - starttime_1) / CLOCKS_PER_SEC * 1000;
double process_time = (double)(starttime_3 - starttime_2) / CLOCKS_PER_SEC * 1000;
double post_process_time = (double)(starttime_4 - starttime_3) / CLOCKS_PER_SEC * 1000;
if (cudaEnable)
{
std::cout << "[YOLO_V8(CUDA)]: " << pre_process_time << "ms pre-process, " << process_time << "ms inference, " << post_process_time << "ms post-process." << std::endl;
}
else
{
std::cout << "[YOLO_V8(CPU)]: " << pre_process_time << "ms pre-process, " << process_time << "ms inference, " << post_process_time << "ms post-process." << std::endl;
}
#endif // benchmark
break;
}
default:
std::cout << "[YOLO_V8]: " << "Not support model type." << std::endl;
}
@ -332,7 +432,7 @@ char* YOLO_V8::TensorProcess(clock_t& starttime_1, cv::Mat& iImg, N& blob, std::
char* YOLO_V8::WarmUpSession() {
clock_t starttime_1 = clock();
cv::Mat iImg = cv::Mat(cv::Size(imgSize.at(1), imgSize.at(0)), CV_8UC3);
cv::Mat iImg = cv::Mat(cv::Size(imgSize.at(0), imgSize.at(1)), CV_8UC3);
cv::Mat processedImg;
PreProcess(iImg, imgSize, processedImg);
if (modelType < 4)
@ -371,4 +471,4 @@ char* YOLO_V8::WarmUpSession() {
#endif
}
return RET_OK;
}
}

View file

@ -3,6 +3,7 @@
#pragma once
#define RET_OK nullptr
//#define USE_CUDA
#ifdef _WIN32
#include <Windows.h>
@ -25,7 +26,7 @@ enum MODEL_TYPE
{
//FLOAT32 MODEL
YOLO_DETECT_V8 = 1,
YOLO_POSE = 2,
YOLO_POSE_V8 = 2,
YOLO_CLS = 3,
//FLOAT16 MODEL
@ -41,6 +42,7 @@ typedef struct _DL_INIT_PARAM
MODEL_TYPE modelType = YOLO_DETECT_V8;
std::vector<int> imgSize = { 640, 640 };
float rectConfidenceThreshold = 0.6;
float pointScoresThreshold = 0.6;
float iouThreshold = 0.5;
int keyPointsNum = 2;//Note:kpt number for pose
bool cudaEnable = false;
@ -54,6 +56,7 @@ typedef struct _DL_RESULT
int classId;
float confidence;
cv::Rect box;
std::vector<float> keyPointsScore;
std::vector<cv::Point2f> keyPoints;
} DL_RESULT;
@ -66,7 +69,7 @@ public:
~YOLO_V8();
public:
char* CreateSession(DL_INIT_PARAM& iParams);
const char* CreateSession(DL_INIT_PARAM& iParams);
char* RunSession(cv::Mat& iImg, std::vector<DL_RESULT>& oResult);
@ -79,6 +82,7 @@ public:
char* PreProcess(cv::Mat& iImg, std::vector<int> iImgSize, cv::Mat& oImg);
std::vector<std::string> classes{};
int kpts_num;
private:
Ort::Env env;
@ -91,6 +95,7 @@ private:
MODEL_TYPE modelType;
std::vector<int> imgSize;
float rectConfidenceThreshold;
float pointScoresThreshold;
float iouThreshold;
float resizeScales;//letterbox scale
};

View file

@ -9,7 +9,7 @@
void Detector(YOLO_V8*& p) {
std::filesystem::path current_path = std::filesystem::current_path();
std::filesystem::path imgs_path = current_path / "images";
std::filesystem::path imgs_path = current_path / "images/detect/";
for (auto& i : std::filesystem::directory_iterator(imgs_path))
{
if (i.path().extension() == ".jpg" || i.path().extension() == ".png" || i.path().extension() == ".jpeg")
@ -97,11 +97,75 @@ void Classifier(YOLO_V8*& p)
}
}
// Runs pose estimation on every .jpg/.png/.bmp under ./images/pose/ relative to
// the working directory, draws each detection's box, keypoints and confidence
// label, and shows the annotated image until a key is pressed.
// Assumes p->classes[0] is the object class name and classes[1..kpts_num] are
// the keypoint labels appended by ReadCocoYaml — TODO confirm against caller.
void PoseEstimator(YOLO_V8*& p)
{
    std::filesystem::path current_path = std::filesystem::current_path();
    std::cout << "current_path: " << current_path << std::endl;
    std::filesystem::path imgs_path = current_path / "images/pose/";
    for (auto& entry : std::filesystem::directory_iterator(imgs_path))
    {
        if (entry.path().extension() == ".jpg" || entry.path().extension() == ".png" || entry.path().extension() == ".bmp")
        {
            std::string img_path = entry.path().string();
            cv::Mat img = cv::imread(img_path);
            std::vector<DL_RESULT> res;
            p->RunSession(img, res);

            if (res.empty())
            {
                // Deliberately falls through so the (unannotated) image is still shown.
                std::cout << "No pose detected in image: " << img_path << std::endl;
            }

            for (auto& re : res)
            {
                // Random colors per detection so overlapping results stay distinguishable.
                cv::RNG rng(cv::getTickCount());
                cv::Scalar color_box(rng.uniform(0, 256), rng.uniform(0, 256), rng.uniform(0, 256));
                cv::Scalar color_point(rng.uniform(0, 256), rng.uniform(0, 256), rng.uniform(0, 256));

                cv::rectangle(img, re.box, color_box, 2);

                // Truncate confidence to two decimals for the label text.
                float confidence = floor(100 * re.confidence) / 100;
                std::cout << std::fixed << std::setprecision(2);
                std::string label_box = p->classes[re.classId] + " " +
                    std::to_string(confidence).substr(0, std::to_string(confidence).size() - 4);

                // Keypoint index kp maps to label classes[kp + 1] (index 0 is the box class).
                for (std::size_t kp = 0; kp < re.keyPoints.size(); kp++)
                {
                    cv::circle(img, re.keyPoints[kp], 5, color_point, -1);
                    std::string label_point = p->classes[kp + 1];
                    cv::putText(img, label_point, re.keyPoints[kp], cv::FONT_HERSHEY_SIMPLEX, 0.75, cv::Scalar(0, 0, 0), 2);
                }

                // Filled background strip above the box, then the label text on top of it.
                cv::rectangle(
                    img,
                    cv::Point(re.box.x, re.box.y - 25),
                    cv::Point(re.box.x + label_box.length() * 15, re.box.y),
                    color_box,
                    cv::FILLED
                );
                cv::putText(
                    img,
                    label_box,
                    cv::Point(re.box.x, re.box.y - 5),
                    cv::FONT_HERSHEY_SIMPLEX,
                    0.75,
                    cv::Scalar(0, 0, 0),
                    2
                );
            }
            cv::imshow("Result of Detection", img);
            cv::waitKey(0);
            cv::destroyAllWindows();
        }
    }
}
int ReadCocoYaml(YOLO_V8*& p, const std::string& yamlPath = "coco.yaml") {
// Open the YAML file
std::ifstream file("coco.yaml");
std::ifstream file(yamlPath);
if (!file.is_open())
{
std::cerr << "Failed to open file" << std::endl;
@ -119,8 +183,25 @@ int ReadCocoYaml(YOLO_V8*& p) {
// Find the start and end of the names section
std::size_t start = 0;
std::size_t end = 0;
std::string kpt_num = "none";
int kpts = 0;
for (std::size_t i = 0; i < lines.size(); i++)
{
if (lines[i].find("kpt_shape:") != std::string::npos)
{
std::stringstream kpt_shape(lines[i]);
std::getline(kpt_shape, kpt_num, '[');
std::getline(kpt_shape, kpt_num, ',');
if (!kpt_num.empty()) {
try {
kpts = std::stoi(kpt_num);
std::cout << "kpt_num as integer: " << kpts << std::endl;
}
catch (const std::exception& e) {
std::cerr << "Error converting kpt_num to integer: " << e.what() << std::endl;
}
}
}
if (lines[i].find("names:") != std::string::npos)
{
start = i + 1;
@ -132,6 +213,7 @@ int ReadCocoYaml(YOLO_V8*& p) {
}
}
// Extract the names
std::vector<std::string> names;
for (std::size_t i = start; i < end; i++)
@ -142,8 +224,14 @@ int ReadCocoYaml(YOLO_V8*& p) {
std::getline(ss, name); // Extract the string after the delimiter
names.push_back(name);
}
if (kpts != 0) {
for (int i = 1; i <= kpts; i++) {
names.push_back(std::to_string(i));
}
}
p->classes = names;
p->kpts_num = kpts;
return 0;
}
@ -151,11 +239,11 @@ int ReadCocoYaml(YOLO_V8*& p) {
void DetectTest()
{
YOLO_V8* yoloDetector = new YOLO_V8;
ReadCocoYaml(yoloDetector);
ReadCocoYaml(yoloDetector, "./yaml/coco.yaml");
DL_INIT_PARAM params;
params.rectConfidenceThreshold = 0.1;
params.iouThreshold = 0.5;
params.modelPath = "yolov8n.onnx";
params.modelPath = "./models/yolov8n.onnx";
params.imgSize = { 640, 640 };
#ifdef USE_CUDA
params.cudaEnable = true;
@ -181,15 +269,46 @@ void ClsTest()
{
YOLO_V8* yoloDetector = new YOLO_V8;
std::string model_path = "cls.onnx";
ReadCocoYaml(yoloDetector);
ReadCocoYaml(yoloDetector, "./yaml/cls.yaml");
DL_INIT_PARAM params{ model_path, YOLO_CLS, {224, 224} };
yoloDetector->CreateSession(params);
Classifier(yoloDetector);
}
// Builds an ONNXRuntime session for the YOLOv8 pose model and runs the
// PoseEstimator demo over the sample images. Thresholds: 0.25 box confidence,
// 0.5 keypoint score, 0.7 NMS IoU.
void PoseTest()
{
    YOLO_V8* yoloDetector = new YOLO_V8;
    ReadCocoYaml(yoloDetector, "./yaml/coco8-pose.yaml");
    DL_INIT_PARAM params;
    params.rectConfidenceThreshold = 0.25;
    params.pointScoresThreshold = 0.5;
    params.iouThreshold = 0.7;
    params.modelPath = "./models/yolov8n-pose.onnx";
    params.imgSize = { 640, 640 };
#ifdef USE_CUDA
    params.cudaEnable = true;
    // GPU FP32 inference
    params.modelType = YOLO_POSE_V8;
    // GPU FP16 inference
    // Note: requires an FP16 ONNX export of the pose model.
    //params.modelType = YOLO_POSE_V8_HALF;
#else
    // CPU inference
    params.modelType = YOLO_POSE_V8;
    params.cudaEnable = false;
#endif
    yoloDetector->CreateSession(params);
    PoseEstimator(yoloDetector);
    delete yoloDetector;
}
// Entry point: uncomment exactly one demo to run. Pose estimation is active.
int main()
{
    //DetectTest();
    //ClsTest();
    PoseTest();
}