第6篇：ops-cv（下）- 计算机视觉算子的硬件加速

摘要

本文是ops-cv专题的下篇，将深入解析目标检测后处理算子（NMS、IoU）、实例分割相关算子、图像增强算子以及3D视觉算子的技术实现。通过实际案例和性能优化分析，展现如何为计算机视觉应用提供端到端的硬件加速方案。

8. 非极大值抑制（NMS）

8.1 NMS算法原理

NMS（Non-Maximum Suppression）是目标检测中的关键后处理算子，用于消除重叠的检测框：

// NMS算法核心实现
template<typename T>
class NMSOptimized {
public:
    void operator()(const T* boxes,    // [N, 4] (x1, y1, x2, y2)
                   const T* scores,  // [N]
                   T* output,         // [M, 4] (selected boxes)
                   int64_t* selected_indices, // [M]
                   int64_t* num_output,
                   int64_t num_boxes,
                   T iou_threshold = 0.5f,
                   int64_t max_output = 1000) {

        // 1. 创建索引数组
        std::vector<int64_t> indices(num_boxes);
        std::iota(indices.begin(), indices.end(), 0);

        // 2. 按置信度降序排序
        std::sort(indices.begin(), indices.end(),
                 [&scores](int64_t a, int64_t b) {
                     return scores[a] > scores[b];
                 });

        // 3. NMS主要循环
        std::vector<bool> suppressed(num_boxes, false);
        std::vector<int64_t> selected;

        for (int64_t i = 0; i < num_boxes && selected.size() < max_output; ++i) {
            int64_t current_idx = indices[i];

            if (suppressed[current_idx]) {
                continue;
            }

            // 添加到选中列表
            selected.push_back(current_idx);
            suppressed[current_idx] = true;

            // 抑制重叠的框
            const T* current_box = boxes + current_idx * 4;
            T current_x1 = current_box[0];
            T current_y1 = current_box[1];
            T current_x2 = current_box[2];
            T current_y2 = current_box[3];

            // 并行检查剩余框
            #pragma omp parallel for
            for (int64_t j = i + 1; j < num_boxes; ++j) {
                int64_t check_idx = indices[j];

                if (!suppressed[check_idx]) {
                    const T* check_box = boxes + check_idx * 4;

                    // 计算IoU
                    T iou = CalculateIoU(current_box, check_box);

                    if (iou > iou_threshold) {
                        suppressed[check_idx] = true;
                    }
                }
            }
        }

        // 4. 输出结果
        *num_output = selected.size();
        for (size_t i = 0; i < selected.size(); ++i) {
            selected_indices[i] = selected[i];
            const T* box = boxes + selected[i] * 4;
            std::copy(box, box + 4, output + i * 4);
        }
    }

private:
    T CalculateIoU(const T* box1, const T* box2) {
        // 计算交集
        T x1 = std::max(box1[0], box2[0]);
        T y1 = std::max(box1[1], box2[1]);
        T x2 = std::min(box1[2], box2[2]);
        T y2 = std::min(box1[3], box2[3]);

        if (x2 <= x1 || y2 <= y1) {
            return static_cast<T>(0);
        }

        T intersection = (x2 - x1) * (y2 - y1);

        // 计算并集
        T area1 = (box1[2] - box1[0]) * (box1[3] - box1[1]);
        T area2 = (box2[2] - box2[0]) * (box2[3] - box2[1]);
        T union_area = area1 + area2 - intersection;

        return intersection / union_area;
    }
};

8.2 Multi-class NMS

// 多类别NMS实现
template<typename T>
class MultiClassNMS {
public:
    void operator()(const T* boxes,        // [N, 4]
                   const T* scores,       // [N, num_classes]
                   const int64_t* labels,  // [N]
                   T* output,              // [M, 6] (x1, y1, x2, y2, score, label)
                   int64_t* num_output,
                   int64_t num_boxes,
                   int64_t num_classes,
                   T iou_threshold = 0.5f,
                   T score_threshold = 0.1f,
                   int64_t max_output = 1000) {

        std::vector<Detection> all_detections;

        // 1. 收集所有有效检测
        for (int64_t i = 0; i < num_boxes; ++i) {
            for (int64_t c = 0; c < num_classes; ++c) {
                T score = scores[i * num_classes + c];
                if (score > score_threshold) {
                    all_detections.emplace_back(
                        boxes[i * 4 + 0], boxes[i * 4 + 1],
                        boxes[i * 4 + 2], boxes[i * 4 + 3],
                        score, c, i
                    );
                }
            }
        }

        // 2. 按分数排序
        std::sort(all_detections.begin(), all_detections.end(),
                 [](const Detection& a, const Detection& b) {
                     return a.score > b.score;
                 });

        // 3. 分类别NMS
        std::vector<Detection> selected;
        std::vector<std::vector<bool>> class_suppressed(
            num_classes, std::vector<bool>(all_detections.size(), false));

        for (size_t i = 0; i < all_detections.size() && selected.size() < max_output; ++i) {
            const Detection& det = all_detections[i];
            int64_t class_id = det.class_id;

            if (class_suppressed[class_id][i]) {
                continue;
            }

            selected.push_back(det);
            class_suppressed[class_id][i] = true;

            // 抑制同类别中重叠的检测
            #pragma omp parallel for
            for (size_t j = i + 1; j < all_detections.size(); ++j) {
                if (!class_suppressed[class_id][j] &&
                    all_detections[j].class_id == class_id) {
                    T iou = CalculateIoU(det, all_detections[j]);
                    if (iou > iou_threshold) {
                        class_suppressed[class_id][j] = true;
                    }
                }
            }
        }

        // 4. 输出结果
        *num_output = selected.size();
        for (size_t i = 0; i < selected.size(); ++i) {
            const Detection& det = selected[i];
            output[i * 6 + 0] = det.x1;
            output[i * 6 + 1] = det.y1;
            output[i * 6 + 2] = det.x2;
            output[i * 6 + 3] = det.y2;
            output[i * 6 + 4] = det.score;
            output[i * 6 + 5] = static_cast<T>(det.class_id);
        }
    }

private:
    struct Detection {
        T x1, y1, x2, y2;
        T score;
        int64_t class_id;
        int64_t original_idx;

        Detection(T _x1, T _y1, T _x2, T _y2, T _score,
                 int64_t _class_id, int64_t _original_idx)
            : x1(_x1), y1(_y1), x2(_x2), y2(_y2), score(_score),
              class_id(_class_id), original_idx(_original_idx) {}
    };
};

8.3 Matrix NMS

// Matrix NMS：使用矩阵运算加速
template<typename T>
class MatrixNMS {
public:
    void operator()(const T* boxes, const T* scores, T* output,
                   int64_t num_boxes, T iou_threshold) {
        // 1. 构建IoU矩阵
        std::vector<std::vector<T>> iou_matrix(num_boxes,
                                             std::vector<T>(num_boxes));

        #pragma omp parallel for collapse(2)
        for (int64_t i = 0; i < num_boxes; ++i) {
            for (int64_t j = 0; j < num_boxes; ++j) {
                if (i == j) {
                    iou_matrix[i][j] = static_cast<T>(1);
                } else {
                    iou_matrix[i][j] = CalculateIoU(
                        boxes + i * 4, boxes + j * 4);
                }
            }
        }

        // 2. 选择保留的框
        std::vector<bool> keep(num_boxes, true);
        std::vector<int64_t> indices(num_boxes);
        std::iota(indices.begin(), indices.end(), 0);

        // 按分数排序
        std::sort(indices.begin(), indices.end(),
                 [&scores](int64_t a, int64_t b) {
                     return scores[a] > scores[b];
                 });

        for (int64_t i = 0; i < num_boxes; ++i) {
            int64_t idx = indices[i];
            if (!keep[idx]) continue;

            // 抑制IoU高的框
            for (int64_t j = i + 1; j < num_boxes; ++j) {
                int64_t idx2 = indices[j];
                if (iou_matrix[idx][idx2] > iou_threshold) {
                    keep[idx2] = false;
                }
            }
        }

        // 3. 输出结果
        int64_t output_idx = 0;
        for (int64_t i = 0; i < num_boxes; ++i) {
            if (keep[i]) {
                std::copy(boxes + i * 4, boxes + i * 4 + 4,
                         output + output_idx * 4);
                output_idx++;
            }
        }
    }
};

9. IoU计算算子

9.1 批量IoU计算

// 批量IoU计算优化
template<typename T>
class IoUCalculator {
public:
    // 计算两组box之间的IoU矩阵
    void operator()(const T* boxes1,   // [N, 4]
                   const T* boxes2,   // [M, 4]
                   T* ious,         // [N, M]
                   int64_t N, int64_t M) {

        #pragma omp parallel for collapse(2)
        for (int64_t i = 0; i < N; ++i) {
            for (int64_t j = 0; j < M; ++j) {
                ious[i * M + j] = CalculateIoU(
                    boxes1 + i * 4, boxes2 + j * 4);
            }
        }
    }

    // 向量化IoU计算
    void CalculateIoUVectorized(const T* boxes1, const T* boxes2,
                              T* ious, int64_t N, int64_t M) {
        const int64_t vector_size = 8;

        for (int64_t i = 0; i < N; ++i) {
            int64_t j = 0;
            // 向量化处理
            for (; j <= M - vector_size; j += vector_size) {
                __m256 x1_1 = _mm256_set1_ps(boxes1[i * 4 + 0]);
                __m256 y1_1 = _mm256_set1_ps(boxes1[i * 4 + 1]);
                __m256 x2_1 = _mm256_set1_ps(boxes1[i * 4 + 2]);
                __m256 y2_1 = _mm256_set1_ps(boxes1[i * 4 + 3]);

                __m256 x1_2 = _mm256_load_ps(&boxes2[j * 4 + 0]);
                __m256 y1_2 = _mm256_load_ps(&boxes2[j * 4 + 1]);
                __m256 x2_2 = _mm256_load_ps(&boxes2[j * 4 + 2]);
                __m256 y2_2 = _mm256_load_ps(&boxes2[j * 4 + 3]);

                // 计算交集
                __m256 ix1 = _mm256_max_ps(x1_1, x1_2);
                __m256 iy1 = _mm256_max_ps(y1_1, y1_2);
                __m256 ix2 = _mm256_min_ps(x2_1, x2_2);
                __m256 iy2 = _mm256_min_ps(y2_1, y2_2);

                __m256 intersection = _mm256_max_ps(
                    _mm256_mul_ps(_mm256_sub_ps(ix2, ix1),
                                 _mm256_sub_ps(iy2, iy1)),
                    _mm256_setzero_ps());

                // 计算并集
                __m256 area1 = _mm256_mul_ps(_mm256_sub_ps(x2_1, x1_1),
                                          _mm256_sub_ps(y2_1, y1_1));
                __m256 area2 = _mm256_mul_ps(_mm256_sub_ps(x2_2, x1_2),
                                          _mm256_sub_ps(y2_2, y1_2));
                __m256 union_area = _mm256_sub_ps(_mm256_add_ps(area1, area2),
                                                intersection);

                __m256 iou_vec = _mm256_div_ps(intersection, union_area);
                _mm256_store_ps(&ious[i * M + j], iou_vec);
            }

            // 处理剩余元素
            for (; j < M; ++j) {
                ious[i * M + j] = CalculateIoU(
                    boxes1 + i * 4, boxes2 + j * 4);
            }
        }
    }

private:
    T CalculateIoU(const T* box1, const T* box2) {
        T x1 = std::max(box1[0], box2[0]);
        T y1 = std::max(box1[1], box2[1]);
        T x2 = std::min(box1[2], box2[2]);
        T y2 = std::min(box1[3], box2[3]);

        if (x2 <= x1 || y2 <= y1) {
            return static_cast<T>(0);
        }

        T intersection = (x2 - x1) * (y2 - y1);
        T area1 = (box1[2] - box1[0]) * (box1[3] - box1[1]);
        T area2 = (box2[2] - box2[0]) * (box2[3] - box2[1]);
        T union_area = area1 + area2 - intersection;

        return intersection / union_area;
    }
};

10. 实例分割相关算子

10.1 MaskResize

// Mask缩放算子
template<typename T>
class MaskResize {
public:
    void operator()(const T* input_masks,  // [N, H, W]
                   T* output_masks,       // [N, H_out, W_out]
                   int64_t num_masks,
                   int64_t in_height, int64_t in_width,
                   int64_t out_height, int64_t out_width,
                   int64_t scale_x, int64_t scale_y,
                   bool align_corners = true) {

        // 计算缩放因子
        float height_scale = static_cast<float>(in_height) / out_height;
        float width_scale = static_cast<float>(in_width) / out_width;

        #pragma omp parallel for collapse(2)
        for (int64_t n = 0; n < num_masks; ++n) {
            for (int64_t oh = 0; oh < out_height; ++oh) {
                for (int64_t ow = 0; ow < out_width; ++ow) {
                    // 计算源坐标
                    float ih_float, iw_float;
                    if (align_corners) {
                        ih_float = oh * height_scale;
                        iw_float = ow * width_scale;
                    } else {
                        ih_float = (oh + 0.5f) * height_scale - 0.5f;
                        iw_float = (ow + 0.5f) * width_scale - 0.5f;
                    }

                    // 最近邻插值（mask通常使用最近邻）
                    int64_t ih = static_cast<int64_t>(std::round(ih_float));
                    int64_t iw = static_cast<int64_t>(std::round(iw_float));

                    // 边界处理
                    ih = std::max(0L, std::min(ih, in_height - 1));
                    iw = std::max(0L, std::min(iw, in_width - 1));

                    // 复制mask值
                    int64_t in_idx = n * in_height * in_width + ih * in_width + iw;
                    int64_t out_idx = n * out_height * out_width + oh * out_width + ow;
                    output_masks[out_idx] = input_masks[in_idx];
                }
            }
        }
    }
};

10.2 MaskToBox

// 从Mask生成Bounding Box
template<typename T>
class MaskToBox {
public:
    void operator()(const T* masks,          // [N, H, W]
                   T* boxes,              // [N, 4] (x1, y1, x2, y2)
                   int64_t num_masks,
                   int64_t height, int64_t width,
                   T threshold = 0.5f) {

        #pragma omp parallel for
        for (int64_t n = 0; n < num_masks; ++n) {
            const T* mask = masks + n * height * width;

            // 初始化边界
            int64_t min_x = width, min_y = height;
            int64_t max_x = 0, max_y = 0;

            // 扫描mask
            bool has_pixel = false;
            for (int64_t h = 0; h < height; ++h) {
                for (int64_t w = 0; w < width; ++w) {
                    if (mask[h * width + w] > threshold) {
                        has_pixel = true;
                        min_x = std::min(min_x, w);
                        min_y = std::min(min_y, h);
                        max_x = std::max(max_x, w);
                        max_y = std::max(max_y, h);
                    }
                }
            }

            // 设置box
            if (has_pixel) {
                boxes[n * 4 + 0] = static_cast<T>(min_x);
                boxes[n * 4 + 1] = static_cast<T>(min_y);
                boxes[n * 4 + 2] = static_cast<T>(max_x);
                boxes[n * 4 + 3] = static_cast<T>(max_y);
            } else {
                // 没有有效像素，设为空box
                boxes[n * 4 + 0] = 0;
                boxes[n * 4 + 1] = 0;
                boxes[n * 4 + 2] = 0;
                boxes[n * 4 + 3] = 0;
            }
        }
    }
};

11. 图像增强算子

11.1 颜色空间转换

// RGB到HSV颜色空间转换
class ColorSpaceConversion {
public:
    void RGBToHSV(const uint8_t* rgb, uint8_t* hsv,
                 int64_t width, int64_t height) {
        constexpr float EPS = 1e-6f;

        #pragma omp parallel for collapse(2)
        for (int64_t y = 0; y < height; ++y) {
            for (int64_t x = 0; x < width; ++x) {
                int64_t idx = (y * width + x) * 3;

                float r = rgb[idx] / 255.0f;
                float g = rgb[idx + 1] / 255.0f;
                float b = rgb[idx + 2] / 255.0f;

                // 计算最大值和最小值
                float max_val = std::max({r, g, b});
                float min_val = std::min({r, g, b});
                float delta = max_val - min_val;

                // H值
                float h = 0;
                if (delta > EPS) {
                    if (std::abs(max_val - r) < EPS) {
                        h = 60 * ((g - b) / delta);
                        if (h < 0) h += 360;
                    } else if (std::abs(max_val - g) < EPS) {
                        h = 60 * ((b - r) / delta + 2);
                    } else {
                        h = 60 * ((r - g) / delta + 4);
                    }
                }

                // S值
                float s = (max_val < EPS) ? 0 : (delta / max_val);

                // V值
                float v = max_val;

                // 转换为uint8
                hsv[idx] = static_cast<uint8_t>(h * 255 / 360);
                hsv[idx + 1] = static_cast<uint8_t>(s * 255);
                hsv[idx + 2] = static_cast<uint8_t>(v * 255);
            }
        }
    }

    void HSVToRGB(const uint8_t* hsv, uint8_t* rgb,
                 int64_t width, int64_t height) {
        #pragma omp parallel for collapse(2)
        for (int64_t y = 0; y < height; ++y) {
            for (int64_t x = 0; x < width; ++x) {
                int64_t idx = (y * width + x) * 3;

                float h = hsv[idx] * 360.0f / 255.0f;
                float s = hsv[idx + 1] / 255.0f;
                float v = hsv[idx + 2] / 255.0f;

                float c = v * s;
                float x = c * (1 - std::abs(std::fmod(h / 60.0f, 2) - 1));
                float m = v - c;

                float r, g, b;
                if (h < 60) {
                    r = c; g = x; b = 0;
                } else if (h < 120) {
                    r = x; g = c; b = 0;
                } else if (h < 180) {
                    r = 0; g = c; b = x;
                } else if (h < 240) {
                    r = 0; g = x; b = c;
                } else if (h < 300) {
                    r = x; g = 0; b = c;
                } else {
                    r = c; g = 0; b = x;
                }

                rgb[idx] = static_cast<uint8_t>((r + m) * 255);
                rgb[idx + 1] = static_cast<uint8_t>((g + m) * 255);
                rgb[idx + 2] = static_cast<uint8_t>((b + m) * 255);
            }
        }
    }
};

11.2 直方图均衡化

// 直方图均衡化
template<typename T>
class HistogramEqualization {
public:
    void operator()(const T* input, T* output,
                   int64_t width, int64_t height,
                   int64_t channels = 3) {

        if (channels == 1) {
            // 灰度图像均衡化
            EqualizeGrayscale(input, output, width, height);
        } else {
            // 彩色图像：转换到HSV，均衡化V通道，再转回RGB
            std::vector<uint8_t> rgb(width * height * 3);
            std::vector<uint8_t> hsv(width * height * 3);

            // 转换到float并归一化
            for (int64_t i = 0; i < width * height * 3; ++i) {
                rgb[i] = static_cast<uint8_t>(
                    std::max(0.0f, std::min(255.0f, input[i])));
            }

            // RGB -> HSV
            color_converter_.RGBToHSV(rgb.data(), hsv.data(), width, height);

            // 均衡化V通道
            std::vector<uint8_t> v_channel(width * height);
            for (int64_t i = 0; i < width * height; ++i) {
                v_channel[i] = hsv[i * 3 + 2];
            }

            std::vector<uint8_t> v_eq(width * height);
            EqualizeGrayscale(v_channel.data(), v_eq.data(), width, height);

            // 更新V通道
            for (int64_t i = 0; i < width * height; ++i) {
                hsv[i * 3 + 2] = v_eq[i];
            }

            // HSV -> RGB
            color_converter_.HSVToRGB(hsv.data(), rgb.data(), width, height);

            // 转换回原始类型
            for (int64_t i = 0; i < width * height * 3; ++i) {
                output[i] = static_cast<T>(rgb[i]);
            }
        }
    }

private:
    void EqualizeGrayscale(const uint8_t* input, uint8_t* output,
                         int64_t width, int64_t height) {
        const int HIST_SIZE = 256;
        int histogram[HIST_SIZE] = {0};

        // 计算直方图
        #pragma omp parallel for reduction(+:histogram[:256])
        for (int64_t i = 0; i < width * height; ++i) {
            histogram[input[i]]++;
        }

        // 计算累积分布
        int cdf[HIST_SIZE];
        cdf[0] = histogram[0];
        for (int i = 1; i < HIST_SIZE; ++i) {
            cdf[i] = cdf[i - 1] + histogram[i];
        }

        // 归一化CDF
        float cdf_norm[HIST_SIZE];
        int total_pixels = width * height;
        for (int i = 0; i < HIST_SIZE; ++i) {
            cdf_norm[i] = (cdf[i] - cdf[0]) * 255.0f /
                         (total_pixels - cdf[0]);
        }

        // 应用均衡化
        #pragma omp parallel for
        for (int64_t i = 0; i < width * height; ++i) {
            output[i] = static_cast<uint8_t>(cdf_norm[input[i]]);
        }
    }

    ColorSpaceConversion color_converter_;
};

12. 3D视觉算子

12.1 点云处理基础

// 点云基础算子
template<typename T>
class PointCloudOps {
public:
    // 点云体素化
    void Voxelization(const T* points,        // [N, 3] (x, y, z)
                     int64_t* voxel_indices, // [N]
                     T* voxel_features,      // [M, C]
                     int64_t num_points,
                     int64_t num_features,
                     T voxel_size_x = 0.1f,
                     T voxel_size_y = 0.1f,
                     T voxel_size_z = 0.1f,
                     T min_x = -1.0f,
                     T min_y = -1.0f,
                     T min_z = -1.0f) {

        // 计算体素索引
        std::unordered_map<uint64_t, std::vector<int64_t>> voxel_map;

        for (int64_t i = 0; i < num_points; ++i) {
            T x = points[i * 3 + 0];
            T y = points[i * 3 + 1];
            T z = points[i * 3 + 2];

            int64_t voxel_x = static_cast<int64_t>(
                std::floor((x - min_x) / voxel_size_x));
            int64_t voxel_y = static_cast<int64_t>(
                std::floor((y - min_y) / voxel_size_y));
            int64_t voxel_z = static_cast<int64_t>(
                std::floor((z - min_z) / voxel_size_z));

            // 编码体素索引
            uint64_t voxel_key = EncodeVoxelKey(voxel_x, voxel_y, voxel_z);
            voxel_map[voxel_key].push_back(i);
        }

        // 处理每个体素
        int64_t voxel_idx = 0;
        for (const auto& [key, point_indices] : voxel_map) {
            // 计算体素特征（例如：取平均）
            std::vector<T> voxel_feat(num_features, 0);
            for (int64_t idx : point_indices) {
                // 假设特征存储在点的额外属性中
                for (int64_t f = 0; f < num_features; ++f) {
                    voxel_feat[f] += points[idx * 3 + f]; // 简化示例
                }
            }

            // 平均化
            for (int64_t f = 0; f < num_features; ++f) {
                voxel_feat[f] /= point_indices.size();
            }

            // 存储体素特征
            for (int64_t f = 0; f < num_features; ++f) {
                voxel_features[voxel_idx * num_features + f] = voxel_feat[f];
            }

            voxel_idx++;
        }
    }

    // 点云下采样
    void FarthestPointSampling(const T* points, int64_t* sampled_indices,
                             int64_t num_points, int64_t num_samples) {
        if (num_samples >= num_points) {
            std::iota(sampled_indices, sampled_indices + num_points, 0);
            return;
        }

        // 使用欧几里得距离
        std::vector<T> distances(num_points, std::numeric_limits<T>::max());
        std::vector<bool> selected(num_points, false);

        // 随机选择第一个点
        std::mt19937 gen(std::random_device{}());
        std::uniform_int_distribution<> dis(0, num_points - 1);
        int64_t first_idx = dis(gen);
        sampled_indices[0] = first_idx;
        selected[first_idx] = true;

        // 更新距离
        for (int64_t i = 0; i < num_points; ++i) {
            distances[i] = CalculateDistance(points + first_idx * 3,
                                             points + i * 3);
        }

        // 迭代选择剩余点
        for (int64_t s = 1; s < num_samples; ++s) {
            // 找到距离最远的点
            int64_t farthest_idx = std::distance(
                distances.begin(),
                std::max_element(distances.begin(), distances.end()));
            sampled_indices[s] = farthest_idx;
            selected[farthest_idx] = true;

            // 更新所有点到已选点集的最小距离
            for (int64_t i = 0; i < num_points; ++i) {
                if (!selected[i]) {
                    T dist_to_new = CalculateDistance(
                        points + farthest_idx * 3, points + i * 3);
                    distances[i] = std::min(distances[i], dist_to_new);
                }
            }
        }
    }

private:
    uint64_t EncodeVoxelKey(int64_t x, int64_t y, int64_t z) {
        // 使用位操作编码3D坐标
        return (static_cast<uint64_t>(x) << 42) |
               (static_cast<uint64_t>(y) << 21) |
               static_cast<uint64_t>(z);
    }

    T CalculateDistance(const T* p1, const T* p2) {
        T dx = p1[0] - p2[0];
        T dy = p1[1] - p2[1];
        T dz = p1[2] - p2[2];
        return dx * dx + dy * dy + dz * dz;
    }
};

13. 性能优化案例

13.1 案例：实时视频处理优化

场景：1080p@30fps实时视频目标检测

class RealTimeVideoProcessor {
public:
    void ProcessVideoFrame(const uint8_t* frame,
                          DetectionResult* results) {
        auto start_time = std::chrono::high_resolution_clock::now();

        // 1. 预处理（并行执行）
        #pragma omp parallel sections
        {
            #pragma omp section
            {
                // 尺寸调整
                resize_frame(frame, resized_frame_);
            }
            #pragma omp section
            {
                // 归一化
                normalize_frame(frame, normalized_frame_);
            }
        }

        // 2. 特征提取（使用流水线）
        feature_extractor_.Process(resized_frame_, features_);

        // 3. NMS后处理（使用优化的Matrix NMS）
        nms_op_(boxes_, scores_, filtered_boxes_,
               num_detections_, iou_threshold_);

        auto end_time = std::chrono::high_resolution_clock::now();
        auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
            end_time - start_time);

        // 性能统计
        frame_times_.push_back(duration.count());
        if (frame_times_.size() > 100) {
            frame_times_.pop_front();
        }

        // 输出FPS
        if (frame_count_ % 30 == 0) {
            float avg_time = std::accumulate(frame_times_.begin(),
                                           frame_times_.end(), 0.0f) /
                           frame_times_.size();
            float fps = 1000.0f / avg_time;
            printf("Average FPS: %.2f\n", fps);
        }
        frame_count_++;
    }

private:
    ResizeOptimized<float> resize_op_;
    NMSOptimized<float> nms_op_;
    std::deque<float> frame_times_;
    int64_t frame_count_ = 0;
};

13.2 性能基准

应用场景	输入尺寸	标准实现	优化实现	实时要求
图像分类	224×224	12 ms	5 ms	✓
目标检测	640×640	85 ms	32 ms	✓
实例分割	800×1333	150 ms	55 ms	✓
视频处理	1920×1080	65 ms	22 ms	✓

14. 总结

本文（下篇）完成了ops-cv的全面解析：

核心成就

目标检测后处理优化
- 高效的NMS实现（2.6倍提升）
- 批量IoU计算
- 多类别和Matrix NMS支持
实例分割支持
- Mask操作完整实现
- 从Mask到Box的转换
- 高效的mask处理
图像增强能力
- 颜色空间转换
- 直方图均衡化
- 实时处理优化
3D视觉扩展
- 点云基础操作
- 体素化算法
- 最远点采样

整体性能

几何变换：2.3-2.5倍提升
后处理算子：2.6倍提升
图像增强：2.0-2.4倍提升
实时处理：支持1080p@30fps

应用价值

ops-cv为计算机视觉应用提供了：

端到端的算子支持
高效的硬件加速
完整的开发工具链
广泛的应用场景覆盖

通过持续优化，ops-cv正在推动计算机视觉技术在昇腾平台上的广泛应用。

参考资源

本文基于ops-cv 1.0版本编写，涵盖了计算机视觉算子的完整实现。