From 4e2ff289941c046feac470fd139b1b7f2312e321 Mon Sep 17 00:00:00 2001 From: chmjkb Date: Thu, 7 May 2026 16:13:48 +0200 Subject: [PATCH 1/4] refactor: unify existing codebase to use Point and BBox struct across the vision stack --- .../host_objects/JsiConversions.h | 8 +- .../BaseInstanceSegmentation.cpp | 12 +- .../models/ocr/RecognitionHandler.cpp | 18 +- .../common/rnexecutorch/models/ocr/Types.h | 8 +- .../models/ocr/utils/DetectorUtils.cpp | 195 +++++------------- .../ocr/utils/RecognitionHandlerUtils.cpp | 12 +- .../models/ocr/utils/RecognizerUtils.cpp | 35 ++-- .../models/ocr/utils/RecognizerUtils.h | 18 +- .../models/pose_estimation/Types.h | 10 +- .../models/vertical_ocr/VerticalOCR.cpp | 25 ++- .../rnexecutorch/utils/FrameTransform.cpp | 40 ++-- .../utils/computer_vision/Processing.cpp | 9 +- .../utils/computer_vision/Types.h | 20 +- 13 files changed, 159 insertions(+), 251 deletions(-) diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h index a20fd7b1bc..247780b6f5 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h @@ -462,10 +462,10 @@ getJsiValue(const std::unordered_map &map, inline jsi::Value getJsiValue(const utils::computer_vision::BBox &bbox, jsi::Runtime &runtime) { jsi::Object obj(runtime); - obj.setProperty(runtime, "x1", bbox.x1); - obj.setProperty(runtime, "y1", bbox.y1); - obj.setProperty(runtime, "x2", bbox.x2); - obj.setProperty(runtime, "y2", bbox.y2); + obj.setProperty(runtime, "x1", bbox.p1.x); + obj.setProperty(runtime, "y1", bbox.p1.y); + obj.setProperty(runtime, "x2", bbox.p2.x); + obj.setProperty(runtime, "y2", bbox.p2.y); return obj; } diff --git a/packages/react-native-executorch/common/rnexecutorch/models/instance_segmentation/BaseInstanceSegmentation.cpp 
b/packages/react-native-executorch/common/rnexecutorch/models/instance_segmentation/BaseInstanceSegmentation.cpp index 3d2f9d1715..776f8edd20 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/instance_segmentation/BaseInstanceSegmentation.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/instance_segmentation/BaseInstanceSegmentation.cpp @@ -161,10 +161,10 @@ cv::Rect BaseInstanceSegmentation::computeMaskCropRect( const utils::computer_vision::BBox &bboxModel, cv::Size modelInputSize, cv::Size maskSize) { - float mx1F = bboxModel.x1 * maskSize.width / modelInputSize.width; - float my1F = bboxModel.y1 * maskSize.height / modelInputSize.height; - float mx2F = bboxModel.x2 * maskSize.width / modelInputSize.width; - float my2F = bboxModel.y2 * maskSize.height / modelInputSize.height; + float mx1F = bboxModel.p1.x * maskSize.width / modelInputSize.width; + float my1F = bboxModel.p1.y * maskSize.height / modelInputSize.height; + float mx2F = bboxModel.p2.x * maskSize.width / modelInputSize.width; + float my2F = bboxModel.p2.y * maskSize.height / modelInputSize.height; int32_t mx1 = std::max(0, static_cast(std::floor(mx1F))); int32_t my1 = std::max(0, static_cast(std::floor(my1F))); @@ -193,8 +193,8 @@ cv::Mat BaseInstanceSegmentation::warpToOriginalResolution( float scaleY = static_cast(originalSize.height) / maskSize.height; cv::Mat M = (cv::Mat_(2, 3) << scaleX, 0, - (maskRect.x * scaleX - bboxOriginal.x1), 0, scaleY, - (maskRect.y * scaleY - bboxOriginal.y1)); + (maskRect.x * scaleX - bboxOriginal.p1.x), 0, scaleY, + (maskRect.y * scaleY - bboxOriginal.p1.y)); cv::Size bboxSize(static_cast(std::round(bboxOriginal.width())), static_cast(std::round(bboxOriginal.height()))); diff --git a/packages/react-native-executorch/common/rnexecutorch/models/ocr/RecognitionHandler.cpp b/packages/react-native-executorch/common/rnexecutorch/models/ocr/RecognitionHandler.cpp index dfde737655..4725154a86 100644 --- 
a/packages/react-native-executorch/common/rnexecutorch/models/ocr/RecognitionHandler.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/ocr/RecognitionHandler.cpp @@ -57,12 +57,20 @@ void RecognitionHandler::processBBox(std::vector &boxList, we want to return the boxes shifted and rescaled to match the original image dimensions. */ - for (auto &point : box.bbox) { - point.x = (point.x - ratioAndPadding.left) * ratioAndPadding.resizeRatio; - point.y = (point.y - ratioAndPadding.top) * ratioAndPadding.resizeRatio; - } + const float ratio = ratioAndPadding.resizeRatio; + const float padLeft = static_cast(ratioAndPadding.left); + const float padTop = static_cast(ratioAndPadding.top); + auto tx = [&](types::Point p) -> types::Point { + return {(p.x - padLeft) * ratio, (p.y - padTop) * ratio}; + }; + std::array corners = { + tx(box.bbox.p1), + tx({box.bbox.p2.x, box.bbox.p1.y}), + tx(box.bbox.p2), + tx({box.bbox.p1.x, box.bbox.p2.y}), + }; boxList.emplace_back( - box.bbox, + corners, converter.decodeGreedy(predictionIndices, predictionIndices.size())[0], confidenceScore); } diff --git a/packages/react-native-executorch/common/rnexecutorch/models/ocr/Types.h b/packages/react-native-executorch/common/rnexecutorch/models/ocr/Types.h index bb0a24aad1..664efa400c 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/ocr/Types.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/ocr/Types.h @@ -1,14 +1,12 @@ #pragma once #include +#include #include #include namespace rnexecutorch::models::ocr::types { -struct Point { - float x; - float y; -}; +using namespace rnexecutorch::utils::computer_vision; struct ValuesAndIndices { std::vector values; @@ -16,7 +14,7 @@ struct ValuesAndIndices { }; struct DetectorBBox { - std::array bbox; + BBox bbox; float angle; }; diff --git a/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/DetectorUtils.cpp 
b/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/DetectorUtils.cpp index 7614e97a1f..e398201d28 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/DetectorUtils.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/DetectorUtils.cpp @@ -8,23 +8,8 @@ #include namespace rnexecutorch::models::ocr::utils { -std::array -cvPointsFromPoints(const std::array &points) { - std::array cvPoints; -#pragma unroll - for (std::size_t i = 0; i < cvPoints.size(); ++i) { - cvPoints[i] = cv::Point2f(points[i].x, points[i].y); - } - return cvPoints; -} - -std::array pointsFromCvPoints(cv::Point2f cvPoints[4]) { - std::array points; -#pragma unroll - for (std::size_t i = 0; i < points.size(); ++i) { - points[i] = {.x = cvPoints[i].x, .y = cvPoints[i].y}; - } - return points; +static std::array bboxToCorners(const types::BBox &bbox) { + return {bbox.p1, {bbox.p2.x, bbox.p1.y}, bbox.p2, {bbox.p1.x, bbox.p2.y}}; } std::pair interleavedArrayToMats(std::span data, @@ -99,8 +84,16 @@ extractMinAreaBBoxFromContour(const std::vector contour) { std::array vertices; minRect.points(vertices.data()); - std::array points = pointsFromCvPoints(vertices.data()); - return {.bbox = points, .angle = minRect.angle}; + float minX = + std::min({vertices[0].x, vertices[1].x, vertices[2].x, vertices[3].x}); + float minY = + std::min({vertices[0].y, vertices[1].y, vertices[2].y, vertices[3].y}); + float maxX = + std::max({vertices[0].x, vertices[1].x, vertices[2].x, vertices[3].x}); + float maxY = + std::max({vertices[0].y, vertices[1].y, vertices[2].y, vertices[3].y}); + types::BBox bbox = {{minX, minY}, {maxX, maxY}}; + return {.bbox = bbox, .angle = minRect.angle}; } void getBoxFromContour(cv::Mat &segMap, @@ -296,10 +289,7 @@ float calculateRestoreRatio(int32_t currentSize, int32_t desiredSize) { void restoreBboxRatio(std::vector &boxes, float restoreRatio) { for (auto &box : boxes) { - for (auto &point : box.bbox) { - point.x 
*= restoreRatio; - point.y *= restoreRatio; - } + box.bbox = box.bbox.scale(restoreRatio, restoreRatio); } } @@ -318,36 +308,16 @@ types::Point midpointBetweenPoint(const types::Point &p1, return {.x = std::midpoint(p1.x, p2.x), .y = std::midpoint(p1.y, p2.y)}; } -types::Point centerOfBox(const std::array &box) { - return midpointBetweenPoint(box[0], box[2]); +types::Point centerOfBox(const types::BBox &box) { + return midpointBetweenPoint(box.p1, box.p2); } -// function for both; finding maximal side length and minimal side length -template -float findExtremeSideLength(const std::array &points, - Compare comp) { - float extremeLength = distanceFromPoint(points[0], points[1]); - -#pragma unroll - for (std::size_t i = 1; i < points.size(); i++) { - const auto ¤tPoint = points[i]; - const auto &nextPoint = points[(i + 1) % points.size()]; - const float sideLength = distanceFromPoint(currentPoint, nextPoint); - - if (comp(sideLength, extremeLength)) { - extremeLength = sideLength; - } - } - - return extremeLength; +float minSideLength(const types::BBox &bbox) { + return std::min(bbox.width(), bbox.height()); } -float minSideLength(const std::array &points) { - return findExtremeSideLength(points, std::less{}); -} - -float maxSideLength(const std::array &points) { - return findExtremeSideLength(points, std::greater{}); +float maxSideLength(const types::BBox &bbox) { + return std::max(bbox.width(), bbox.height()); } /** @@ -366,8 +336,8 @@ float maxSideLength(const std::array &points) { * - a bool indicating whether the line is * considered vertical. 
*/ -std::tuple -fitLineToShortestSides(const std::array &points) { +std::tuple fitLineToShortestSides(const types::BBox &bbox) { + const std::array points = bboxToCorners(bbox); std::array, 4> sides; std::array midpoints; #pragma unroll @@ -414,37 +384,35 @@ fitLineToShortestSides(const std::array &points) { return {m, c, isVertical}; } -std::array rotateBox(const std::array &box, - float angle) { - const types::Point center = centerOfBox(box); - +types::BBox rotateBox(const types::BBox &bbox, float angle) { + const types::Point center = centerOfBox(bbox); const float radians = angle * M_PI / 180.0f; - std::array rotatedPoints; - for (std::size_t i = 0; i < box.size(); ++i) { - const types::Point &point = box[i]; - const float translatedX = point.x - center.x; - const float translatedY = point.y - center.y; - - const float rotatedX = - translatedX * std::cos(radians) - translatedY * std::sin(radians); - const float rotatedY = - translatedX * std::sin(radians) + translatedY * std::cos(radians); - - rotatedPoints[i] = {.x = rotatedX + center.x, .y = rotatedY + center.y}; + float minX = std::numeric_limits::max(); + float minY = std::numeric_limits::max(); + float maxX = std::numeric_limits::lowest(); + float maxY = std::numeric_limits::lowest(); + + for (const auto &p : bboxToCorners(bbox)) { + const float tx = p.x - center.x; + const float ty = p.y - center.y; + const float rx = tx * std::cos(radians) - ty * std::sin(radians) + center.x; + const float ry = tx * std::sin(radians) + ty * std::cos(radians) + center.y; + minX = std::min(minX, rx); + minY = std::min(minY, ry); + maxX = std::max(maxX, rx); + maxY = std::max(maxY, ry); } - return rotatedPoints; + return {{minX, minY}, {maxX, maxY}}; } -float calculateMinimalDistanceBetweenBox( - const std::array &box1, - const std::array &box2) { +float calculateMinimalDistanceBetweenBox(const types::BBox &box1, + const types::BBox &box2) { float minDistance = std::numeric_limits::max(); - for (const types::Point &corner1 
: box1) { - for (const types::Point &corner2 : box2) { - const float distance = distanceFromPoint(corner1, corner2); - minDistance = std::min(distance, minDistance); + for (const auto &c1 : bboxToCorners(box1)) { + for (const auto &c2 : bboxToCorners(box2)) { + minDistance = std::min(minDistance, distanceFromPoint(c1, c2)); } } return minDistance; @@ -466,66 +434,15 @@ float calculateMinimalDistanceBetweenBox( * 4. The points are ordered starting from the top-left in a clockwise manner: * top-left, top-right, bottom-right, bottom-left. */ -std::array -orderPointsClockwise(const std::array &points) { - types::Point topLeft, topRight, bottomRight, bottomLeft; - float minSum = std::numeric_limits::max(); - float maxSum = std::numeric_limits::lowest(); - float minDiff = std::numeric_limits::max(); - float maxDiff = std::numeric_limits::lowest(); - - for (const auto &pt : points) { - const float sum = pt.x + pt.y; - const float diff = pt.y - pt.x; - - if (sum < minSum) { - minSum = sum; - topLeft = pt; - } - if (sum > maxSum) { - maxSum = sum; - bottomRight = pt; - } - if (diff < minDiff) { - minDiff = diff; - topRight = pt; - } - if (diff > maxDiff) { - maxDiff = diff; - bottomLeft = pt; - } - } - - return {topLeft, topRight, bottomRight, bottomLeft}; +types::BBox orderPointsClockwise(const types::BBox &bbox) { + return {{std::min(bbox.p1.x, bbox.p2.x), std::min(bbox.p1.y, bbox.p2.y)}, + {std::max(bbox.p1.x, bbox.p2.x), std::max(bbox.p1.y, bbox.p2.y)}}; } -std::array -mergeRotatedBoxes(std::array &box1, - std::array &box2) { - box1 = orderPointsClockwise(box1); - box2 = orderPointsClockwise(box2); - - auto points1 = cvPointsFromPoints(box1); - auto points2 = cvPointsFromPoints(box2); - - std::array allPoints; - std::copy(points1.begin(), points1.end(), allPoints.begin()); - std::copy(points2.begin(), points2.end(), allPoints.begin() + points1.size()); - - std::vector hullIndices; - cv::convexHull(allPoints, hullIndices, false); - - std::vector hullPoints; - for 
(int32_t idx : hullIndices) { - hullPoints.push_back(allPoints[idx]); - } - - cv::RotatedRect minAreaRect = cv::minAreaRect(hullPoints); - - std::array rectPoints; - minAreaRect.points(rectPoints.data()); - - return pointsFromCvPoints(rectPoints.data()); +types::BBox mergeRotatedBoxes(const types::BBox &box1, + const types::BBox &box2) { + return {{std::min(box1.p1.x, box2.p1.x), std::min(box1.p1.y, box2.p1.y)}, + {std::max(box1.p2.x, box2.p2.x), std::max(box1.p2.y, box2.p2.y)}}; } /** @@ -555,8 +472,8 @@ mergeRotatedBoxes(std::array &box1, std::optional> findClosestBox(const std::vector &boxes, const std::unordered_set &ignoredIdxs, - const std::array ¤tBox, bool isVertical, - float m, float c, float centerThreshold) { + const types::BBox ¤tBox, bool isVertical, float m, float c, + float centerThreshold) { float smallestDistance = std::numeric_limits::max(); ssize_t idx = -1; float boxHeight = 0.0f; @@ -566,7 +483,7 @@ findClosestBox(const std::vector &boxes, if (ignoredIdxs.contains(i)) { continue; } - std::array bbox = boxes[i].bbox; + const types::BBox &bbox = boxes[i].bbox; const types::Point centerOfProcessedBox = centerOfBox(bbox); const float distanceBetweenCenters = distanceFromPoint(centerOfCurrentBox, centerOfProcessedBox); @@ -616,11 +533,7 @@ removeSmallBoxesFromArray(const std::vector &boxes, return filteredBoxes; } -static float minimumYFromBox(const std::array &box) { - return std::ranges::min_element( - box, [](types::Point a, types::Point b) { return a.y < b.y; }) - ->y; -} +static float minimumYFromBox(const types::BBox &bbox) { return bbox.p1.y; } std::vector groupTextBoxes(std::vector &boxes, float centerThreshold, diff --git a/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/RecognitionHandlerUtils.cpp b/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/RecognitionHandlerUtils.cpp index 41c3f78187..0e50f1c038 100644 --- 
a/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/RecognitionHandlerUtils.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/RecognitionHandlerUtils.cpp @@ -42,12 +42,12 @@ void computeRatioAndResize(cv::Mat &img, cv::Size size, int32_t modelHeight) { cv::Mat cropImage(types::DetectorBBox box, cv::Mat &image, int32_t modelHeight) { - // Convert custom points to cv::Point2f - std::array points; -#pragma unroll - for (std::size_t i = 0; i < points.size(); ++i) { - points[i] = cv::Point2f(box.bbox[i].x, box.bbox[i].y); - } + const std::array points = {{ + {box.bbox.p1.x, box.bbox.p1.y}, + {box.bbox.p2.x, box.bbox.p1.y}, + {box.bbox.p2.x, box.bbox.p2.y}, + {box.bbox.p1.x, box.bbox.p2.y}, + }}; cv::RotatedRect rotatedRect = cv::minAreaRect(points); cv::Point2f rectPoints[4]; diff --git a/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/RecognizerUtils.cpp b/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/RecognizerUtils.cpp index 89a84fa676..e959739ab1 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/RecognizerUtils.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/RecognizerUtils.cpp @@ -1,8 +1,11 @@ #include "RecognizerUtils.h" #include #include +#include namespace rnexecutorch::models::ocr::utils { +using namespace rnexecutorch::utils::computer_vision; + cv::Mat softmax(const cv::Mat &inputs) { cv::Mat maxVal; cv::reduce(inputs, maxVal, 1, cv::REDUCE_MAX, CV_32F); @@ -80,9 +83,10 @@ float confidenceScore(const std::vector &values, return std::pow(product, exponent); } -cv::Rect extractBoundingBox(std::array &points) { - cv::Mat pointsMat(4, 1, CV_32FC2, points.data()); - return cv::boundingRect(pointsMat); +cv::Rect extractBoundingBox(const BBox &bbox) { + return cv::Rect(static_cast(bbox.p1.x), static_cast(bbox.p1.y), + static_cast(bbox.width()), + static_cast(bbox.height())); } cv::Mat characterBitMask(const 
cv::Mat &img) { @@ -157,22 +161,22 @@ cv::Mat characterBitMask(const cv::Mat &img) { return resultImage; } -cv::Mat -cropImageWithBoundingBox(const cv::Mat &img, - const std::array &bbox, - const std::array &originalBbox, - const types::PaddingInfo &paddings, - const types::PaddingInfo &originalPaddings) { - if (originalBbox.empty()) { +cv::Mat cropImageWithBoundingBox(const cv::Mat &img, const BBox &bbox, + const BBox &originalBbox, + const types::PaddingInfo &paddings, + const types::PaddingInfo &originalPaddings) { + if (!originalBbox.isValid()) { throw RnExecutorchError(RnExecutorchErrorCode::UnknownError, "Original bounding box cannot be empty."); } - const types::Point topLeft = originalBbox[0]; + const types::Point topLeft = originalBbox.p1; + const std::array bboxCorners = { + bbox.p1, {bbox.p2.x, bbox.p1.y}, bbox.p2, {bbox.p1.x, bbox.p2.y}}; std::vector points; - points.reserve(bbox.size()); + points.reserve(4); - for (const auto &point : bbox) { + for (const auto &point : bboxCorners) { types::Point transformedPoint = point; transformedPoint.x -= paddings.left; @@ -202,9 +206,8 @@ cropImageWithBoundingBox(const cv::Mat &img, return croppedImage; } -cv::Mat prepareForRecognition(const cv::Mat &originalImage, - const std::array &bbox, - const std::array &originalBbox, +cv::Mat prepareForRecognition(const cv::Mat &originalImage, const BBox &bbox, + const BBox &originalBbox, const types::PaddingInfo &paddings, const types::PaddingInfo &originalPaddings) { auto croppedChar = cropImageWithBoundingBox(originalImage, bbox, originalBbox, diff --git a/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/RecognizerUtils.h b/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/RecognizerUtils.h index 71e3a9c25e..d693193386 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/RecognizerUtils.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/RecognizerUtils.h @@ -1,6 +1,5 @@ 
#pragma once -#include #include #include #include @@ -21,7 +20,7 @@ cv::Mat softmax(const cv::Mat &inputs); types::ValuesAndIndices findMaxValuesIndices(const cv::Mat &mat); std::vector sumProbabilityRows(const cv::Mat &matrix); void divideMatrixByRows(cv::Mat &matrix, const std::vector &rowSums); -cv::Rect extractBoundingBox(std::array &points); +cv::Rect extractBoundingBox(const types::BBox &bbox); /** * @brief Computes confidence score for given values and indices vectors. @@ -43,12 +42,10 @@ cv::Mat characterBitMask(const cv::Mat &img); * with internal bounding box and padding. * It does so to preserve the best possible image quality. */ -cv::Mat -cropImageWithBoundingBox(const cv::Mat &img, - const std::array &bbox, - const std::array &originalBbox, - const types::PaddingInfo &paddings, - const types::PaddingInfo &originalPaddings); +cv::Mat cropImageWithBoundingBox(const cv::Mat &img, const types::BBox &bbox, + const types::BBox &originalBbox, + const types::PaddingInfo &paddings, + const types::PaddingInfo &originalPaddings); /** * @brief Perform cropping, resizing and convert to grayscale to prepare image @@ -62,10 +59,9 @@ cropImageWithBoundingBox(const cv::Mat &img, * * @details it utilizes cropImageWithBoundingBox to perform specific cropping. 
*/ - cv::Mat prepareForRecognition(const cv::Mat &originalImage, - const std::array &bbox, - const std::array &originalBbox, + const types::BBox &bbox, + const types::BBox &originalBbox, const types::PaddingInfo &paddings, const types::PaddingInfo &originalPaddings); } // namespace rnexecutorch::models::ocr::utils diff --git a/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/Types.h b/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/Types.h index 7d671ab7bb..6a7cdc2e88 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/Types.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/Types.h @@ -1,18 +1,12 @@ #pragma once -#include +#include #include namespace rnexecutorch::models::pose_estimation { -// Single keypoint (x, y) -struct Keypoint { - int32_t x; - int32_t y; -}; - // N keypoints for one person, depending on the model in question -using PersonKeypoints = std::vector; +using PersonKeypoints = std::vector; // N people for each image using PoseDetections = std::vector; diff --git a/packages/react-native-executorch/common/rnexecutorch/models/vertical_ocr/VerticalOCR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/vertical_ocr/VerticalOCR.cpp index 88a027d01b..6338b5f25b 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/vertical_ocr/VerticalOCR.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/vertical_ocr/VerticalOCR.cpp @@ -187,10 +187,8 @@ types::OCRDetection VerticalOCR::_processSingleTextBox( float confidenceScore = 0.0; if (!characterBoxes.empty()) { // Prepare information useful for proper boxes shifting and image cropping. 
- const int32_t boxWidth = - static_cast(box.bbox[2].x - box.bbox[0].x); - const int32_t boxHeight = - static_cast(box.bbox[2].y - box.bbox[0].y); + const int32_t boxWidth = static_cast(box.bbox.width()); + const int32_t boxHeight = static_cast(box.bbox.height()); cv::Size narrowRecognizerSize = detector.calculateModelImageSize(constants::kSmallDetectorWidth); types::PaddingInfo paddingsBox = utils::calculateResizeRatioAndPaddings( @@ -205,13 +203,18 @@ types::OCRDetection VerticalOCR::_processSingleTextBox( paddingsBox, imagePaddings); } // Modify the returned boxes to match the original image size - std::array finalBbox; - for (size_t i = 0; i < box.bbox.size(); ++i) { - finalBbox[i].x = - (box.bbox[i].x - imagePaddings.left) * imagePaddings.resizeRatio; - finalBbox[i].y = - (box.bbox[i].y - imagePaddings.top) * imagePaddings.resizeRatio; - } + const float ratio = imagePaddings.resizeRatio; + const float padLeft = static_cast(imagePaddings.left); + const float padTop = static_cast(imagePaddings.top); + auto tx = [&](types::Point p) -> types::Point { + return {(p.x - padLeft) * ratio, (p.y - padTop) * ratio}; + }; + std::array finalBbox = { + tx(box.bbox.p1), + tx({box.bbox.p2.x, box.bbox.p1.y}), + tx(box.bbox.p2), + tx({box.bbox.p1.x, box.bbox.p2.y}), + }; return {finalBbox, text, confidenceScore}; } diff --git a/packages/react-native-executorch/common/rnexecutorch/utils/FrameTransform.cpp b/packages/react-native-executorch/common/rnexecutorch/utils/FrameTransform.cpp index 80425c2dab..e9cf1e9d73 100644 --- a/packages/react-native-executorch/common/rnexecutorch/utils/FrameTransform.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/utils/FrameTransform.cpp @@ -48,23 +48,19 @@ void inverseRotateBbox(computer_vision::BBox &bbox, switch (orient.orientation) { case Orientation::Up: { // landscape-left → portrait: nx = h - y, ny = x - float nx1 = h - bbox.y2, ny1 = bbox.x1; - float nx2 = h - bbox.y1, ny2 = bbox.x2; - bbox.x1 = nx1; - bbox.y1 = ny1; - 
bbox.x2 = nx2; - bbox.y2 = ny2; + float nx1 = h - bbox.p2.y, ny1 = bbox.p1.x; + float nx2 = h - bbox.p1.y, ny2 = bbox.p2.x; + bbox.p1 = {nx1, ny1}; + bbox.p2 = {nx2, ny2}; break; } case Orientation::Right: { #if defined(__APPLE__) // iOS upside-down portrait → portrait: nx = w - x, ny = h - y - float nx1 = w - bbox.x2, ny1 = h - bbox.y2; - float nx2 = w - bbox.x1, ny2 = h - bbox.y1; - bbox.x1 = nx1; - bbox.y1 = ny1; - bbox.x2 = nx2; - bbox.y2 = ny2; + float nx1 = w - bbox.p2.x, ny1 = h - bbox.p2.y; + float nx2 = w - bbox.p1.x, ny2 = h - bbox.p1.y; + bbox.p1 = {nx1, ny1}; + bbox.p2 = {nx2, ny2}; #endif // Android front-cam upright portrait: rotated frame already in screen // space, no inverse needed. @@ -72,12 +68,10 @@ void inverseRotateBbox(computer_vision::BBox &bbox, } case Orientation::Down: { // landscape-right → portrait: nx = y, ny = w - x - float nx1 = bbox.y1, ny1 = w - bbox.x2; - float nx2 = bbox.y2, ny2 = w - bbox.x1; - bbox.x1 = nx1; - bbox.y1 = ny1; - bbox.x2 = nx2; - bbox.y2 = ny2; + float nx1 = bbox.p1.y, ny1 = w - bbox.p2.x; + float nx2 = bbox.p2.y, ny2 = w - bbox.p1.x; + bbox.p1 = {nx1, ny1}; + bbox.p2 = {nx2, ny2}; break; } case Orientation::Left: @@ -93,12 +87,10 @@ void inverseRotateBbox(computer_vision::BBox &bbox, orient.orientation == Orientation::Down); float sw = swapped ? h : w; float sh = swapped ? 
w : h; - float nx1 = sw - bbox.x2, ny1 = sh - bbox.y2; - float nx2 = sw - bbox.x1, ny2 = sh - bbox.y1; - bbox.x1 = nx1; - bbox.y1 = ny1; - bbox.x2 = nx2; - bbox.y2 = ny2; + float nx1 = sw - bbox.p2.x, ny1 = sh - bbox.p2.y; + float nx2 = sw - bbox.p1.x, ny2 = sh - bbox.p1.y; + bbox.p1 = {nx1, ny1}; + bbox.p2 = {nx2, ny2}; } #endif } diff --git a/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Processing.cpp b/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Processing.cpp index 108fd6ff8a..8ced125ecf 100644 --- a/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Processing.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Processing.cpp @@ -1,14 +1,13 @@ #include "Processing.h" #include -#include namespace rnexecutorch::utils::computer_vision { float computeIoU(const BBox &a, const BBox &b) { - float x1 = std::max(a.x1, b.x1); - float y1 = std::max(a.y1, b.y1); - float x2 = std::min(a.x2, b.x2); - float y2 = std::min(a.y2, b.y2); + float x1 = std::max(a.p1.x, b.p1.x); + float y1 = std::max(a.p1.y, b.p1.y); + float x2 = std::min(a.p2.x, b.p2.x); + float y2 = std::min(a.p2.y, b.p2.y); float intersectionArea = std::max(0.0f, x2 - x1) * std::max(0.0f, y2 - y1); float areaA = a.area(); diff --git a/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Types.h b/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Types.h index 8899d3b87c..943d0b1c91 100644 --- a/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Types.h +++ b/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Types.h @@ -4,24 +4,26 @@ namespace rnexecutorch::utils::computer_vision { -struct BBox { - - float width() const { return x2 - x1; } - - float height() const { return y2 - y1; } +struct Point { + float x; + float y; +}; +struct BBox { + float width() const { return p2.x - p1.x; } + float height() 
const { return p2.y - p1.y; } float area() const { return width() * height(); } bool isValid() const { - return x2 > x1 && y2 > y1 && x1 >= 0.0f && y1 >= 0.0f; + return p2.x > p1.x && p2.y > p1.y && p1.x >= 0.0f && p1.y; } BBox scale(float widthRatio, float heightRatio) const { - return {x1 * widthRatio, y1 * heightRatio, x2 * widthRatio, - y2 * heightRatio}; + return {{p1.x * widthRatio, p1.y * heightRatio}, + {p2.x * widthRatio, p2.y * heightRatio}}; } - float x1, y1, x2, y2; + Point p1, p2; }; template From 91c483ee8347b820c59fc17d2b0d51668dab6d99 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 7 May 2026 19:07:23 +0000 Subject: [PATCH 2/4] refactor(ocr): reduce OCR bounding boxes from 4 vertices to 2-point AABB Resolves #760. The OCR and VerticalOCR pipelines previously exposed all four rotated-rectangle corners in OCRDetection.bbox. Two points (top-left and bottom-right of the axis-aligned bounding box) are sufficient for downstream rendering and are simpler to consume. Changes: - Types.h: shrink OCRDetection.bbox from std::array to std::array - RecognitionHandler.cpp: compute AABB (min/max x,y) over the four detector corners instead of forwarding them verbatim - VerticalOCR.cpp: same AABB reduction in _processSingleTextBox - OCR.cpp / VerticalOCR.cpp generateFromFrame: re-normalize the two bbox corners after inverseRotatePoints to guarantee bbox[0] <= bbox[1] - JsiConversions.h: serialize 2 points instead of 4 to JavaScript - OCRTest.cpp / VerticalOCRTest.cpp: assert size==2 and that bbox[1] >= bbox[0] - ocr.ts: narrow TypeScript type from Point[] to [Point,Point] and update docs --- .../host_objects/JsiConversions.h | 4 +-- .../common/rnexecutorch/models/ocr/OCR.cpp | 8 ++++++ .../models/ocr/RecognitionHandler.cpp | 17 +++++------- .../common/rnexecutorch/models/ocr/Types.h | 2 +- .../models/vertical_ocr/VerticalOCR.cpp | 27 +++++++++---------- .../tests/integration/OCRTest.cpp | 12 ++++----- .../tests/integration/VerticalOCRTest.cpp | 22 ++++++++------- 
.../react-native-executorch/src/types/ocr.ts | 5 ++-- 8 files changed, 51 insertions(+), 46 deletions(-) diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h index 247780b6f5..9c6642a6cd 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h @@ -526,9 +526,9 @@ getJsiValue(const std::vector &detections, auto jsiDetectionObject = jsi::Object(runtime); - auto jsiBboxArray = jsi::Array(runtime, 4); + auto jsiBboxArray = jsi::Array(runtime, 2); #pragma unroll - for (size_t j = 0; j < 4u; ++j) { + for (size_t j = 0; j < 2u; ++j) { auto jsiPointObject = jsi::Object(runtime); jsiPointObject.setProperty(runtime, "x", detection.bbox[j].x); jsiPointObject.setProperty(runtime, "y", detection.bbox[j].y); diff --git a/packages/react-native-executorch/common/rnexecutorch/models/ocr/OCR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/ocr/OCR.cpp index 3c08d16daa..60887e0f7b 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/ocr/OCR.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/ocr/OCR.cpp @@ -1,5 +1,6 @@ #include "OCR.h" #include "Constants.h" +#include #include #include #include @@ -69,6 +70,13 @@ OCR::generateFromFrame(jsi::Runtime &runtime, const jsi::Value &frameData) { for (auto &det : detections) { ::rnexecutorch::utils::inverseRotatePoints(det.bbox, orient, rotated.size()); + // Re-normalize to a proper AABB after the coordinate rotation. 
+ float minX = std::min(det.bbox[0].x, det.bbox[1].x); + float minY = std::min(det.bbox[0].y, det.bbox[1].y); + float maxX = std::max(det.bbox[0].x, det.bbox[1].x); + float maxY = std::max(det.bbox[0].y, det.bbox[1].y); + det.bbox[0] = {minX, minY}; + det.bbox[1] = {maxX, maxY}; } return detections; } diff --git a/packages/react-native-executorch/common/rnexecutorch/models/ocr/RecognitionHandler.cpp b/packages/react-native-executorch/common/rnexecutorch/models/ocr/RecognitionHandler.cpp index 4725154a86..e70e46ab9c 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/ocr/RecognitionHandler.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/ocr/RecognitionHandler.cpp @@ -55,22 +55,17 @@ void RecognitionHandler::processBBox(std::vector &boxList, /* Since the boxes were corresponding to the image resized to 1280x1280, we want to return the boxes shifted and rescaled to match the original - image dimensions. + image dimensions. Compute the axis-aligned bounding box (AABB) from the + four rotated corners and store only the top-left and bottom-right points. 
*/ const float ratio = ratioAndPadding.resizeRatio; const float padLeft = static_cast(ratioAndPadding.left); const float padTop = static_cast(ratioAndPadding.top); - auto tx = [&](types::Point p) -> types::Point { - return {(p.x - padLeft) * ratio, (p.y - padTop) * ratio}; - }; - std::array corners = { - tx(box.bbox.p1), - tx({box.bbox.p2.x, box.bbox.p1.y}), - tx(box.bbox.p2), - tx({box.bbox.p1.x, box.bbox.p2.y}), - }; + types::BBox transformedBbox{ + {(box.bbox.p1.x - padLeft) * ratio, (box.bbox.p1.y - padTop) * ratio}, + {(box.bbox.p2.x - padLeft) * ratio, (box.bbox.p2.y - padTop) * ratio}}; boxList.emplace_back( - corners, + transformedBbox, converter.decodeGreedy(predictionIndices, predictionIndices.size())[0], confidenceScore); } diff --git a/packages/react-native-executorch/common/rnexecutorch/models/ocr/Types.h b/packages/react-native-executorch/common/rnexecutorch/models/ocr/Types.h index 664efa400c..e3d38c33bb 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/ocr/Types.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/ocr/Types.h @@ -25,7 +25,7 @@ struct PaddingInfo { }; struct OCRDetection { - std::array bbox; + std::array bbox; std::string text; float score; }; diff --git a/packages/react-native-executorch/common/rnexecutorch/models/vertical_ocr/VerticalOCR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/vertical_ocr/VerticalOCR.cpp index 6338b5f25b..c0a531ecd3 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/vertical_ocr/VerticalOCR.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/vertical_ocr/VerticalOCR.cpp @@ -1,4 +1,5 @@ #include "VerticalOCR.h" +#include #include #include #include @@ -71,8 +72,12 @@ VerticalOCR::generateFromFrame(jsi::Runtime &runtime, cv::Mat rotated = ::rnexecutorch::utils::rotateFrameForModel(bgr, orient); auto detections = runInference(rotated); for (auto &det : detections) { - 
::rnexecutorch::utils::inverseRotatePoints(det.bbox, orient, - rotated.size()); + std::array corners = {det.bbox.p1, det.bbox.p2}; + ::rnexecutorch::utils::inverseRotatePoints(corners, orient, rotated.size()); + det.bbox = {{std::min(corners[0].x, corners[1].x), + std::min(corners[0].y, corners[1].y)}, + {std::max(corners[0].x, corners[1].x), + std::max(corners[0].y, corners[1].y)}}; } return detections; } @@ -202,21 +207,15 @@ types::OCRDetection VerticalOCR::_processSingleTextBox( : _handleJointCharacters(box, originalImage, characterBoxes, paddingsBox, imagePaddings); } - // Modify the returned boxes to match the original image size + // Modify the returned boxes to match the original image size. const float ratio = imagePaddings.resizeRatio; const float padLeft = static_cast(imagePaddings.left); const float padTop = static_cast(imagePaddings.top); - auto tx = [&](types::Point p) -> types::Point { - return {(p.x - padLeft) * ratio, (p.y - padTop) * ratio}; - }; - std::array finalBbox = { - tx(box.bbox.p1), - tx({box.bbox.p2.x, box.bbox.p1.y}), - tx(box.bbox.p2), - tx({box.bbox.p1.x, box.bbox.p2.y}), - }; - - return {finalBbox, text, confidenceScore}; + types::BBox transformedBbox{ + {(box.bbox.p1.x - padLeft) * ratio, (box.bbox.p1.y - padTop) * ratio}, + {(box.bbox.p2.x - padLeft) * ratio, (box.bbox.p2.y - padTop) * ratio}}; + + return {transformedBbox, text, confidenceScore}; } void VerticalOCR::unload() noexcept { diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/OCRTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/OCRTest.cpp index 072c761164..de995fa9f9 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/OCRTest.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/OCRTest.cpp @@ -100,12 +100,12 @@ TEST(OCRGenerateTests, DetectionsHaveValidBoundingBoxes) { auto results = model.generateFromString(kValidTestImagePath); for (const auto 
&detection : results) { - // Each bbox should have 4 points - EXPECT_EQ(detection.bbox.size(), 4u); - for (const auto &point : detection.bbox) { - EXPECT_GE(point.x, 0.0f); - EXPECT_GE(point.y, 0.0f); - } + // Each bbox has 2 points: top-left [0] and bottom-right [1] + EXPECT_EQ(detection.bbox.size(), 2u); + EXPECT_GE(detection.bbox[0].x, 0.0f); + EXPECT_GE(detection.bbox[0].y, 0.0f); + EXPECT_GE(detection.bbox[1].x, detection.bbox[0].x); + EXPECT_GE(detection.bbox[1].y, detection.bbox[0].y); } } diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/VerticalOCRTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/VerticalOCRTest.cpp index fd6d59441d..f409926b83 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/VerticalOCRTest.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/VerticalOCRTest.cpp @@ -117,11 +117,12 @@ TEST(VerticalOCRGenerateTests, IndependentCharsDetectionsHaveValidBBoxes) { auto results = model.generateFromString(kValidVerticalTestImagePath); for (const auto &detection : results) { - EXPECT_EQ(detection.bbox.size(), 4u); - for (const auto &point : detection.bbox) { - EXPECT_GE(point.x, 0.0f); - EXPECT_GE(point.y, 0.0f); - } + // Each bbox has 2 points: top-left [0] and bottom-right [1] + EXPECT_EQ(detection.bbox.size(), 2u); + EXPECT_GE(detection.bbox[0].x, 0.0f); + EXPECT_GE(detection.bbox[0].y, 0.0f); + EXPECT_GE(detection.bbox[1].x, detection.bbox[0].x); + EXPECT_GE(detection.bbox[1].y, detection.bbox[0].y); } } @@ -180,11 +181,12 @@ TEST(VerticalOCRGenerateTests, JointCharsDetectionsHaveValidBBoxes) { auto results = model.generateFromString(kValidVerticalTestImagePath); for (const auto &detection : results) { - EXPECT_EQ(detection.bbox.size(), 4u); - for (const auto &point : detection.bbox) { - EXPECT_GE(point.x, 0.0f); - EXPECT_GE(point.y, 0.0f); - } + // Each bbox has 2 points: top-left [0] and bottom-right [1] + 
EXPECT_EQ(detection.bbox.size(), 2u); + EXPECT_GE(detection.bbox[0].x, 0.0f); + EXPECT_GE(detection.bbox[0].y, 0.0f); + EXPECT_GE(detection.bbox[1].x, detection.bbox[0].x); + EXPECT_GE(detection.bbox[1].y, detection.bbox[0].y); } } diff --git a/packages/react-native-executorch/src/types/ocr.ts b/packages/react-native-executorch/src/types/ocr.ts index d2f3781095..16f9fbcff1 100644 --- a/packages/react-native-executorch/src/types/ocr.ts +++ b/packages/react-native-executorch/src/types/ocr.ts @@ -6,12 +6,13 @@ import { Frame, PixelData, ResourceSource } from './common'; * OCRDetection represents a single detected text instance in an image, * including its bounding box, recognized text, and confidence score. * @category Types - * @property {Point[]} bbox - An array of points defining the bounding box around the detected text. + * @property {[Point, Point]} bbox - A tuple of two points defining the axis-aligned bounding box + * around the detected text: `bbox[0]` is the top-left corner and `bbox[1]` is the bottom-right corner. * @property {string} text - The recognized text within the bounding box. * @property {number} score - The confidence score of the OCR detection, ranging from 0 to 1. 
*/ export interface OCRDetection { - bbox: Point[]; + bbox: [Point, Point]; text: string; score: number; } From c4fcf545f2b5eb9cc75c90306179b32ec8f3c16c Mon Sep 17 00:00:00 2001 From: chmjkb Date: Fri, 8 May 2026 14:13:21 +0200 Subject: [PATCH 3/4] chore: change TS types --- .../components/ImageWithOCRBboxes.tsx | 15 +++++++------- .../vision_camera/tasks/OCRTask.tsx | 14 +++++++++---- .../03-hooks/02-computer-vision/useOCR.md | 12 ++++++----- .../02-computer-vision/useVerticalOCR.md | 12 ++++++----- .../host_objects/JsiConversions.h | 12 ++--------- .../common/rnexecutorch/models/ocr/OCR.cpp | 15 ++++++-------- .../common/rnexecutorch/models/ocr/Types.h | 3 +-- .../tests/integration/OCRTest.cpp | 10 ++++------ .../tests/integration/VerticalOCRTest.cpp | 20 ++++++++----------- .../react-native-executorch/src/types/ocr.ts | 17 +++------------- 10 files changed, 56 insertions(+), 74 deletions(-) diff --git a/apps/computer-vision/components/ImageWithOCRBboxes.tsx b/apps/computer-vision/components/ImageWithOCRBboxes.tsx index 1c8fe616af..eb1f9acbf6 100644 --- a/apps/computer-vision/components/ImageWithOCRBboxes.tsx +++ b/apps/computer-vision/components/ImageWithOCRBboxes.tsx @@ -59,13 +59,14 @@ export default function ImageWithOCRBboxes({ {detections.map((detection, index) => { const { scaleX, scaleY, offsetX, offsetY } = calculateAdjustedDimensions(); - const points = detection.bbox.map((point) => ({ - x: point.x * scaleX + offsetX, - y: point.y * scaleY + offsetY, - })); - - const pointsString = points - .map((point) => `${point.x},${point.y}`) + const { x1, y1, x2, y2 } = detection.bbox; + const pointsString = [ + [x1, y1], + [x2, y1], + [x2, y2], + [x1, y2], + ] + .map(([x, y]) => `${x * scaleX + offsetX},${y * scaleY + offsetY}`) .join(' '); return ( diff --git a/apps/computer-vision/components/vision_camera/tasks/OCRTask.tsx b/apps/computer-vision/components/vision_camera/tasks/OCRTask.tsx index fbdb1148e0..3dfa874b43 100644 --- 
a/apps/computer-vision/components/vision_camera/tasks/OCRTask.tsx +++ b/apps/computer-vision/components/vision_camera/tasks/OCRTask.tsx @@ -110,11 +110,17 @@ export default function OCRTask({ style={StyleSheet.absoluteFill} > {detections.map((det, i) => { - const pts = det.bbox - .map((p) => `${p.x * scale + offsetX},${p.y * scale + offsetY}`) + const { x1, y1, x2, y2 } = det.bbox; + const pts = [ + [x1, y1], + [x2, y1], + [x2, y2], + [x1, y2], + ] + .map(([x, y]) => `${x * scale + offsetX},${y * scale + offsetY}`) .join(' '); - const labelX = det.bbox[0]!.x * scale + offsetX; - const labelY = det.bbox[0]!.y * scale + offsetY - 4; + const labelX = x1 * scale + offsetX; + const labelY = y1 * scale + offsetY - 4; return ( diff --git a/docs/docs/03-hooks/02-computer-vision/useOCR.md b/docs/docs/03-hooks/02-computer-vision/useOCR.md index 7754992700..8f015e77a5 100644 --- a/docs/docs/03-hooks/02-computer-vision/useOCR.md +++ b/docs/docs/03-hooks/02-computer-vision/useOCR.md @@ -61,19 +61,21 @@ See the full guide: [VisionCamera Integration](./visioncamera-integration.md). The detection object is specified as follows: ```typescript -interface Point { - x: number; - y: number; +interface Bbox { + x1: number; + y1: number; + x2: number; + y2: number; } interface OCRDetection { - bbox: Point[]; + bbox: Bbox; text: string; score: number; } ``` -The `bbox` property contains information about the bounding box of detected text regions. It is represented as four points, which are corners of detected bounding box. +The `bbox` property contains the axis-aligned bounding box of the detected text region. `x1`/`y1` is the top-left corner and `x2`/`y2` is the bottom-right corner. The `text` property contains the text recognized within detected text region. The `score` represents the confidence score of the recognized text. 
## Example diff --git a/docs/docs/03-hooks/02-computer-vision/useVerticalOCR.md b/docs/docs/03-hooks/02-computer-vision/useVerticalOCR.md index d65c97cf25..ff27b55e17 100644 --- a/docs/docs/03-hooks/02-computer-vision/useVerticalOCR.md +++ b/docs/docs/03-hooks/02-computer-vision/useVerticalOCR.md @@ -69,19 +69,21 @@ See the full guide: [VisionCamera Integration](./visioncamera-integration.md). The detection object is specified as follows: ```typescript -interface Point { - x: number; - y: number; +interface Bbox { + x1: number; + y1: number; + x2: number; + y2: number; } interface OCRDetection { - bbox: Point[]; + bbox: Bbox; text: string; score: number; } ``` -The `bbox` property contains information about the bounding box of detected text regions. It is represented as four points, which are corners of detected bounding box. +The `bbox` property contains the axis-aligned bounding box of the detected text region. `x1`/`y1` is the top-left corner and `x2`/`y2` is the bottom-right corner. The `text` property contains the text recognized within detected text region. The `score` represents the confidence score of the recognized text. 
## Example diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h index 9c6642a6cd..077d426c8f 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h @@ -526,16 +526,8 @@ getJsiValue(const std::vector &detections, auto jsiDetectionObject = jsi::Object(runtime); - auto jsiBboxArray = jsi::Array(runtime, 2); -#pragma unroll - for (size_t j = 0; j < 2u; ++j) { - auto jsiPointObject = jsi::Object(runtime); - jsiPointObject.setProperty(runtime, "x", detection.bbox[j].x); - jsiPointObject.setProperty(runtime, "y", detection.bbox[j].y); - jsiBboxArray.setValueAtIndex(runtime, j, jsiPointObject); - } - - jsiDetectionObject.setProperty(runtime, "bbox", jsiBboxArray); + jsiDetectionObject.setProperty(runtime, "bbox", + getJsiValue(detection.bbox, runtime)); jsiDetectionObject.setProperty( runtime, "text", jsi::String::createFromUtf8(runtime, detection.text)); jsiDetectionObject.setProperty(runtime, "score", detection.score); diff --git a/packages/react-native-executorch/common/rnexecutorch/models/ocr/OCR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/ocr/OCR.cpp index 60887e0f7b..d3e6964a05 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/ocr/OCR.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/ocr/OCR.cpp @@ -68,15 +68,12 @@ OCR::generateFromFrame(jsi::Runtime &runtime, const jsi::Value &frameData) { cv::Mat rotated = ::rnexecutorch::utils::rotateFrameForModel(bgr, orient); auto detections = runInference(rotated); for (auto &det : detections) { - ::rnexecutorch::utils::inverseRotatePoints(det.bbox, orient, - rotated.size()); - // Re-normalize to a proper AABB after the coordinate rotation. 
- float minX = std::min(det.bbox[0].x, det.bbox[1].x); - float minY = std::min(det.bbox[0].y, det.bbox[1].y); - float maxX = std::max(det.bbox[0].x, det.bbox[1].x); - float maxY = std::max(det.bbox[0].y, det.bbox[1].y); - det.bbox[0] = {minX, minY}; - det.bbox[1] = {maxX, maxY}; + std::array corners = {det.bbox.p1, det.bbox.p2}; + ::rnexecutorch::utils::inverseRotatePoints(corners, orient, rotated.size()); + det.bbox = {{std::min(corners[0].x, corners[1].x), + std::min(corners[0].y, corners[1].y)}, + {std::max(corners[0].x, corners[1].x), + std::max(corners[0].y, corners[1].y)}}; } return detections; } diff --git a/packages/react-native-executorch/common/rnexecutorch/models/ocr/Types.h b/packages/react-native-executorch/common/rnexecutorch/models/ocr/Types.h index e3d38c33bb..8e711d382c 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/ocr/Types.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/ocr/Types.h @@ -1,6 +1,5 @@ #pragma once -#include #include #include #include @@ -25,7 +24,7 @@ struct PaddingInfo { }; struct OCRDetection { - std::array bbox; + BBox bbox; std::string text; float score; }; diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/OCRTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/OCRTest.cpp index de995fa9f9..a97e4c2121 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/OCRTest.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/OCRTest.cpp @@ -100,12 +100,10 @@ TEST(OCRGenerateTests, DetectionsHaveValidBoundingBoxes) { auto results = model.generateFromString(kValidTestImagePath); for (const auto &detection : results) { - // Each bbox has 2 points: top-left [0] and bottom-right [1] - EXPECT_EQ(detection.bbox.size(), 2u); - EXPECT_GE(detection.bbox[0].x, 0.0f); - EXPECT_GE(detection.bbox[0].y, 0.0f); - EXPECT_GE(detection.bbox[1].x, detection.bbox[0].x); - 
EXPECT_GE(detection.bbox[1].y, detection.bbox[0].y); + EXPECT_GE(detection.bbox.p1.x, 0.0f); + EXPECT_GE(detection.bbox.p1.y, 0.0f); + EXPECT_GE(detection.bbox.p2.x, detection.bbox.p1.x); + EXPECT_GE(detection.bbox.p2.y, detection.bbox.p1.y); } } diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/VerticalOCRTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/VerticalOCRTest.cpp index f409926b83..c92abc0f15 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/VerticalOCRTest.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/VerticalOCRTest.cpp @@ -117,12 +117,10 @@ TEST(VerticalOCRGenerateTests, IndependentCharsDetectionsHaveValidBBoxes) { auto results = model.generateFromString(kValidVerticalTestImagePath); for (const auto &detection : results) { - // Each bbox has 2 points: top-left [0] and bottom-right [1] - EXPECT_EQ(detection.bbox.size(), 2u); - EXPECT_GE(detection.bbox[0].x, 0.0f); - EXPECT_GE(detection.bbox[0].y, 0.0f); - EXPECT_GE(detection.bbox[1].x, detection.bbox[0].x); - EXPECT_GE(detection.bbox[1].y, detection.bbox[0].y); + EXPECT_GE(detection.bbox.p1.x, 0.0f); + EXPECT_GE(detection.bbox.p1.y, 0.0f); + EXPECT_GE(detection.bbox.p2.x, detection.bbox.p1.x); + EXPECT_GE(detection.bbox.p2.y, detection.bbox.p1.y); } } @@ -181,12 +179,10 @@ TEST(VerticalOCRGenerateTests, JointCharsDetectionsHaveValidBBoxes) { auto results = model.generateFromString(kValidVerticalTestImagePath); for (const auto &detection : results) { - // Each bbox has 2 points: top-left [0] and bottom-right [1] - EXPECT_EQ(detection.bbox.size(), 2u); - EXPECT_GE(detection.bbox[0].x, 0.0f); - EXPECT_GE(detection.bbox[0].y, 0.0f); - EXPECT_GE(detection.bbox[1].x, detection.bbox[0].x); - EXPECT_GE(detection.bbox[1].y, detection.bbox[0].y); + EXPECT_GE(detection.bbox.p1.x, 0.0f); + EXPECT_GE(detection.bbox.p1.y, 0.0f); + EXPECT_GE(detection.bbox.p2.x, detection.bbox.p1.x); 
+ EXPECT_GE(detection.bbox.p2.y, detection.bbox.p1.y); } } diff --git a/packages/react-native-executorch/src/types/ocr.ts b/packages/react-native-executorch/src/types/ocr.ts index 16f9fbcff1..38879e6c9d 100644 --- a/packages/react-native-executorch/src/types/ocr.ts +++ b/packages/react-native-executorch/src/types/ocr.ts @@ -1,33 +1,22 @@ import { symbols } from '../constants/ocr/symbols'; import { RnExecutorchError } from '../errors/errorUtils'; import { Frame, PixelData, ResourceSource } from './common'; +import { Bbox } from './objectDetection'; /** * OCRDetection represents a single detected text instance in an image, * including its bounding box, recognized text, and confidence score. * @category Types - * @property {[Point, Point]} bbox - A tuple of two points defining the axis-aligned bounding box - * around the detected text: `bbox[0]` is the top-left corner and `bbox[1]` is the bottom-right corner. + * @property {Bbox} bbox - The axis-aligned bounding box around the detected text, with `x1`/`y1` as the top-left corner and `x2`/`y2` as the bottom-right corner. * @property {string} text - The recognized text within the bounding box. * @property {number} score - The confidence score of the OCR detection, ranging from 0 to 1. */ export interface OCRDetection { - bbox: [Point, Point]; + bbox: Bbox; text: string; score: number; } -/** - * Point represents a coordinate in 2D space. - * @category Types - * @property {number} x - The x-coordinate of the point. - * @property {number} y - The y-coordinate of the point. - */ -export interface Point { - x: number; - y: number; -} - /** * Configuration properties for the `useOCR` hook. 
* @category Types From 95a6c69a62f45a72dcb41485216042871c15422b Mon Sep 17 00:00:00 2001 From: chmjkb Date: Tue, 12 May 2026 13:19:54 +0200 Subject: [PATCH 4/4] fix: compare p1.y against 0.0f in BBox::isValid instead of its truthiness --- .../common/rnexecutorch/utils/computer_vision/Types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Types.h b/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Types.h index 943d0b1c91..7698d9807f 100644 --- a/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Types.h +++ b/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Types.h @@ -15,7 +15,7 @@ struct BBox { float area() const { return width() * height(); } bool isValid() const { - return p2.x > p1.x && p2.y > p1.y && p1.x >= 0.0f && p1.y; + return p2.x > p1.x && p2.y > p1.y && p1.x >= 0.0f && p1.y >= 0.0f; } BBox scale(float widthRatio, float heightRatio) const {