diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h index a20fd7b1bc..fac780ad50 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h @@ -526,9 +526,9 @@ getJsiValue(const std::vector &detections, auto jsiDetectionObject = jsi::Object(runtime); - auto jsiBboxArray = jsi::Array(runtime, 4); + auto jsiBboxArray = jsi::Array(runtime, 2); #pragma unroll - for (size_t j = 0; j < 4u; ++j) { + for (size_t j = 0; j < 2u; ++j) { auto jsiPointObject = jsi::Object(runtime); jsiPointObject.setProperty(runtime, "x", detection.bbox[j].x); jsiPointObject.setProperty(runtime, "y", detection.bbox[j].y); diff --git a/packages/react-native-executorch/common/rnexecutorch/models/ocr/OCR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/ocr/OCR.cpp index 3c08d16daa..60887e0f7b 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/ocr/OCR.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/ocr/OCR.cpp @@ -1,5 +1,6 @@ #include "OCR.h" #include "Constants.h" +#include #include #include #include @@ -69,6 +70,13 @@ OCR::generateFromFrame(jsi::Runtime &runtime, const jsi::Value &frameData) { for (auto &det : detections) { ::rnexecutorch::utils::inverseRotatePoints(det.bbox, orient, rotated.size()); + // Re-normalize to a proper AABB after the coordinate rotation. + float minX = std::min(det.bbox[0].x, det.bbox[1].x); + float minY = std::min(det.bbox[0].y, det.bbox[1].y); + float maxX = std::max(det.bbox[0].x, det.bbox[1].x); + float maxY = std::max(det.bbox[0].y, det.bbox[1].y); + det.bbox[0] = {minX, minY}; + det.bbox[1] = {maxX, maxY}; } return detections; } diff --git a/packages/react-native-executorch/common/rnexecutorch/models/ocr/RecognitionHandler.cpp b/packages/react-native-executorch/common/rnexecutorch/models/ocr/RecognitionHandler.cpp index dfde737655..258edf340e 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/ocr/RecognitionHandler.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/ocr/RecognitionHandler.cpp @@ -1,4 +1,6 @@ #include "RecognitionHandler.h" +#include +#include #include #include #include @@ -55,14 +57,24 @@ void RecognitionHandler::processBBox(std::vector &boxList, /* Since the boxes were corresponding to the image resized to 1280x1280, we want to return the boxes shifted and rescaled to match the original - image dimensions. + image dimensions. Compute the axis-aligned bounding box (AABB) from the + four rotated corners and store only the top-left and bottom-right points. */ - for (auto &point : box.bbox) { - point.x = (point.x - ratioAndPadding.left) * ratioAndPadding.resizeRatio; - point.y = (point.y - ratioAndPadding.top) * ratioAndPadding.resizeRatio; + float minX = std::numeric_limits::max(); + float minY = std::numeric_limits::max(); + float maxX = std::numeric_limits::lowest(); + float maxY = std::numeric_limits::lowest(); + for (const auto &point : box.bbox) { + float x = (point.x - ratioAndPadding.left) * ratioAndPadding.resizeRatio; + float y = (point.y - ratioAndPadding.top) * ratioAndPadding.resizeRatio; + minX = std::min(minX, x); + minY = std::min(minY, y); + maxX = std::max(maxX, x); + maxY = std::max(maxY, y); } boxList.emplace_back( - box.bbox, + std::array{types::Point{minX, minY}, + types::Point{maxX, maxY}}, converter.decodeGreedy(predictionIndices, predictionIndices.size())[0], confidenceScore); } diff --git a/packages/react-native-executorch/common/rnexecutorch/models/ocr/Types.h b/packages/react-native-executorch/common/rnexecutorch/models/ocr/Types.h index bb0a24aad1..af623a07b4 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/ocr/Types.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/ocr/Types.h @@ -27,7 +27,7 @@ struct PaddingInfo { }; struct OCRDetection { - std::array bbox; + std::array bbox; std::string text; float score; }; diff --git a/packages/react-native-executorch/common/rnexecutorch/models/vertical_ocr/VerticalOCR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/vertical_ocr/VerticalOCR.cpp index 88a027d01b..3b9fcbef2a 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/vertical_ocr/VerticalOCR.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/vertical_ocr/VerticalOCR.cpp @@ -1,4 +1,6 @@ #include "VerticalOCR.h" +#include +#include #include #include #include @@ -73,6 +75,13 @@ VerticalOCR::generateFromFrame(jsi::Runtime &runtime, for (auto &det : detections) { ::rnexecutorch::utils::inverseRotatePoints(det.bbox, orient, rotated.size()); + // Re-normalize to a proper AABB after the coordinate rotation. + float minX = std::min(det.bbox[0].x, det.bbox[1].x); + float minY = std::min(det.bbox[0].y, det.bbox[1].y); + float maxX = std::max(det.bbox[0].x, det.bbox[1].x); + float maxY = std::max(det.bbox[0].y, det.bbox[1].y); + det.bbox[0] = {minX, minY}; + det.bbox[1] = {maxX, maxY}; } return detections; } @@ -204,16 +213,24 @@ types::OCRDetection VerticalOCR::_processSingleTextBox( : _handleJointCharacters(box, originalImage, characterBoxes, paddingsBox, imagePaddings); } - // Modify the returned boxes to match the original image size - std::array finalBbox; + // Modify the returned boxes to match the original image size. Compute the + // axis-aligned bounding box (AABB) from the four rotated corners and store + // only the top-left and bottom-right points. + float minX = std::numeric_limits::max(); + float minY = std::numeric_limits::max(); + float maxX = std::numeric_limits::lowest(); + float maxY = std::numeric_limits::lowest(); for (size_t i = 0; i < box.bbox.size(); ++i) { - finalBbox[i].x = - (box.bbox[i].x - imagePaddings.left) * imagePaddings.resizeRatio; - finalBbox[i].y = - (box.bbox[i].y - imagePaddings.top) * imagePaddings.resizeRatio; + float x = (box.bbox[i].x - imagePaddings.left) * imagePaddings.resizeRatio; + float y = (box.bbox[i].y - imagePaddings.top) * imagePaddings.resizeRatio; + minX = std::min(minX, x); + minY = std::min(minY, y); + maxX = std::max(maxX, x); + maxY = std::max(maxY, y); } - return {finalBbox, text, confidenceScore}; + return {{types::Point{minX, minY}, types::Point{maxX, maxY}}, text, + confidenceScore}; } void VerticalOCR::unload() noexcept { diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/OCRTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/OCRTest.cpp index 072c761164..de995fa9f9 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/OCRTest.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/OCRTest.cpp @@ -100,12 +100,12 @@ TEST(OCRGenerateTests, DetectionsHaveValidBoundingBoxes) { auto results = model.generateFromString(kValidTestImagePath); for (const auto &detection : results) { - // Each bbox should have 4 points - EXPECT_EQ(detection.bbox.size(), 4u); - for (const auto &point : detection.bbox) { - EXPECT_GE(point.x, 0.0f); - EXPECT_GE(point.y, 0.0f); - } + // Each bbox has 2 points: top-left [0] and bottom-right [1] + EXPECT_EQ(detection.bbox.size(), 2u); + EXPECT_GE(detection.bbox[0].x, 0.0f); + EXPECT_GE(detection.bbox[0].y, 0.0f); + EXPECT_GE(detection.bbox[1].x, detection.bbox[0].x); + EXPECT_GE(detection.bbox[1].y, detection.bbox[0].y); } } diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/VerticalOCRTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/VerticalOCRTest.cpp index fd6d59441d..f409926b83 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/VerticalOCRTest.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/VerticalOCRTest.cpp @@ -117,11 +117,12 @@ TEST(VerticalOCRGenerateTests, IndependentCharsDetectionsHaveValidBBoxes) { auto results = model.generateFromString(kValidVerticalTestImagePath); for (const auto &detection : results) { - EXPECT_EQ(detection.bbox.size(), 4u); - for (const auto &point : detection.bbox) { - EXPECT_GE(point.x, 0.0f); - EXPECT_GE(point.y, 0.0f); - } + // Each bbox has 2 points: top-left [0] and bottom-right [1] + EXPECT_EQ(detection.bbox.size(), 2u); + EXPECT_GE(detection.bbox[0].x, 0.0f); + EXPECT_GE(detection.bbox[0].y, 0.0f); + EXPECT_GE(detection.bbox[1].x, detection.bbox[0].x); + EXPECT_GE(detection.bbox[1].y, detection.bbox[0].y); } } @@ -180,11 +181,12 @@ TEST(VerticalOCRGenerateTests, JointCharsDetectionsHaveValidBBoxes) { auto results = model.generateFromString(kValidVerticalTestImagePath); for (const auto &detection : results) { - EXPECT_EQ(detection.bbox.size(), 4u); - for (const auto &point : detection.bbox) { - EXPECT_GE(point.x, 0.0f); - EXPECT_GE(point.y, 0.0f); - } + // Each bbox has 2 points: top-left [0] and bottom-right [1] + EXPECT_EQ(detection.bbox.size(), 2u); + EXPECT_GE(detection.bbox[0].x, 0.0f); + EXPECT_GE(detection.bbox[0].y, 0.0f); + EXPECT_GE(detection.bbox[1].x, detection.bbox[0].x); + EXPECT_GE(detection.bbox[1].y, detection.bbox[0].y); } } diff --git a/packages/react-native-executorch/src/types/ocr.ts b/packages/react-native-executorch/src/types/ocr.ts index d2f3781095..16f9fbcff1 100644 --- a/packages/react-native-executorch/src/types/ocr.ts +++ b/packages/react-native-executorch/src/types/ocr.ts @@ -6,12 +6,13 @@ import { Frame, PixelData, ResourceSource } from './common'; * OCRDetection represents a single detected text instance in an image, * including its bounding box, recognized text, and confidence score. * @category Types - * @property {Point[]} bbox - An array of points defining the bounding box around the detected text. + * @property {[Point, Point]} bbox - A tuple of two points defining the axis-aligned bounding box + * around the detected text: `bbox[0]` is the top-left corner and `bbox[1]` is the bottom-right corner. * @property {string} text - The recognized text within the bounding box. * @property {number} score - The confidence score of the OCR detection, ranging from 0 to 1. */ export interface OCRDetection { - bbox: Point[]; + bbox: [Point, Point]; text: string; score: number; }