software-mansion · msluszniak · Mar 3, 2026 · May 7, 2026 · May 7, 2026
diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
@@ -526,9 +526,9 @@ getJsiValue(const std::vector<models::ocr::types::OCRDetection> &detections,
 
     auto jsiDetectionObject = jsi::Object(runtime);
 
-    auto jsiBboxArray = jsi::Array(runtime, 4);
+    auto jsiBboxArray = jsi::Array(runtime, 2);
 #pragma unroll
-    for (size_t j = 0; j < 4u; ++j) {
+    for (size_t j = 0; j < 2u; ++j) {
       auto jsiPointObject = jsi::Object(runtime);
       jsiPointObject.setProperty(runtime, "x", detection.bbox[j].x);
       jsiPointObject.setProperty(runtime, "y", detection.bbox[j].y);

diff --git a/packages/react-native-executorch/common/rnexecutorch/models/ocr/OCR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/ocr/OCR.cpp
@@ -1,5 +1,6 @@
 #include "OCR.h"
 #include "Constants.h"
+#include <algorithm>
 #include <rnexecutorch/Error.h>
 #include <rnexecutorch/ErrorCodes.h>
 #include <rnexecutorch/data_processing/ImageProcessing.h>
@@ -69,6 +70,13 @@ OCR::generateFromFrame(jsi::Runtime &runtime, const jsi::Value &frameData) {
   for (auto &det : detections) {
     ::rnexecutorch::utils::inverseRotatePoints(det.bbox, orient,
                                                rotated.size());
+    // Rotation can break the min/max corner ordering; re-normalize to an AABB.
+    float minX = std::min(det.bbox[0].x, det.bbox[1].x);
+    float minY = std::min(det.bbox[0].y, det.bbox[1].y);
+    float maxX = std::max(det.bbox[0].x, det.bbox[1].x);
+    float maxY = std::max(det.bbox[0].y, det.bbox[1].y);
+    det.bbox[0] = {minX, minY};
+    det.bbox[1] = {maxX, maxY};
   }
   return detections;
 }

diff --git a/packages/react-native-executorch/common/rnexecutorch/models/ocr/RecognitionHandler.cpp b/packages/react-native-executorch/common/rnexecutorch/models/ocr/RecognitionHandler.cpp
@@ -1,4 +1,6 @@
 #include "RecognitionHandler.h"
+#include <algorithm>
+#include <limits>
 #include <rnexecutorch/data_processing/ImageProcessing.h>
 #include <rnexecutorch/models/ocr/Constants.h>
 #include <rnexecutorch/models/ocr/utils/RecognitionHandlerUtils.h>
@@ -55,14 +57,24 @@ void RecognitionHandler::processBBox(std::vector<types::OCRDetection> &boxList,
   /*
     Since the boxes were corresponding to the image resized to 1280x1280,
     we want to return the boxes shifted and rescaled to match the original
-    image dimensions.
+    image dimensions. Compute the axis-aligned bounding box (AABB) from the
+    four rotated corners and store only the top-left and bottom-right points.
   */
-  for (auto &point : box.bbox) {
-    point.x = (point.x - ratioAndPadding.left) * ratioAndPadding.resizeRatio;
-    point.y = (point.y - ratioAndPadding.top) * ratioAndPadding.resizeRatio;
+  float minX = std::numeric_limits<float>::max();
+  float minY = std::numeric_limits<float>::max();
+  float maxX = std::numeric_limits<float>::lowest();
+  float maxY = std::numeric_limits<float>::lowest();
+  for (const auto &point : box.bbox) {
+    float x = (point.x - ratioAndPadding.left) * ratioAndPadding.resizeRatio;
+    float y = (point.y - ratioAndPadding.top) * ratioAndPadding.resizeRatio;
+    minX = std::min(minX, x);
+    minY = std::min(minY, y);
+    maxX = std::max(maxX, x);
+    maxY = std::max(maxY, y);
   }
   boxList.emplace_back(
-      box.bbox,
+      std::array<types::Point, 2>{types::Point{minX, minY},
+                                  types::Point{maxX, maxY}},
       converter.decodeGreedy(predictionIndices, predictionIndices.size())[0],
       confidenceScore);
 }

diff --git a/packages/react-native-executorch/common/rnexecutorch/models/ocr/Types.h b/packages/react-native-executorch/common/rnexecutorch/models/ocr/Types.h
@@ -27,7 +27,7 @@ struct PaddingInfo {
 };
 
 struct OCRDetection {
-  std::array<types::Point, 4> bbox;
+  std::array<types::Point, 2> bbox; // AABB: [0] top-left, [1] bottom-right
   std::string text;
   float score;
 };

diff --git a/packages/react-native-executorch/common/rnexecutorch/models/vertical_ocr/VerticalOCR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/vertical_ocr/VerticalOCR.cpp
@@ -1,4 +1,6 @@
 #include "VerticalOCR.h"
+#include <algorithm>
+#include <limits>
 #include <rnexecutorch/Error.h>
 #include <rnexecutorch/ErrorCodes.h>
 #include <rnexecutorch/data_processing/ImageProcessing.h>
@@ -73,6 +75,13 @@ VerticalOCR::generateFromFrame(jsi::Runtime &runtime,
   for (auto &det : detections) {
     ::rnexecutorch::utils::inverseRotatePoints(det.bbox, orient,
                                                rotated.size());
+    // Rotation can break the min/max corner ordering; re-normalize to an AABB.
+    float minX = std::min(det.bbox[0].x, det.bbox[1].x);
+    float minY = std::min(det.bbox[0].y, det.bbox[1].y);
+    float maxX = std::max(det.bbox[0].x, det.bbox[1].x);
+    float maxY = std::max(det.bbox[0].y, det.bbox[1].y);
+    det.bbox[0] = {minX, minY};
+    det.bbox[1] = {maxX, maxY};
   }
   return detections;
 }
@@ -204,16 +213,24 @@ types::OCRDetection VerticalOCR::_processSingleTextBox(
             : _handleJointCharacters(box, originalImage, characterBoxes,
                                      paddingsBox, imagePaddings);
   }
-  // Modify the returned boxes to match the original image size
-  std::array<types::Point, 4> finalBbox;
+  // Modify the returned boxes to match the original image size. Compute the
+  // axis-aligned bounding box (AABB) from the four rotated corners and store
+  // only the top-left and bottom-right points.
+  float minX = std::numeric_limits<float>::max();
+  float minY = std::numeric_limits<float>::max();
+  float maxX = std::numeric_limits<float>::lowest();
+  float maxY = std::numeric_limits<float>::lowest();
   for (size_t i = 0; i < box.bbox.size(); ++i) {
-    finalBbox[i].x =
-        (box.bbox[i].x - imagePaddings.left) * imagePaddings.resizeRatio;
-    finalBbox[i].y =
-        (box.bbox[i].y - imagePaddings.top) * imagePaddings.resizeRatio;
+    float x = (box.bbox[i].x - imagePaddings.left) * imagePaddings.resizeRatio;
+    float y = (box.bbox[i].y - imagePaddings.top) * imagePaddings.resizeRatio;
+    minX = std::min(minX, x);
+    minY = std::min(minY, y);
+    maxX = std::max(maxX, x);
+    maxY = std::max(maxY, y);
   }
 
-  return {finalBbox, text, confidenceScore};
+  return {{types::Point{minX, minY}, types::Point{maxX, maxY}}, text,
+          confidenceScore};
 }
 
 void VerticalOCR::unload() noexcept {

diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/OCRTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/OCRTest.cpp
@@ -100,12 +100,12 @@ TEST(OCRGenerateTests, DetectionsHaveValidBoundingBoxes) {
   auto results = model.generateFromString(kValidTestImagePath);
 
   for (const auto &detection : results) {
-    // Each bbox should have 4 points
-    EXPECT_EQ(detection.bbox.size(), 4u);
-    for (const auto &point : detection.bbox) {
-      EXPECT_GE(point.x, 0.0f);
-      EXPECT_GE(point.y, 0.0f);
-    }
+    // Each bbox has 2 points: top-left [0] and bottom-right [1]
+    EXPECT_EQ(detection.bbox.size(), 2u);
+    EXPECT_GE(detection.bbox[0].x, 0.0f);
+    EXPECT_GE(detection.bbox[0].y, 0.0f);
+    EXPECT_GE(detection.bbox[1].x, detection.bbox[0].x);
+    EXPECT_GE(detection.bbox[1].y, detection.bbox[0].y);
   }
 }
 

diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/VerticalOCRTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/VerticalOCRTest.cpp
@@ -117,11 +117,12 @@ TEST(VerticalOCRGenerateTests, IndependentCharsDetectionsHaveValidBBoxes) {
   auto results = model.generateFromString(kValidVerticalTestImagePath);
 
   for (const auto &detection : results) {
-    EXPECT_EQ(detection.bbox.size(), 4u);
-    for (const auto &point : detection.bbox) {
-      EXPECT_GE(point.x, 0.0f);
-      EXPECT_GE(point.y, 0.0f);
-    }
+    // Each bbox has 2 points: top-left [0] and bottom-right [1]
+    EXPECT_EQ(detection.bbox.size(), 2u);
+    EXPECT_GE(detection.bbox[0].x, 0.0f);
+    EXPECT_GE(detection.bbox[0].y, 0.0f);
+    EXPECT_GE(detection.bbox[1].x, detection.bbox[0].x);
+    EXPECT_GE(detection.bbox[1].y, detection.bbox[0].y);
   }
 }
 
@@ -180,11 +181,12 @@ TEST(VerticalOCRGenerateTests, JointCharsDetectionsHaveValidBBoxes) {
   auto results = model.generateFromString(kValidVerticalTestImagePath);
 
   for (const auto &detection : results) {
-    EXPECT_EQ(detection.bbox.size(), 4u);
-    for (const auto &point : detection.bbox) {
-      EXPECT_GE(point.x, 0.0f);
-      EXPECT_GE(point.y, 0.0f);
-    }
+    // Each bbox has 2 points: top-left [0] and bottom-right [1]
+    EXPECT_EQ(detection.bbox.size(), 2u);
+    EXPECT_GE(detection.bbox[0].x, 0.0f);
+    EXPECT_GE(detection.bbox[0].y, 0.0f);
+    EXPECT_GE(detection.bbox[1].x, detection.bbox[0].x);
+    EXPECT_GE(detection.bbox[1].y, detection.bbox[0].y);
   }
 }
 

diff --git a/packages/react-native-executorch/src/types/ocr.ts b/packages/react-native-executorch/src/types/ocr.ts
@@ -6,12 +6,13 @@ import { Frame, PixelData, ResourceSource } from './common';
  * OCRDetection represents a single detected text instance in an image,
  * including its bounding box, recognized text, and confidence score.
  * @category Types
- * @property {Point[]} bbox - An array of points defining the bounding box around the detected text.
+ * @property {[Point, Point]} bbox - A tuple of two points defining the axis-aligned bounding box
+ *   around the detected text: `bbox[0]` is the top-left corner and `bbox[1]` is the bottom-right corner.
  * @property {string} text - The recognized text within the bounding box.
  * @property {number} score - The confidence score of the OCR detection, ranging from 0 to 1.
  */
 export interface OCRDetection {
-  bbox: Point[];
+  bbox: [Point, Point];
   text: string;
   score: number;
 }