Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions apps/computer-vision/components/ImageWithOCRBboxes.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -59,13 +59,14 @@ export default function ImageWithOCRBboxes({
{detections.map((detection, index) => {
const { scaleX, scaleY, offsetX, offsetY } =
calculateAdjustedDimensions();
const points = detection.bbox.map((point) => ({
x: point.x * scaleX + offsetX,
y: point.y * scaleY + offsetY,
}));

const pointsString = points
.map((point) => `${point.x},${point.y}`)
const { x1, y1, x2, y2 } = detection.bbox;
const pointsString = [
[x1, y1],
[x2, y1],
[x2, y2],
[x1, y2],
]
.map(([x, y]) => `${x * scaleX + offsetX},${y * scaleY + offsetY}`)
.join(' ');

return (
Expand Down
14 changes: 10 additions & 4 deletions apps/computer-vision/components/vision_camera/tasks/OCRTask.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -110,11 +110,17 @@ export default function OCRTask({
style={StyleSheet.absoluteFill}
>
{detections.map((det, i) => {
const pts = det.bbox
.map((p) => `${p.x * scale + offsetX},${p.y * scale + offsetY}`)
const { x1, y1, x2, y2 } = det.bbox;
const pts = [
[x1, y1],
[x2, y1],
[x2, y2],
[x1, y2],
]
.map(([x, y]) => `${x * scale + offsetX},${y * scale + offsetY}`)
.join(' ');
const labelX = det.bbox[0]!.x * scale + offsetX;
const labelY = det.bbox[0]!.y * scale + offsetY - 4;
const labelX = x1 * scale + offsetX;
const labelY = y1 * scale + offsetY - 4;
return (
<React.Fragment key={i}>
<Polygon points={pts} fill="none" stroke="cyan" strokeWidth={2} />
Expand Down
12 changes: 7 additions & 5 deletions docs/docs/03-hooks/02-computer-vision/useOCR.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,19 +61,21 @@ See the full guide: [VisionCamera Integration](./visioncamera-integration.md).
The detection object is specified as follows:

```typescript
interface Bbox {
  x1: number;
  y1: number;
  x2: number;
  y2: number;
}

interface OCRDetection {
  bbox: Bbox;
  text: string;
  score: number;
}
```
Comment thread
chmjkb marked this conversation as resolved.

The `bbox` property contains the axis-aligned bounding box of the detected text region. `x1`/`y1` is the top-left corner and `x2`/`y2` is the bottom-right corner.
The `text` property contains the text recognized within the detected text region. The `score` represents the confidence score of the recognized text.

## Example
Expand Down
12 changes: 7 additions & 5 deletions docs/docs/03-hooks/02-computer-vision/useVerticalOCR.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,19 +69,21 @@ See the full guide: [VisionCamera Integration](./visioncamera-integration.md).
The detection object is specified as follows:

```typescript
interface Bbox {
  x1: number;
  y1: number;
  x2: number;
  y2: number;
}

interface OCRDetection {
  bbox: Bbox;
  text: string;
  score: number;
}
```

The `bbox` property contains the axis-aligned bounding box of the detected text region. `x1`/`y1` is the top-left corner and `x2`/`y2` is the bottom-right corner.
The `text` property contains the text recognized within the detected text region. The `score` represents the confidence score of the recognized text.

## Example
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -462,10 +462,10 @@ getJsiValue(const std::unordered_map<std::string_view, float> &map,
/// Converts an axis-aligned bounding box into a plain JS object with
/// numeric `x1`/`y1` (top-left) and `x2`/`y2` (bottom-right) properties.
/// @param bbox    box stored as two corner points (p1 = top-left, p2 = bottom-right)
/// @param runtime the JSI runtime used to allocate the result object
/// @return a `jsi::Object` with properties x1, y1, x2, y2
inline jsi::Value getJsiValue(const utils::computer_vision::BBox &bbox,
                              jsi::Runtime &runtime) {
  jsi::Object obj(runtime);
  // BBox stores corner points p1/p2; each JS property is set exactly once
  // (the stale bbox.x1-style accesses referenced members BBox no longer has).
  obj.setProperty(runtime, "x1", bbox.p1.x);
  obj.setProperty(runtime, "y1", bbox.p1.y);
  obj.setProperty(runtime, "x2", bbox.p2.x);
  obj.setProperty(runtime, "y2", bbox.p2.y);
  return obj;
}

Expand Down Expand Up @@ -526,16 +526,8 @@ getJsiValue(const std::vector<models::ocr::types::OCRDetection> &detections,

auto jsiDetectionObject = jsi::Object(runtime);

auto jsiBboxArray = jsi::Array(runtime, 4);
#pragma unroll
for (size_t j = 0; j < 4u; ++j) {
auto jsiPointObject = jsi::Object(runtime);
jsiPointObject.setProperty(runtime, "x", detection.bbox[j].x);
jsiPointObject.setProperty(runtime, "y", detection.bbox[j].y);
jsiBboxArray.setValueAtIndex(runtime, j, jsiPointObject);
}

jsiDetectionObject.setProperty(runtime, "bbox", jsiBboxArray);
jsiDetectionObject.setProperty(runtime, "bbox",
getJsiValue(detection.bbox, runtime));
jsiDetectionObject.setProperty(
runtime, "text", jsi::String::createFromUtf8(runtime, detection.text));
jsiDetectionObject.setProperty(runtime, "score", detection.score);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -161,10 +161,10 @@ cv::Rect BaseInstanceSegmentation::computeMaskCropRect(
const utils::computer_vision::BBox &bboxModel, cv::Size modelInputSize,
cv::Size maskSize) {

float mx1F = bboxModel.x1 * maskSize.width / modelInputSize.width;
float my1F = bboxModel.y1 * maskSize.height / modelInputSize.height;
float mx2F = bboxModel.x2 * maskSize.width / modelInputSize.width;
float my2F = bboxModel.y2 * maskSize.height / modelInputSize.height;
float mx1F = bboxModel.p1.x * maskSize.width / modelInputSize.width;
float my1F = bboxModel.p1.y * maskSize.height / modelInputSize.height;
float mx2F = bboxModel.p2.x * maskSize.width / modelInputSize.width;
float my2F = bboxModel.p2.y * maskSize.height / modelInputSize.height;

int32_t mx1 = std::max(0, static_cast<int32_t>(std::floor(mx1F)));
int32_t my1 = std::max(0, static_cast<int32_t>(std::floor(my1F)));
Expand Down Expand Up @@ -193,8 +193,8 @@ cv::Mat BaseInstanceSegmentation::warpToOriginalResolution(
float scaleY = static_cast<float>(originalSize.height) / maskSize.height;

cv::Mat M = (cv::Mat_<float>(2, 3) << scaleX, 0,
(maskRect.x * scaleX - bboxOriginal.x1), 0, scaleY,
(maskRect.y * scaleY - bboxOriginal.y1));
(maskRect.x * scaleX - bboxOriginal.p1.x), 0, scaleY,
(maskRect.y * scaleY - bboxOriginal.p1.y));

cv::Size bboxSize(static_cast<int32_t>(std::round(bboxOriginal.width())),
static_cast<int32_t>(std::round(bboxOriginal.height())));
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "OCR.h"
#include "Constants.h"
#include <algorithm>
#include <rnexecutorch/Error.h>
#include <rnexecutorch/ErrorCodes.h>
#include <rnexecutorch/data_processing/ImageProcessing.h>
Expand Down Expand Up @@ -67,8 +68,12 @@ OCR::generateFromFrame(jsi::Runtime &runtime, const jsi::Value &frameData) {
cv::Mat rotated = ::rnexecutorch::utils::rotateFrameForModel(bgr, orient);
auto detections = runInference(rotated);
for (auto &det : detections) {
::rnexecutorch::utils::inverseRotatePoints(det.bbox, orient,
rotated.size());
std::array<types::Point, 2> corners = {det.bbox.p1, det.bbox.p2};
::rnexecutorch::utils::inverseRotatePoints(corners, orient, rotated.size());
det.bbox = {{std::min(corners[0].x, corners[1].x),
std::min(corners[0].y, corners[1].y)},
{std::max(corners[0].x, corners[1].x),
std::max(corners[0].y, corners[1].y)}};
}
return detections;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,17 @@ void RecognitionHandler::processBBox(std::vector<types::OCRDetection> &boxList,
/*
Since the boxes were corresponding to the image resized to 1280x1280,
we want to return the boxes shifted and rescaled to match the original
image dimensions.
image dimensions. Compute the axis-aligned bounding box (AABB) from the
four rotated corners and store only the top-left and bottom-right points.
*/
for (auto &point : box.bbox) {
point.x = (point.x - ratioAndPadding.left) * ratioAndPadding.resizeRatio;
point.y = (point.y - ratioAndPadding.top) * ratioAndPadding.resizeRatio;
}
const float ratio = ratioAndPadding.resizeRatio;
const float padLeft = static_cast<float>(ratioAndPadding.left);
const float padTop = static_cast<float>(ratioAndPadding.top);
types::BBox transformedBbox{
{(box.bbox.p1.x - padLeft) * ratio, (box.bbox.p1.y - padTop) * ratio},
{(box.bbox.p2.x - padLeft) * ratio, (box.bbox.p2.y - padTop) * ratio}};
boxList.emplace_back(
box.bbox,
transformedBbox,
converter.decodeGreedy(predictionIndices, predictionIndices.size())[0],
confidenceScore);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,22 +1,19 @@
#pragma once

#include <array>
#include <rnexecutorch/utils/computer_vision/Types.h>
#include <string>
#include <vector>

namespace rnexecutorch::models::ocr::types {
struct Point {
float x;
float y;
};
using namespace rnexecutorch::utils::computer_vision;

// Raw model output pair of values and their indices.
// NOTE(review): presumably parallel arrays (values[i] corresponds to
// indices[i]) — confirm against the call sites that fill this struct.
struct ValuesAndIndices {
std::vector<float> values;
std::vector<int32_t> indices;
};

// Output of the text detector: an axis-aligned bounding box plus the
// rotation angle of the detected text region. The duplicate legacy
// `std::array<Point, 4>` member was removed; `BBox` (two corner points)
// is the representation used throughout the rest of the pipeline.
struct DetectorBBox {
  BBox bbox;   // axis-aligned box; p1 = top-left, p2 = bottom-right
  float angle; // rotation angle of the detected text region
};

Expand All @@ -27,7 +24,7 @@ struct PaddingInfo {
};

// A single OCR result: where the text was found, what was read, and how
// confident the recognizer is. The duplicate legacy four-point member was
// removed; `BBox` (two corner points) is the representation exposed to JS.
struct OCRDetection {
  BBox bbox;        // axis-aligned bounding box of the detected text region
  std::string text; // recognized text
  float score;      // confidence score of the recognized text
};
Expand Down
Loading
Loading