From 4e2ff289941c046feac470fd139b1b7f2312e321 Mon Sep 17 00:00:00 2001 From: chmjkb Date: Thu, 7 May 2026 16:13:48 +0200 Subject: [PATCH 1/4] refactor: unify existing codebase to use Point and BBox struct across the vision stack --- .../host_objects/JsiConversions.h | 8 +- .../BaseInstanceSegmentation.cpp | 12 +- .../models/ocr/RecognitionHandler.cpp | 18 +- .../common/rnexecutorch/models/ocr/Types.h | 8 +- .../models/ocr/utils/DetectorUtils.cpp | 195 +++++------------- .../ocr/utils/RecognitionHandlerUtils.cpp | 12 +- .../models/ocr/utils/RecognizerUtils.cpp | 35 ++-- .../models/ocr/utils/RecognizerUtils.h | 18 +- .../models/pose_estimation/Types.h | 10 +- .../models/vertical_ocr/VerticalOCR.cpp | 25 ++- .../rnexecutorch/utils/FrameTransform.cpp | 40 ++-- .../utils/computer_vision/Processing.cpp | 9 +- .../utils/computer_vision/Types.h | 20 +- 13 files changed, 159 insertions(+), 251 deletions(-) diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h index a20fd7b1bc..247780b6f5 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h @@ -462,10 +462,10 @@ getJsiValue(const std::unordered_map &map, inline jsi::Value getJsiValue(const utils::computer_vision::BBox &bbox, jsi::Runtime &runtime) { jsi::Object obj(runtime); - obj.setProperty(runtime, "x1", bbox.x1); - obj.setProperty(runtime, "y1", bbox.y1); - obj.setProperty(runtime, "x2", bbox.x2); - obj.setProperty(runtime, "y2", bbox.y2); + obj.setProperty(runtime, "x1", bbox.p1.x); + obj.setProperty(runtime, "y1", bbox.p1.y); + obj.setProperty(runtime, "x2", bbox.p2.x); + obj.setProperty(runtime, "y2", bbox.p2.y); return obj; } diff --git a/packages/react-native-executorch/common/rnexecutorch/models/instance_segmentation/BaseInstanceSegmentation.cpp 
b/packages/react-native-executorch/common/rnexecutorch/models/instance_segmentation/BaseInstanceSegmentation.cpp index 3d2f9d1715..776f8edd20 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/instance_segmentation/BaseInstanceSegmentation.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/instance_segmentation/BaseInstanceSegmentation.cpp @@ -161,10 +161,10 @@ cv::Rect BaseInstanceSegmentation::computeMaskCropRect( const utils::computer_vision::BBox &bboxModel, cv::Size modelInputSize, cv::Size maskSize) { - float mx1F = bboxModel.x1 * maskSize.width / modelInputSize.width; - float my1F = bboxModel.y1 * maskSize.height / modelInputSize.height; - float mx2F = bboxModel.x2 * maskSize.width / modelInputSize.width; - float my2F = bboxModel.y2 * maskSize.height / modelInputSize.height; + float mx1F = bboxModel.p1.x * maskSize.width / modelInputSize.width; + float my1F = bboxModel.p1.y * maskSize.height / modelInputSize.height; + float mx2F = bboxModel.p2.x * maskSize.width / modelInputSize.width; + float my2F = bboxModel.p2.y * maskSize.height / modelInputSize.height; int32_t mx1 = std::max(0, static_cast(std::floor(mx1F))); int32_t my1 = std::max(0, static_cast(std::floor(my1F))); @@ -193,8 +193,8 @@ cv::Mat BaseInstanceSegmentation::warpToOriginalResolution( float scaleY = static_cast(originalSize.height) / maskSize.height; cv::Mat M = (cv::Mat_(2, 3) << scaleX, 0, - (maskRect.x * scaleX - bboxOriginal.x1), 0, scaleY, - (maskRect.y * scaleY - bboxOriginal.y1)); + (maskRect.x * scaleX - bboxOriginal.p1.x), 0, scaleY, + (maskRect.y * scaleY - bboxOriginal.p1.y)); cv::Size bboxSize(static_cast(std::round(bboxOriginal.width())), static_cast(std::round(bboxOriginal.height()))); diff --git a/packages/react-native-executorch/common/rnexecutorch/models/ocr/RecognitionHandler.cpp b/packages/react-native-executorch/common/rnexecutorch/models/ocr/RecognitionHandler.cpp index dfde737655..4725154a86 100644 --- 
a/packages/react-native-executorch/common/rnexecutorch/models/ocr/RecognitionHandler.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/ocr/RecognitionHandler.cpp @@ -57,12 +57,20 @@ void RecognitionHandler::processBBox(std::vector &boxList, we want to return the boxes shifted and rescaled to match the original image dimensions. */ - for (auto &point : box.bbox) { - point.x = (point.x - ratioAndPadding.left) * ratioAndPadding.resizeRatio; - point.y = (point.y - ratioAndPadding.top) * ratioAndPadding.resizeRatio; - } + const float ratio = ratioAndPadding.resizeRatio; + const float padLeft = static_cast(ratioAndPadding.left); + const float padTop = static_cast(ratioAndPadding.top); + auto tx = [&](types::Point p) -> types::Point { + return {(p.x - padLeft) * ratio, (p.y - padTop) * ratio}; + }; + std::array corners = { + tx(box.bbox.p1), + tx({box.bbox.p2.x, box.bbox.p1.y}), + tx(box.bbox.p2), + tx({box.bbox.p1.x, box.bbox.p2.y}), + }; boxList.emplace_back( - box.bbox, + corners, converter.decodeGreedy(predictionIndices, predictionIndices.size())[0], confidenceScore); } diff --git a/packages/react-native-executorch/common/rnexecutorch/models/ocr/Types.h b/packages/react-native-executorch/common/rnexecutorch/models/ocr/Types.h index bb0a24aad1..664efa400c 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/ocr/Types.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/ocr/Types.h @@ -1,14 +1,12 @@ #pragma once #include +#include #include #include namespace rnexecutorch::models::ocr::types { -struct Point { - float x; - float y; -}; +using namespace rnexecutorch::utils::computer_vision; struct ValuesAndIndices { std::vector values; @@ -16,7 +14,7 @@ struct ValuesAndIndices { }; struct DetectorBBox { - std::array bbox; + BBox bbox; float angle; }; diff --git a/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/DetectorUtils.cpp 
b/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/DetectorUtils.cpp index 7614e97a1f..e398201d28 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/DetectorUtils.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/DetectorUtils.cpp @@ -8,23 +8,8 @@ #include namespace rnexecutorch::models::ocr::utils { -std::array -cvPointsFromPoints(const std::array &points) { - std::array cvPoints; -#pragma unroll - for (std::size_t i = 0; i < cvPoints.size(); ++i) { - cvPoints[i] = cv::Point2f(points[i].x, points[i].y); - } - return cvPoints; -} - -std::array pointsFromCvPoints(cv::Point2f cvPoints[4]) { - std::array points; -#pragma unroll - for (std::size_t i = 0; i < points.size(); ++i) { - points[i] = {.x = cvPoints[i].x, .y = cvPoints[i].y}; - } - return points; +static std::array bboxToCorners(const types::BBox &bbox) { + return {bbox.p1, {bbox.p2.x, bbox.p1.y}, bbox.p2, {bbox.p1.x, bbox.p2.y}}; } std::pair interleavedArrayToMats(std::span data, @@ -99,8 +84,16 @@ extractMinAreaBBoxFromContour(const std::vector contour) { std::array vertices; minRect.points(vertices.data()); - std::array points = pointsFromCvPoints(vertices.data()); - return {.bbox = points, .angle = minRect.angle}; + float minX = + std::min({vertices[0].x, vertices[1].x, vertices[2].x, vertices[3].x}); + float minY = + std::min({vertices[0].y, vertices[1].y, vertices[2].y, vertices[3].y}); + float maxX = + std::max({vertices[0].x, vertices[1].x, vertices[2].x, vertices[3].x}); + float maxY = + std::max({vertices[0].y, vertices[1].y, vertices[2].y, vertices[3].y}); + types::BBox bbox = {{minX, minY}, {maxX, maxY}}; + return {.bbox = bbox, .angle = minRect.angle}; } void getBoxFromContour(cv::Mat &segMap, @@ -296,10 +289,7 @@ float calculateRestoreRatio(int32_t currentSize, int32_t desiredSize) { void restoreBboxRatio(std::vector &boxes, float restoreRatio) { for (auto &box : boxes) { - for (auto &point : box.bbox) { - point.x 
*= restoreRatio; - point.y *= restoreRatio; - } + box.bbox = box.bbox.scale(restoreRatio, restoreRatio); } } @@ -318,36 +308,16 @@ types::Point midpointBetweenPoint(const types::Point &p1, return {.x = std::midpoint(p1.x, p2.x), .y = std::midpoint(p1.y, p2.y)}; } -types::Point centerOfBox(const std::array &box) { - return midpointBetweenPoint(box[0], box[2]); +types::Point centerOfBox(const types::BBox &box) { + return midpointBetweenPoint(box.p1, box.p2); } -// function for both; finding maximal side length and minimal side length -template -float findExtremeSideLength(const std::array &points, - Compare comp) { - float extremeLength = distanceFromPoint(points[0], points[1]); - -#pragma unroll - for (std::size_t i = 1; i < points.size(); i++) { - const auto ¤tPoint = points[i]; - const auto &nextPoint = points[(i + 1) % points.size()]; - const float sideLength = distanceFromPoint(currentPoint, nextPoint); - - if (comp(sideLength, extremeLength)) { - extremeLength = sideLength; - } - } - - return extremeLength; +float minSideLength(const types::BBox &bbox) { + return std::min(bbox.width(), bbox.height()); } -float minSideLength(const std::array &points) { - return findExtremeSideLength(points, std::less{}); -} - -float maxSideLength(const std::array &points) { - return findExtremeSideLength(points, std::greater{}); +float maxSideLength(const types::BBox &bbox) { + return std::max(bbox.width(), bbox.height()); } /** @@ -366,8 +336,8 @@ float maxSideLength(const std::array &points) { * - a bool indicating whether the line is * considered vertical. 
*/ -std::tuple -fitLineToShortestSides(const std::array &points) { +std::tuple fitLineToShortestSides(const types::BBox &bbox) { + const std::array points = bboxToCorners(bbox); std::array, 4> sides; std::array midpoints; #pragma unroll @@ -414,37 +384,35 @@ fitLineToShortestSides(const std::array &points) { return {m, c, isVertical}; } -std::array rotateBox(const std::array &box, - float angle) { - const types::Point center = centerOfBox(box); - +types::BBox rotateBox(const types::BBox &bbox, float angle) { + const types::Point center = centerOfBox(bbox); const float radians = angle * M_PI / 180.0f; - std::array rotatedPoints; - for (std::size_t i = 0; i < box.size(); ++i) { - const types::Point &point = box[i]; - const float translatedX = point.x - center.x; - const float translatedY = point.y - center.y; - - const float rotatedX = - translatedX * std::cos(radians) - translatedY * std::sin(radians); - const float rotatedY = - translatedX * std::sin(radians) + translatedY * std::cos(radians); - - rotatedPoints[i] = {.x = rotatedX + center.x, .y = rotatedY + center.y}; + float minX = std::numeric_limits::max(); + float minY = std::numeric_limits::max(); + float maxX = std::numeric_limits::lowest(); + float maxY = std::numeric_limits::lowest(); + + for (const auto &p : bboxToCorners(bbox)) { + const float tx = p.x - center.x; + const float ty = p.y - center.y; + const float rx = tx * std::cos(radians) - ty * std::sin(radians) + center.x; + const float ry = tx * std::sin(radians) + ty * std::cos(radians) + center.y; + minX = std::min(minX, rx); + minY = std::min(minY, ry); + maxX = std::max(maxX, rx); + maxY = std::max(maxY, ry); } - return rotatedPoints; + return {{minX, minY}, {maxX, maxY}}; } -float calculateMinimalDistanceBetweenBox( - const std::array &box1, - const std::array &box2) { +float calculateMinimalDistanceBetweenBox(const types::BBox &box1, + const types::BBox &box2) { float minDistance = std::numeric_limits::max(); - for (const types::Point &corner1 
: box1) { - for (const types::Point &corner2 : box2) { - const float distance = distanceFromPoint(corner1, corner2); - minDistance = std::min(distance, minDistance); + for (const auto &c1 : bboxToCorners(box1)) { + for (const auto &c2 : bboxToCorners(box2)) { + minDistance = std::min(minDistance, distanceFromPoint(c1, c2)); } } return minDistance; @@ -466,66 +434,15 @@ float calculateMinimalDistanceBetweenBox( * 4. The points are ordered starting from the top-left in a clockwise manner: * top-left, top-right, bottom-right, bottom-left. */ -std::array -orderPointsClockwise(const std::array &points) { - types::Point topLeft, topRight, bottomRight, bottomLeft; - float minSum = std::numeric_limits::max(); - float maxSum = std::numeric_limits::lowest(); - float minDiff = std::numeric_limits::max(); - float maxDiff = std::numeric_limits::lowest(); - - for (const auto &pt : points) { - const float sum = pt.x + pt.y; - const float diff = pt.y - pt.x; - - if (sum < minSum) { - minSum = sum; - topLeft = pt; - } - if (sum > maxSum) { - maxSum = sum; - bottomRight = pt; - } - if (diff < minDiff) { - minDiff = diff; - topRight = pt; - } - if (diff > maxDiff) { - maxDiff = diff; - bottomLeft = pt; - } - } - - return {topLeft, topRight, bottomRight, bottomLeft}; +types::BBox orderPointsClockwise(const types::BBox &bbox) { + return {{std::min(bbox.p1.x, bbox.p2.x), std::min(bbox.p1.y, bbox.p2.y)}, + {std::max(bbox.p1.x, bbox.p2.x), std::max(bbox.p1.y, bbox.p2.y)}}; } -std::array -mergeRotatedBoxes(std::array &box1, - std::array &box2) { - box1 = orderPointsClockwise(box1); - box2 = orderPointsClockwise(box2); - - auto points1 = cvPointsFromPoints(box1); - auto points2 = cvPointsFromPoints(box2); - - std::array allPoints; - std::copy(points1.begin(), points1.end(), allPoints.begin()); - std::copy(points2.begin(), points2.end(), allPoints.begin() + points1.size()); - - std::vector hullIndices; - cv::convexHull(allPoints, hullIndices, false); - - std::vector hullPoints; - for 
(int32_t idx : hullIndices) { - hullPoints.push_back(allPoints[idx]); - } - - cv::RotatedRect minAreaRect = cv::minAreaRect(hullPoints); - - std::array rectPoints; - minAreaRect.points(rectPoints.data()); - - return pointsFromCvPoints(rectPoints.data()); +types::BBox mergeRotatedBoxes(const types::BBox &box1, + const types::BBox &box2) { + return {{std::min(box1.p1.x, box2.p1.x), std::min(box1.p1.y, box2.p1.y)}, + {std::max(box1.p2.x, box2.p2.x), std::max(box1.p2.y, box2.p2.y)}}; } /** @@ -555,8 +472,8 @@ mergeRotatedBoxes(std::array &box1, std::optional> findClosestBox(const std::vector &boxes, const std::unordered_set &ignoredIdxs, - const std::array ¤tBox, bool isVertical, - float m, float c, float centerThreshold) { + const types::BBox ¤tBox, bool isVertical, float m, float c, + float centerThreshold) { float smallestDistance = std::numeric_limits::max(); ssize_t idx = -1; float boxHeight = 0.0f; @@ -566,7 +483,7 @@ findClosestBox(const std::vector &boxes, if (ignoredIdxs.contains(i)) { continue; } - std::array bbox = boxes[i].bbox; + const types::BBox &bbox = boxes[i].bbox; const types::Point centerOfProcessedBox = centerOfBox(bbox); const float distanceBetweenCenters = distanceFromPoint(centerOfCurrentBox, centerOfProcessedBox); @@ -616,11 +533,7 @@ removeSmallBoxesFromArray(const std::vector &boxes, return filteredBoxes; } -static float minimumYFromBox(const std::array &box) { - return std::ranges::min_element( - box, [](types::Point a, types::Point b) { return a.y < b.y; }) - ->y; -} +static float minimumYFromBox(const types::BBox &bbox) { return bbox.p1.y; } std::vector groupTextBoxes(std::vector &boxes, float centerThreshold, diff --git a/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/RecognitionHandlerUtils.cpp b/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/RecognitionHandlerUtils.cpp index 41c3f78187..0e50f1c038 100644 --- 
a/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/RecognitionHandlerUtils.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/RecognitionHandlerUtils.cpp @@ -42,12 +42,12 @@ void computeRatioAndResize(cv::Mat &img, cv::Size size, int32_t modelHeight) { cv::Mat cropImage(types::DetectorBBox box, cv::Mat &image, int32_t modelHeight) { - // Convert custom points to cv::Point2f - std::array points; -#pragma unroll - for (std::size_t i = 0; i < points.size(); ++i) { - points[i] = cv::Point2f(box.bbox[i].x, box.bbox[i].y); - } + const std::array points = {{ + {box.bbox.p1.x, box.bbox.p1.y}, + {box.bbox.p2.x, box.bbox.p1.y}, + {box.bbox.p2.x, box.bbox.p2.y}, + {box.bbox.p1.x, box.bbox.p2.y}, + }}; cv::RotatedRect rotatedRect = cv::minAreaRect(points); cv::Point2f rectPoints[4]; diff --git a/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/RecognizerUtils.cpp b/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/RecognizerUtils.cpp index 89a84fa676..e959739ab1 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/RecognizerUtils.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/RecognizerUtils.cpp @@ -1,8 +1,11 @@ #include "RecognizerUtils.h" #include #include +#include namespace rnexecutorch::models::ocr::utils { +using namespace rnexecutorch::utils::computer_vision; + cv::Mat softmax(const cv::Mat &inputs) { cv::Mat maxVal; cv::reduce(inputs, maxVal, 1, cv::REDUCE_MAX, CV_32F); @@ -80,9 +83,10 @@ float confidenceScore(const std::vector &values, return std::pow(product, exponent); } -cv::Rect extractBoundingBox(std::array &points) { - cv::Mat pointsMat(4, 1, CV_32FC2, points.data()); - return cv::boundingRect(pointsMat); +cv::Rect extractBoundingBox(const BBox &bbox) { + return cv::Rect(static_cast(bbox.p1.x), static_cast(bbox.p1.y), + static_cast(bbox.width()), + static_cast(bbox.height())); } cv::Mat characterBitMask(const 
cv::Mat &img) { @@ -157,22 +161,22 @@ cv::Mat characterBitMask(const cv::Mat &img) { return resultImage; } -cv::Mat -cropImageWithBoundingBox(const cv::Mat &img, - const std::array &bbox, - const std::array &originalBbox, - const types::PaddingInfo &paddings, - const types::PaddingInfo &originalPaddings) { - if (originalBbox.empty()) { +cv::Mat cropImageWithBoundingBox(const cv::Mat &img, const BBox &bbox, + const BBox &originalBbox, + const types::PaddingInfo &paddings, + const types::PaddingInfo &originalPaddings) { + if (!originalBbox.isValid()) { throw RnExecutorchError(RnExecutorchErrorCode::UnknownError, "Original bounding box cannot be empty."); } - const types::Point topLeft = originalBbox[0]; + const types::Point topLeft = originalBbox.p1; + const std::array bboxCorners = { + bbox.p1, {bbox.p2.x, bbox.p1.y}, bbox.p2, {bbox.p1.x, bbox.p2.y}}; std::vector points; - points.reserve(bbox.size()); + points.reserve(4); - for (const auto &point : bbox) { + for (const auto &point : bboxCorners) { types::Point transformedPoint = point; transformedPoint.x -= paddings.left; @@ -202,9 +206,8 @@ cropImageWithBoundingBox(const cv::Mat &img, return croppedImage; } -cv::Mat prepareForRecognition(const cv::Mat &originalImage, - const std::array &bbox, - const std::array &originalBbox, +cv::Mat prepareForRecognition(const cv::Mat &originalImage, const BBox &bbox, + const BBox &originalBbox, const types::PaddingInfo &paddings, const types::PaddingInfo &originalPaddings) { auto croppedChar = cropImageWithBoundingBox(originalImage, bbox, originalBbox, diff --git a/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/RecognizerUtils.h b/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/RecognizerUtils.h index 71e3a9c25e..d693193386 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/RecognizerUtils.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/ocr/utils/RecognizerUtils.h @@ -1,6 +1,5 @@ 
#pragma once -#include #include #include #include @@ -21,7 +20,7 @@ cv::Mat softmax(const cv::Mat &inputs); types::ValuesAndIndices findMaxValuesIndices(const cv::Mat &mat); std::vector sumProbabilityRows(const cv::Mat &matrix); void divideMatrixByRows(cv::Mat &matrix, const std::vector &rowSums); -cv::Rect extractBoundingBox(std::array &points); +cv::Rect extractBoundingBox(const types::BBox &bbox); /** * @brief Computes confidence score for given values and indices vectors. @@ -43,12 +42,10 @@ cv::Mat characterBitMask(const cv::Mat &img); * with internal bounding box and padding. * It does so to preserve the best possible image quality. */ -cv::Mat -cropImageWithBoundingBox(const cv::Mat &img, - const std::array &bbox, - const std::array &originalBbox, - const types::PaddingInfo &paddings, - const types::PaddingInfo &originalPaddings); +cv::Mat cropImageWithBoundingBox(const cv::Mat &img, const types::BBox &bbox, + const types::BBox &originalBbox, + const types::PaddingInfo &paddings, + const types::PaddingInfo &originalPaddings); /** * @brief Perform cropping, resizing and convert to grayscale to prepare image @@ -62,10 +59,9 @@ cropImageWithBoundingBox(const cv::Mat &img, * * @details it utilizes cropImageWithBoundingBox to perform specific cropping. 
*/ - cv::Mat prepareForRecognition(const cv::Mat &originalImage, - const std::array &bbox, - const std::array &originalBbox, + const types::BBox &bbox, + const types::BBox &originalBbox, const types::PaddingInfo &paddings, const types::PaddingInfo &originalPaddings); } // namespace rnexecutorch::models::ocr::utils diff --git a/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/Types.h b/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/Types.h index 7d671ab7bb..6a7cdc2e88 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/Types.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/Types.h @@ -1,18 +1,12 @@ #pragma once -#include +#include #include namespace rnexecutorch::models::pose_estimation { -// Single keypoint (x, y) -struct Keypoint { - int32_t x; - int32_t y; -}; - // N keypoints for one person, depending on the model in question -using PersonKeypoints = std::vector; +using PersonKeypoints = std::vector; // N people for each image using PoseDetections = std::vector; diff --git a/packages/react-native-executorch/common/rnexecutorch/models/vertical_ocr/VerticalOCR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/vertical_ocr/VerticalOCR.cpp index 88a027d01b..6338b5f25b 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/vertical_ocr/VerticalOCR.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/vertical_ocr/VerticalOCR.cpp @@ -187,10 +187,8 @@ types::OCRDetection VerticalOCR::_processSingleTextBox( float confidenceScore = 0.0; if (!characterBoxes.empty()) { // Prepare information useful for proper boxes shifting and image cropping. 
- const int32_t boxWidth = - static_cast(box.bbox[2].x - box.bbox[0].x); - const int32_t boxHeight = - static_cast(box.bbox[2].y - box.bbox[0].y); + const int32_t boxWidth = static_cast(box.bbox.width()); + const int32_t boxHeight = static_cast(box.bbox.height()); cv::Size narrowRecognizerSize = detector.calculateModelImageSize(constants::kSmallDetectorWidth); types::PaddingInfo paddingsBox = utils::calculateResizeRatioAndPaddings( @@ -205,13 +203,18 @@ types::OCRDetection VerticalOCR::_processSingleTextBox( paddingsBox, imagePaddings); } // Modify the returned boxes to match the original image size - std::array finalBbox; - for (size_t i = 0; i < box.bbox.size(); ++i) { - finalBbox[i].x = - (box.bbox[i].x - imagePaddings.left) * imagePaddings.resizeRatio; - finalBbox[i].y = - (box.bbox[i].y - imagePaddings.top) * imagePaddings.resizeRatio; - } + const float ratio = imagePaddings.resizeRatio; + const float padLeft = static_cast(imagePaddings.left); + const float padTop = static_cast(imagePaddings.top); + auto tx = [&](types::Point p) -> types::Point { + return {(p.x - padLeft) * ratio, (p.y - padTop) * ratio}; + }; + std::array finalBbox = { + tx(box.bbox.p1), + tx({box.bbox.p2.x, box.bbox.p1.y}), + tx(box.bbox.p2), + tx({box.bbox.p1.x, box.bbox.p2.y}), + }; return {finalBbox, text, confidenceScore}; } diff --git a/packages/react-native-executorch/common/rnexecutorch/utils/FrameTransform.cpp b/packages/react-native-executorch/common/rnexecutorch/utils/FrameTransform.cpp index 80425c2dab..e9cf1e9d73 100644 --- a/packages/react-native-executorch/common/rnexecutorch/utils/FrameTransform.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/utils/FrameTransform.cpp @@ -48,23 +48,19 @@ void inverseRotateBbox(computer_vision::BBox &bbox, switch (orient.orientation) { case Orientation::Up: { // landscape-left → portrait: nx = h - y, ny = x - float nx1 = h - bbox.y2, ny1 = bbox.x1; - float nx2 = h - bbox.y1, ny2 = bbox.x2; - bbox.x1 = nx1; - bbox.y1 = ny1; - 
bbox.x2 = nx2; - bbox.y2 = ny2; + float nx1 = h - bbox.p2.y, ny1 = bbox.p1.x; + float nx2 = h - bbox.p1.y, ny2 = bbox.p2.x; + bbox.p1 = {nx1, ny1}; + bbox.p2 = {nx2, ny2}; break; } case Orientation::Right: { #if defined(__APPLE__) // iOS upside-down portrait → portrait: nx = w - x, ny = h - y - float nx1 = w - bbox.x2, ny1 = h - bbox.y2; - float nx2 = w - bbox.x1, ny2 = h - bbox.y1; - bbox.x1 = nx1; - bbox.y1 = ny1; - bbox.x2 = nx2; - bbox.y2 = ny2; + float nx1 = w - bbox.p2.x, ny1 = h - bbox.p2.y; + float nx2 = w - bbox.p1.x, ny2 = h - bbox.p1.y; + bbox.p1 = {nx1, ny1}; + bbox.p2 = {nx2, ny2}; #endif // Android front-cam upright portrait: rotated frame already in screen // space, no inverse needed. @@ -72,12 +68,10 @@ void inverseRotateBbox(computer_vision::BBox &bbox, } case Orientation::Down: { // landscape-right → portrait: nx = y, ny = w - x - float nx1 = bbox.y1, ny1 = w - bbox.x2; - float nx2 = bbox.y2, ny2 = w - bbox.x1; - bbox.x1 = nx1; - bbox.y1 = ny1; - bbox.x2 = nx2; - bbox.y2 = ny2; + float nx1 = bbox.p1.y, ny1 = w - bbox.p2.x; + float nx2 = bbox.p2.y, ny2 = w - bbox.p1.x; + bbox.p1 = {nx1, ny1}; + bbox.p2 = {nx2, ny2}; break; } case Orientation::Left: @@ -93,12 +87,10 @@ void inverseRotateBbox(computer_vision::BBox &bbox, orient.orientation == Orientation::Down); float sw = swapped ? h : w; float sh = swapped ? 
w : h; - float nx1 = sw - bbox.x2, ny1 = sh - bbox.y2; - float nx2 = sw - bbox.x1, ny2 = sh - bbox.y1; - bbox.x1 = nx1; - bbox.y1 = ny1; - bbox.x2 = nx2; - bbox.y2 = ny2; + float nx1 = sw - bbox.p2.x, ny1 = sh - bbox.p2.y; + float nx2 = sw - bbox.p1.x, ny2 = sh - bbox.p1.y; + bbox.p1 = {nx1, ny1}; + bbox.p2 = {nx2, ny2}; } #endif } diff --git a/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Processing.cpp b/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Processing.cpp index 108fd6ff8a..8ced125ecf 100644 --- a/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Processing.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Processing.cpp @@ -1,14 +1,13 @@ #include "Processing.h" #include -#include namespace rnexecutorch::utils::computer_vision { float computeIoU(const BBox &a, const BBox &b) { - float x1 = std::max(a.x1, b.x1); - float y1 = std::max(a.y1, b.y1); - float x2 = std::min(a.x2, b.x2); - float y2 = std::min(a.y2, b.y2); + float x1 = std::max(a.p1.x, b.p1.x); + float y1 = std::max(a.p1.y, b.p1.y); + float x2 = std::min(a.p2.x, b.p2.x); + float y2 = std::min(a.p2.y, b.p2.y); float intersectionArea = std::max(0.0f, x2 - x1) * std::max(0.0f, y2 - y1); float areaA = a.area(); diff --git a/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Types.h b/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Types.h index 8899d3b87c..943d0b1c91 100644 --- a/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Types.h +++ b/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Types.h @@ -4,24 +4,26 @@ namespace rnexecutorch::utils::computer_vision { -struct BBox { - - float width() const { return x2 - x1; } - - float height() const { return y2 - y1; } +struct Point { + float x; + float y; +}; +struct BBox { + float width() const { return p2.x - p1.x; } + float height() 
const { return p2.y - p1.y; } float area() const { return width() * height(); } bool isValid() const { - return x2 > x1 && y2 > y1 && x1 >= 0.0f && y1 >= 0.0f; + return p2.x > p1.x && p2.y > p1.y && p1.x >= 0.0f && p1.y; } BBox scale(float widthRatio, float heightRatio) const { - return {x1 * widthRatio, y1 * heightRatio, x2 * widthRatio, - y2 * heightRatio}; + return {{p1.x * widthRatio, p1.y * heightRatio}, + {p2.x * widthRatio, p2.y * heightRatio}}; } - float x1, y1, x2, y2; + Point p1, p2; }; template From 91c483ee8347b820c59fc17d2b0d51668dab6d99 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 7 May 2026 19:07:23 +0000 Subject: [PATCH 2/4] refactor(ocr): reduce OCR bounding boxes from 4 vertices to 2-point AABB Resolves #760. The OCR and VerticalOCR pipelines previously exposed all four rotated-rectangle corners in OCRDetection.bbox. Two points (top-left and bottom-right of the axis-aligned bounding box) are sufficient for downstream rendering and are simpler to consume. Changes: - Types.h: shrink OCRDetection.bbox from std::array to std::array - RecognitionHandler.cpp: compute AABB (min/max x,y) over the four detector corners instead of forwarding them verbatim - VerticalOCR.cpp: same AABB reduction in _processSingleTextBox - OCR.cpp / VerticalOCR.cpp generateFromFrame: re-normalize the two bbox corners after inverseRotatePoints to guarantee bbox[0] <= bbox[1] - JsiConversions.h: serialize 2 points instead of 4 to JavaScript - OCRTest.cpp / VerticalOCRTest.cpp: assert size==2 and that bbox[1] >= bbox[0] - ocr.ts: narrow TypeScript type from Point[] to [Point,Point] and update docs --- .../host_objects/JsiConversions.h | 4 +-- .../common/rnexecutorch/models/ocr/OCR.cpp | 8 ++++++ .../models/ocr/RecognitionHandler.cpp | 17 +++++------- .../common/rnexecutorch/models/ocr/Types.h | 2 +- .../models/vertical_ocr/VerticalOCR.cpp | 27 +++++++++---------- .../tests/integration/OCRTest.cpp | 12 ++++----- .../tests/integration/VerticalOCRTest.cpp | 22 ++++++++------- 
.../react-native-executorch/src/types/ocr.ts | 5 ++-- 8 files changed, 51 insertions(+), 46 deletions(-) diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h index 247780b6f5..9c6642a6cd 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h @@ -526,9 +526,9 @@ getJsiValue(const std::vector &detections, auto jsiDetectionObject = jsi::Object(runtime); - auto jsiBboxArray = jsi::Array(runtime, 4); + auto jsiBboxArray = jsi::Array(runtime, 2); #pragma unroll - for (size_t j = 0; j < 4u; ++j) { + for (size_t j = 0; j < 2u; ++j) { auto jsiPointObject = jsi::Object(runtime); jsiPointObject.setProperty(runtime, "x", detection.bbox[j].x); jsiPointObject.setProperty(runtime, "y", detection.bbox[j].y); diff --git a/packages/react-native-executorch/common/rnexecutorch/models/ocr/OCR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/ocr/OCR.cpp index 3c08d16daa..60887e0f7b 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/ocr/OCR.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/ocr/OCR.cpp @@ -1,5 +1,6 @@ #include "OCR.h" #include "Constants.h" +#include #include #include #include @@ -69,6 +70,13 @@ OCR::generateFromFrame(jsi::Runtime &runtime, const jsi::Value &frameData) { for (auto &det : detections) { ::rnexecutorch::utils::inverseRotatePoints(det.bbox, orient, rotated.size()); + // Re-normalize to a proper AABB after the coordinate rotation. 
+ float minX = std::min(det.bbox[0].x, det.bbox[1].x); + float minY = std::min(det.bbox[0].y, det.bbox[1].y); + float maxX = std::max(det.bbox[0].x, det.bbox[1].x); + float maxY = std::max(det.bbox[0].y, det.bbox[1].y); + det.bbox[0] = {minX, minY}; + det.bbox[1] = {maxX, maxY}; } return detections; } diff --git a/packages/react-native-executorch/common/rnexecutorch/models/ocr/RecognitionHandler.cpp b/packages/react-native-executorch/common/rnexecutorch/models/ocr/RecognitionHandler.cpp index 4725154a86..e70e46ab9c 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/ocr/RecognitionHandler.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/ocr/RecognitionHandler.cpp @@ -55,22 +55,17 @@ void RecognitionHandler::processBBox(std::vector &boxList, /* Since the boxes were corresponding to the image resized to 1280x1280, we want to return the boxes shifted and rescaled to match the original - image dimensions. + image dimensions. Compute the axis-aligned bounding box (AABB) from the + four rotated corners and store only the top-left and bottom-right points. 
*/ const float ratio = ratioAndPadding.resizeRatio; const float padLeft = static_cast(ratioAndPadding.left); const float padTop = static_cast(ratioAndPadding.top); - auto tx = [&](types::Point p) -> types::Point { - return {(p.x - padLeft) * ratio, (p.y - padTop) * ratio}; - }; - std::array corners = { - tx(box.bbox.p1), - tx({box.bbox.p2.x, box.bbox.p1.y}), - tx(box.bbox.p2), - tx({box.bbox.p1.x, box.bbox.p2.y}), - }; + types::BBox transformedBbox{ + {(box.bbox.p1.x - padLeft) * ratio, (box.bbox.p1.y - padTop) * ratio}, + {(box.bbox.p2.x - padLeft) * ratio, (box.bbox.p2.y - padTop) * ratio}}; boxList.emplace_back( - corners, + transformedBbox, converter.decodeGreedy(predictionIndices, predictionIndices.size())[0], confidenceScore); } diff --git a/packages/react-native-executorch/common/rnexecutorch/models/ocr/Types.h b/packages/react-native-executorch/common/rnexecutorch/models/ocr/Types.h index 664efa400c..e3d38c33bb 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/ocr/Types.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/ocr/Types.h @@ -25,7 +25,7 @@ struct PaddingInfo { }; struct OCRDetection { - std::array bbox; + std::array bbox; std::string text; float score; }; diff --git a/packages/react-native-executorch/common/rnexecutorch/models/vertical_ocr/VerticalOCR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/vertical_ocr/VerticalOCR.cpp index 6338b5f25b..c0a531ecd3 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/vertical_ocr/VerticalOCR.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/vertical_ocr/VerticalOCR.cpp @@ -1,4 +1,5 @@ #include "VerticalOCR.h" +#include #include #include #include @@ -71,8 +72,12 @@ VerticalOCR::generateFromFrame(jsi::Runtime &runtime, cv::Mat rotated = ::rnexecutorch::utils::rotateFrameForModel(bgr, orient); auto detections = runInference(rotated); for (auto &det : detections) { - 
::rnexecutorch::utils::inverseRotatePoints(det.bbox, orient, - rotated.size()); + std::array corners = {det.bbox.p1, det.bbox.p2}; + ::rnexecutorch::utils::inverseRotatePoints(corners, orient, rotated.size()); + det.bbox = {{std::min(corners[0].x, corners[1].x), + std::min(corners[0].y, corners[1].y)}, + {std::max(corners[0].x, corners[1].x), + std::max(corners[0].y, corners[1].y)}}; } return detections; } @@ -202,21 +207,15 @@ types::OCRDetection VerticalOCR::_processSingleTextBox( : _handleJointCharacters(box, originalImage, characterBoxes, paddingsBox, imagePaddings); } - // Modify the returned boxes to match the original image size + // Modify the returned boxes to match the original image size. const float ratio = imagePaddings.resizeRatio; const float padLeft = static_cast(imagePaddings.left); const float padTop = static_cast(imagePaddings.top); - auto tx = [&](types::Point p) -> types::Point { - return {(p.x - padLeft) * ratio, (p.y - padTop) * ratio}; - }; - std::array finalBbox = { - tx(box.bbox.p1), - tx({box.bbox.p2.x, box.bbox.p1.y}), - tx(box.bbox.p2), - tx({box.bbox.p1.x, box.bbox.p2.y}), - }; - - return {finalBbox, text, confidenceScore}; + types::BBox transformedBbox{ + {(box.bbox.p1.x - padLeft) * ratio, (box.bbox.p1.y - padTop) * ratio}, + {(box.bbox.p2.x - padLeft) * ratio, (box.bbox.p2.y - padTop) * ratio}}; + + return {transformedBbox, text, confidenceScore}; } void VerticalOCR::unload() noexcept { diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/OCRTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/OCRTest.cpp index 072c761164..de995fa9f9 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/OCRTest.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/OCRTest.cpp @@ -100,12 +100,12 @@ TEST(OCRGenerateTests, DetectionsHaveValidBoundingBoxes) { auto results = model.generateFromString(kValidTestImagePath); for (const auto 
&detection : results) { - // Each bbox should have 4 points - EXPECT_EQ(detection.bbox.size(), 4u); - for (const auto &point : detection.bbox) { - EXPECT_GE(point.x, 0.0f); - EXPECT_GE(point.y, 0.0f); - } + // Each bbox has 2 points: top-left [0] and bottom-right [1] + EXPECT_EQ(detection.bbox.size(), 2u); + EXPECT_GE(detection.bbox[0].x, 0.0f); + EXPECT_GE(detection.bbox[0].y, 0.0f); + EXPECT_GE(detection.bbox[1].x, detection.bbox[0].x); + EXPECT_GE(detection.bbox[1].y, detection.bbox[0].y); } } diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/VerticalOCRTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/VerticalOCRTest.cpp index fd6d59441d..f409926b83 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/VerticalOCRTest.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/VerticalOCRTest.cpp @@ -117,11 +117,12 @@ TEST(VerticalOCRGenerateTests, IndependentCharsDetectionsHaveValidBBoxes) { auto results = model.generateFromString(kValidVerticalTestImagePath); for (const auto &detection : results) { - EXPECT_EQ(detection.bbox.size(), 4u); - for (const auto &point : detection.bbox) { - EXPECT_GE(point.x, 0.0f); - EXPECT_GE(point.y, 0.0f); - } + // Each bbox has 2 points: top-left [0] and bottom-right [1] + EXPECT_EQ(detection.bbox.size(), 2u); + EXPECT_GE(detection.bbox[0].x, 0.0f); + EXPECT_GE(detection.bbox[0].y, 0.0f); + EXPECT_GE(detection.bbox[1].x, detection.bbox[0].x); + EXPECT_GE(detection.bbox[1].y, detection.bbox[0].y); } } @@ -180,11 +181,12 @@ TEST(VerticalOCRGenerateTests, JointCharsDetectionsHaveValidBBoxes) { auto results = model.generateFromString(kValidVerticalTestImagePath); for (const auto &detection : results) { - EXPECT_EQ(detection.bbox.size(), 4u); - for (const auto &point : detection.bbox) { - EXPECT_GE(point.x, 0.0f); - EXPECT_GE(point.y, 0.0f); - } + // Each bbox has 2 points: top-left [0] and bottom-right [1] + 
EXPECT_EQ(detection.bbox.size(), 2u); + EXPECT_GE(detection.bbox[0].x, 0.0f); + EXPECT_GE(detection.bbox[0].y, 0.0f); + EXPECT_GE(detection.bbox[1].x, detection.bbox[0].x); + EXPECT_GE(detection.bbox[1].y, detection.bbox[0].y); } } diff --git a/packages/react-native-executorch/src/types/ocr.ts b/packages/react-native-executorch/src/types/ocr.ts index d2f3781095..16f9fbcff1 100644 --- a/packages/react-native-executorch/src/types/ocr.ts +++ b/packages/react-native-executorch/src/types/ocr.ts @@ -6,12 +6,13 @@ import { Frame, PixelData, ResourceSource } from './common'; * OCRDetection represents a single detected text instance in an image, * including its bounding box, recognized text, and confidence score. * @category Types - * @property {Point[]} bbox - An array of points defining the bounding box around the detected text. + * @property {[Point, Point]} bbox - A tuple of two points defining the axis-aligned bounding box + * around the detected text: `bbox[0]` is the top-left corner and `bbox[1]` is the bottom-right corner. * @property {string} text - The recognized text within the bounding box. * @property {number} score - The confidence score of the OCR detection, ranging from 0 to 1. 
*/ export interface OCRDetection { - bbox: Point[]; + bbox: [Point, Point]; text: string; score: number; } From c4fcf545f2b5eb9cc75c90306179b32ec8f3c16c Mon Sep 17 00:00:00 2001 From: chmjkb Date: Fri, 8 May 2026 14:13:21 +0200 Subject: [PATCH 3/4] chore: change TS types --- .../components/ImageWithOCRBboxes.tsx | 15 +++++++------- .../vision_camera/tasks/OCRTask.tsx | 14 +++++++++---- .../03-hooks/02-computer-vision/useOCR.md | 12 ++++++----- .../02-computer-vision/useVerticalOCR.md | 12 ++++++----- .../host_objects/JsiConversions.h | 12 ++--------- .../common/rnexecutorch/models/ocr/OCR.cpp | 15 ++++++-------- .../common/rnexecutorch/models/ocr/Types.h | 3 +-- .../tests/integration/OCRTest.cpp | 10 ++++------ .../tests/integration/VerticalOCRTest.cpp | 20 ++++++++----------- .../react-native-executorch/src/types/ocr.ts | 17 +++------------- 10 files changed, 56 insertions(+), 74 deletions(-) diff --git a/apps/computer-vision/components/ImageWithOCRBboxes.tsx b/apps/computer-vision/components/ImageWithOCRBboxes.tsx index 1c8fe616af..eb1f9acbf6 100644 --- a/apps/computer-vision/components/ImageWithOCRBboxes.tsx +++ b/apps/computer-vision/components/ImageWithOCRBboxes.tsx @@ -59,13 +59,14 @@ export default function ImageWithOCRBboxes({ {detections.map((detection, index) => { const { scaleX, scaleY, offsetX, offsetY } = calculateAdjustedDimensions(); - const points = detection.bbox.map((point) => ({ - x: point.x * scaleX + offsetX, - y: point.y * scaleY + offsetY, - })); - - const pointsString = points - .map((point) => `${point.x},${point.y}`) + const { x1, y1, x2, y2 } = detection.bbox; + const pointsString = [ + [x1, y1], + [x2, y1], + [x2, y2], + [x1, y2], + ] + .map(([x, y]) => `${x * scaleX + offsetX},${y * scaleY + offsetY}`) .join(' '); return ( diff --git a/apps/computer-vision/components/vision_camera/tasks/OCRTask.tsx b/apps/computer-vision/components/vision_camera/tasks/OCRTask.tsx index fbdb1148e0..3dfa874b43 100644 --- 
a/apps/computer-vision/components/vision_camera/tasks/OCRTask.tsx +++ b/apps/computer-vision/components/vision_camera/tasks/OCRTask.tsx @@ -110,11 +110,17 @@ export default function OCRTask({ style={StyleSheet.absoluteFill} > {detections.map((det, i) => { - const pts = det.bbox - .map((p) => `${p.x * scale + offsetX},${p.y * scale + offsetY}`) + const { x1, y1, x2, y2 } = det.bbox; + const pts = [ + [x1, y1], + [x2, y1], + [x2, y2], + [x1, y2], + ] + .map(([x, y]) => `${x * scale + offsetX},${y * scale + offsetY}`) .join(' '); - const labelX = det.bbox[0]!.x * scale + offsetX; - const labelY = det.bbox[0]!.y * scale + offsetY - 4; + const labelX = x1 * scale + offsetX; + const labelY = y1 * scale + offsetY - 4; return ( diff --git a/docs/docs/03-hooks/02-computer-vision/useOCR.md b/docs/docs/03-hooks/02-computer-vision/useOCR.md index 7754992700..8f015e77a5 100644 --- a/docs/docs/03-hooks/02-computer-vision/useOCR.md +++ b/docs/docs/03-hooks/02-computer-vision/useOCR.md @@ -61,19 +61,21 @@ See the full guide: [VisionCamera Integration](./visioncamera-integration.md). The detection object is specified as follows: ```typescript -interface Point { - x: number; - y: number; +interface Bbox { + x1: number; + y1: number; + x2: number; + y2: number; } interface OCRDetection { - bbox: Point[]; + bbox: Bbox; text: string; score: number; } ``` -The `bbox` property contains information about the bounding box of detected text regions. It is represented as four points, which are corners of detected bounding box. +The `bbox` property contains the axis-aligned bounding box of the detected text region. `x1`/`y1` is the top-left corner and `x2`/`y2` is the bottom-right corner. The `text` property contains the text recognized within detected text region. The `score` represents the confidence score of the recognized text. 
## Example diff --git a/docs/docs/03-hooks/02-computer-vision/useVerticalOCR.md b/docs/docs/03-hooks/02-computer-vision/useVerticalOCR.md index d65c97cf25..ff27b55e17 100644 --- a/docs/docs/03-hooks/02-computer-vision/useVerticalOCR.md +++ b/docs/docs/03-hooks/02-computer-vision/useVerticalOCR.md @@ -69,19 +69,21 @@ See the full guide: [VisionCamera Integration](./visioncamera-integration.md). The detection object is specified as follows: ```typescript -interface Point { - x: number; - y: number; +interface Bbox { + x1: number; + y1: number; + x2: number; + y2: number; } interface OCRDetection { - bbox: Point[]; + bbox: Bbox; text: string; score: number; } ``` -The `bbox` property contains information about the bounding box of detected text regions. It is represented as four points, which are corners of detected bounding box. +The `bbox` property contains the axis-aligned bounding box of the detected text region. `x1`/`y1` is the top-left corner and `x2`/`y2` is the bottom-right corner. The `text` property contains the text recognized within detected text region. The `score` represents the confidence score of the recognized text. 
## Example diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h index 9c6642a6cd..077d426c8f 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h @@ -526,16 +526,8 @@ getJsiValue(const std::vector &detections, auto jsiDetectionObject = jsi::Object(runtime); - auto jsiBboxArray = jsi::Array(runtime, 2); -#pragma unroll - for (size_t j = 0; j < 2u; ++j) { - auto jsiPointObject = jsi::Object(runtime); - jsiPointObject.setProperty(runtime, "x", detection.bbox[j].x); - jsiPointObject.setProperty(runtime, "y", detection.bbox[j].y); - jsiBboxArray.setValueAtIndex(runtime, j, jsiPointObject); - } - - jsiDetectionObject.setProperty(runtime, "bbox", jsiBboxArray); + jsiDetectionObject.setProperty(runtime, "bbox", + getJsiValue(detection.bbox, runtime)); jsiDetectionObject.setProperty( runtime, "text", jsi::String::createFromUtf8(runtime, detection.text)); jsiDetectionObject.setProperty(runtime, "score", detection.score); diff --git a/packages/react-native-executorch/common/rnexecutorch/models/ocr/OCR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/ocr/OCR.cpp index 60887e0f7b..d3e6964a05 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/ocr/OCR.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/ocr/OCR.cpp @@ -68,15 +68,12 @@ OCR::generateFromFrame(jsi::Runtime &runtime, const jsi::Value &frameData) { cv::Mat rotated = ::rnexecutorch::utils::rotateFrameForModel(bgr, orient); auto detections = runInference(rotated); for (auto &det : detections) { - ::rnexecutorch::utils::inverseRotatePoints(det.bbox, orient, - rotated.size()); - // Re-normalize to a proper AABB after the coordinate rotation. 
- float minX = std::min(det.bbox[0].x, det.bbox[1].x); - float minY = std::min(det.bbox[0].y, det.bbox[1].y); - float maxX = std::max(det.bbox[0].x, det.bbox[1].x); - float maxY = std::max(det.bbox[0].y, det.bbox[1].y); - det.bbox[0] = {minX, minY}; - det.bbox[1] = {maxX, maxY}; + std::array corners = {det.bbox.p1, det.bbox.p2}; + ::rnexecutorch::utils::inverseRotatePoints(corners, orient, rotated.size()); + det.bbox = {{std::min(corners[0].x, corners[1].x), + std::min(corners[0].y, corners[1].y)}, + {std::max(corners[0].x, corners[1].x), + std::max(corners[0].y, corners[1].y)}}; } return detections; } diff --git a/packages/react-native-executorch/common/rnexecutorch/models/ocr/Types.h b/packages/react-native-executorch/common/rnexecutorch/models/ocr/Types.h index e3d38c33bb..8e711d382c 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/ocr/Types.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/ocr/Types.h @@ -1,6 +1,5 @@ #pragma once -#include #include #include #include @@ -25,7 +24,7 @@ struct PaddingInfo { }; struct OCRDetection { - std::array bbox; + BBox bbox; std::string text; float score; }; diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/OCRTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/OCRTest.cpp index de995fa9f9..a97e4c2121 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/OCRTest.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/OCRTest.cpp @@ -100,12 +100,10 @@ TEST(OCRGenerateTests, DetectionsHaveValidBoundingBoxes) { auto results = model.generateFromString(kValidTestImagePath); for (const auto &detection : results) { - // Each bbox has 2 points: top-left [0] and bottom-right [1] - EXPECT_EQ(detection.bbox.size(), 2u); - EXPECT_GE(detection.bbox[0].x, 0.0f); - EXPECT_GE(detection.bbox[0].y, 0.0f); - EXPECT_GE(detection.bbox[1].x, detection.bbox[0].x); - 
EXPECT_GE(detection.bbox[1].y, detection.bbox[0].y); + EXPECT_GE(detection.bbox.p1.x, 0.0f); + EXPECT_GE(detection.bbox.p1.y, 0.0f); + EXPECT_GE(detection.bbox.p2.x, detection.bbox.p1.x); + EXPECT_GE(detection.bbox.p2.y, detection.bbox.p1.y); } } diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/VerticalOCRTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/VerticalOCRTest.cpp index f409926b83..c92abc0f15 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/VerticalOCRTest.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/VerticalOCRTest.cpp @@ -117,12 +117,10 @@ TEST(VerticalOCRGenerateTests, IndependentCharsDetectionsHaveValidBBoxes) { auto results = model.generateFromString(kValidVerticalTestImagePath); for (const auto &detection : results) { - // Each bbox has 2 points: top-left [0] and bottom-right [1] - EXPECT_EQ(detection.bbox.size(), 2u); - EXPECT_GE(detection.bbox[0].x, 0.0f); - EXPECT_GE(detection.bbox[0].y, 0.0f); - EXPECT_GE(detection.bbox[1].x, detection.bbox[0].x); - EXPECT_GE(detection.bbox[1].y, detection.bbox[0].y); + EXPECT_GE(detection.bbox.p1.x, 0.0f); + EXPECT_GE(detection.bbox.p1.y, 0.0f); + EXPECT_GE(detection.bbox.p2.x, detection.bbox.p1.x); + EXPECT_GE(detection.bbox.p2.y, detection.bbox.p1.y); } } @@ -181,12 +179,10 @@ TEST(VerticalOCRGenerateTests, JointCharsDetectionsHaveValidBBoxes) { auto results = model.generateFromString(kValidVerticalTestImagePath); for (const auto &detection : results) { - // Each bbox has 2 points: top-left [0] and bottom-right [1] - EXPECT_EQ(detection.bbox.size(), 2u); - EXPECT_GE(detection.bbox[0].x, 0.0f); - EXPECT_GE(detection.bbox[0].y, 0.0f); - EXPECT_GE(detection.bbox[1].x, detection.bbox[0].x); - EXPECT_GE(detection.bbox[1].y, detection.bbox[0].y); + EXPECT_GE(detection.bbox.p1.x, 0.0f); + EXPECT_GE(detection.bbox.p1.y, 0.0f); + EXPECT_GE(detection.bbox.p2.x, detection.bbox.p1.x); 
+ EXPECT_GE(detection.bbox.p2.y, detection.bbox.p1.y); } } diff --git a/packages/react-native-executorch/src/types/ocr.ts b/packages/react-native-executorch/src/types/ocr.ts index 16f9fbcff1..38879e6c9d 100644 --- a/packages/react-native-executorch/src/types/ocr.ts +++ b/packages/react-native-executorch/src/types/ocr.ts @@ -1,33 +1,22 @@ import { symbols } from '../constants/ocr/symbols'; import { RnExecutorchError } from '../errors/errorUtils'; import { Frame, PixelData, ResourceSource } from './common'; +import { Bbox } from './objectDetection'; /** * OCRDetection represents a single detected text instance in an image, * including its bounding box, recognized text, and confidence score. * @category Types - * @property {[Point, Point]} bbox - A tuple of two points defining the axis-aligned bounding box - * around the detected text: `bbox[0]` is the top-left corner and `bbox[1]` is the bottom-right corner. + * @property {Bbox} bbox - The axis-aligned bounding box around the detected text, with `x1`/`y1` as the top-left corner and `x2`/`y2` as the bottom-right corner. * @property {string} text - The recognized text within the bounding box. * @property {number} score - The confidence score of the OCR detection, ranging from 0 to 1. */ export interface OCRDetection { - bbox: [Point, Point]; + bbox: Bbox; text: string; score: number; } -/** - * Point represents a coordinate in 2D space. - * @category Types - * @property {number} x - The x-coordinate of the point. - * @property {number} y - The y-coordinate of the point. - */ -export interface Point { - x: number; - y: number; -} - /** * Configuration properties for the `useOCR` hook. 
* @category Types From 95a6c69a62f45a72dcb41485216042871c15422b Mon Sep 17 00:00:00 2001 From: chmjkb Date: Tue, 12 May 2026 13:19:54 +0200 Subject: [PATCH 4/4] fix: compare p1.y against 0.0f in BBox::isValid instead of its truthiness --- .../common/rnexecutorch/utils/computer_vision/Types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Types.h b/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Types.h index 943d0b1c91..7698d9807f 100644 --- a/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Types.h +++ b/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Types.h @@ -15,7 +15,7 @@ struct BBox { float area() const { return width() * height(); } bool isValid() const { - return p2.x > p1.x && p2.y > p1.y && p1.x >= 0.0f && p1.y; + return p2.x > p1.x && p2.y > p1.y && p1.x >= 0.0f && p1.y >= 0.0f; } BBox scale(float widthRatio, float heightRatio) const {