Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -526,9 +526,9 @@ getJsiValue(const std::vector<models::ocr::types::OCRDetection> &detections,

auto jsiDetectionObject = jsi::Object(runtime);

auto jsiBboxArray = jsi::Array(runtime, 4);
auto jsiBboxArray = jsi::Array(runtime, 2);
#pragma unroll
for (size_t j = 0; j < 4u; ++j) {
for (size_t j = 0; j < 2u; ++j) {
auto jsiPointObject = jsi::Object(runtime);
jsiPointObject.setProperty(runtime, "x", detection.bbox[j].x);
jsiPointObject.setProperty(runtime, "y", detection.bbox[j].y);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "OCR.h"
#include "Constants.h"
#include <algorithm>
#include <rnexecutorch/Error.h>
#include <rnexecutorch/ErrorCodes.h>
#include <rnexecutorch/data_processing/ImageProcessing.h>
Expand Down Expand Up @@ -69,6 +70,13 @@ OCR::generateFromFrame(jsi::Runtime &runtime, const jsi::Value &frameData) {
for (auto &det : detections) {
::rnexecutorch::utils::inverseRotatePoints(det.bbox, orient,
rotated.size());
// Re-normalize to a proper AABB after the coordinate rotation.
float minX = std::min(det.bbox[0].x, det.bbox[1].x);
float minY = std::min(det.bbox[0].y, det.bbox[1].y);
float maxX = std::max(det.bbox[0].x, det.bbox[1].x);
float maxY = std::max(det.bbox[0].y, det.bbox[1].y);
det.bbox[0] = {minX, minY};
det.bbox[1] = {maxX, maxY};
}
return detections;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#include "RecognitionHandler.h"
#include <algorithm>
#include <limits>
#include <rnexecutorch/data_processing/ImageProcessing.h>
#include <rnexecutorch/models/ocr/Constants.h>
#include <rnexecutorch/models/ocr/utils/RecognitionHandlerUtils.h>
Expand Down Expand Up @@ -55,14 +57,24 @@ void RecognitionHandler::processBBox(std::vector<types::OCRDetection> &boxList,
/*
Since the boxes correspond to the image resized to 1280x1280,
we want to return the boxes shifted and rescaled to match the original
image dimensions.
image dimensions. Compute the axis-aligned bounding box (AABB) from the
four rotated corners and store only the top-left and bottom-right points.
*/
for (auto &point : box.bbox) {
point.x = (point.x - ratioAndPadding.left) * ratioAndPadding.resizeRatio;
point.y = (point.y - ratioAndPadding.top) * ratioAndPadding.resizeRatio;
float minX = std::numeric_limits<float>::max();
float minY = std::numeric_limits<float>::max();
float maxX = std::numeric_limits<float>::lowest();
float maxY = std::numeric_limits<float>::lowest();
for (const auto &point : box.bbox) {
float x = (point.x - ratioAndPadding.left) * ratioAndPadding.resizeRatio;
float y = (point.y - ratioAndPadding.top) * ratioAndPadding.resizeRatio;
minX = std::min(minX, x);
minY = std::min(minY, y);
maxX = std::max(maxX, x);
maxY = std::max(maxY, y);
}
boxList.emplace_back(
box.bbox,
std::array<types::Point, 2>{types::Point{minX, minY},
types::Point{maxX, maxY}},
converter.decodeGreedy(predictionIndices, predictionIndices.size())[0],
confidenceScore);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ struct PaddingInfo {
};

struct OCRDetection {
std::array<types::Point, 4> bbox;
std::array<types::Point, 2> bbox;
std::string text;
float score;
};
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#include "VerticalOCR.h"
#include <algorithm>
#include <limits>
#include <rnexecutorch/Error.h>
#include <rnexecutorch/ErrorCodes.h>
#include <rnexecutorch/data_processing/ImageProcessing.h>
Expand Down Expand Up @@ -73,6 +75,13 @@ VerticalOCR::generateFromFrame(jsi::Runtime &runtime,
for (auto &det : detections) {
::rnexecutorch::utils::inverseRotatePoints(det.bbox, orient,
rotated.size());
// Re-normalize to a proper AABB after the coordinate rotation.
float minX = std::min(det.bbox[0].x, det.bbox[1].x);
float minY = std::min(det.bbox[0].y, det.bbox[1].y);
float maxX = std::max(det.bbox[0].x, det.bbox[1].x);
float maxY = std::max(det.bbox[0].y, det.bbox[1].y);
det.bbox[0] = {minX, minY};
det.bbox[1] = {maxX, maxY};
}
return detections;
}
Expand Down Expand Up @@ -204,16 +213,24 @@ types::OCRDetection VerticalOCR::_processSingleTextBox(
: _handleJointCharacters(box, originalImage, characterBoxes,
paddingsBox, imagePaddings);
}
// Modify the returned boxes to match the original image size
std::array<types::Point, 4> finalBbox;
// Modify the returned boxes to match the original image size. Compute the
// axis-aligned bounding box (AABB) from the four rotated corners and store
// only the top-left and bottom-right points.
float minX = std::numeric_limits<float>::max();
float minY = std::numeric_limits<float>::max();
float maxX = std::numeric_limits<float>::lowest();
float maxY = std::numeric_limits<float>::lowest();
for (size_t i = 0; i < box.bbox.size(); ++i) {
finalBbox[i].x =
(box.bbox[i].x - imagePaddings.left) * imagePaddings.resizeRatio;
finalBbox[i].y =
(box.bbox[i].y - imagePaddings.top) * imagePaddings.resizeRatio;
float x = (box.bbox[i].x - imagePaddings.left) * imagePaddings.resizeRatio;
float y = (box.bbox[i].y - imagePaddings.top) * imagePaddings.resizeRatio;
minX = std::min(minX, x);
minY = std::min(minY, y);
maxX = std::max(maxX, x);
maxY = std::max(maxY, y);
}

return {finalBbox, text, confidenceScore};
return {{types::Point{minX, minY}, types::Point{maxX, maxY}}, text,
confidenceScore};
}

void VerticalOCR::unload() noexcept {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,12 +100,12 @@ TEST(OCRGenerateTests, DetectionsHaveValidBoundingBoxes) {
auto results = model.generateFromString(kValidTestImagePath);

for (const auto &detection : results) {
// Each bbox should have 4 points
EXPECT_EQ(detection.bbox.size(), 4u);
for (const auto &point : detection.bbox) {
EXPECT_GE(point.x, 0.0f);
EXPECT_GE(point.y, 0.0f);
}
// Each bbox has 2 points: top-left [0] and bottom-right [1]
EXPECT_EQ(detection.bbox.size(), 2u);
EXPECT_GE(detection.bbox[0].x, 0.0f);
EXPECT_GE(detection.bbox[0].y, 0.0f);
EXPECT_GE(detection.bbox[1].x, detection.bbox[0].x);
EXPECT_GE(detection.bbox[1].y, detection.bbox[0].y);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -117,11 +117,12 @@ TEST(VerticalOCRGenerateTests, IndependentCharsDetectionsHaveValidBBoxes) {
auto results = model.generateFromString(kValidVerticalTestImagePath);

for (const auto &detection : results) {
EXPECT_EQ(detection.bbox.size(), 4u);
for (const auto &point : detection.bbox) {
EXPECT_GE(point.x, 0.0f);
EXPECT_GE(point.y, 0.0f);
}
// Each bbox has 2 points: top-left [0] and bottom-right [1]
EXPECT_EQ(detection.bbox.size(), 2u);
EXPECT_GE(detection.bbox[0].x, 0.0f);
EXPECT_GE(detection.bbox[0].y, 0.0f);
EXPECT_GE(detection.bbox[1].x, detection.bbox[0].x);
EXPECT_GE(detection.bbox[1].y, detection.bbox[0].y);
}
}

Expand Down Expand Up @@ -180,11 +181,12 @@ TEST(VerticalOCRGenerateTests, JointCharsDetectionsHaveValidBBoxes) {
auto results = model.generateFromString(kValidVerticalTestImagePath);

for (const auto &detection : results) {
EXPECT_EQ(detection.bbox.size(), 4u);
for (const auto &point : detection.bbox) {
EXPECT_GE(point.x, 0.0f);
EXPECT_GE(point.y, 0.0f);
}
// Each bbox has 2 points: top-left [0] and bottom-right [1]
EXPECT_EQ(detection.bbox.size(), 2u);
EXPECT_GE(detection.bbox[0].x, 0.0f);
EXPECT_GE(detection.bbox[0].y, 0.0f);
EXPECT_GE(detection.bbox[1].x, detection.bbox[0].x);
EXPECT_GE(detection.bbox[1].y, detection.bbox[0].y);
}
}

Expand Down
5 changes: 3 additions & 2 deletions packages/react-native-executorch/src/types/ocr.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@ import { Frame, PixelData, ResourceSource } from './common';
* OCRDetection represents a single detected text instance in an image,
* including its bounding box, recognized text, and confidence score.
* @category Types
* @property {Point[]} bbox - An array of points defining the bounding box around the detected text.
* @property {[Point, Point]} bbox - A tuple of two points defining the axis-aligned bounding box
* around the detected text: `bbox[0]` is the top-left corner and `bbox[1]` is the bottom-right corner.
* @property {string} text - The recognized text within the bounding box.
* @property {number} score - The confidence score of the OCR detection, ranging from 0 to 1.
*/
export interface OCRDetection {
bbox: Point[];
bbox: [Point, Point];
text: string;
score: number;
}
Expand Down
Loading