Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions apps/computer-vision/components/ImageWithOCRBboxes.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -59,13 +59,14 @@ export default function ImageWithOCRBboxes({
{detections.map((detection, index) => {
const { scaleX, scaleY, offsetX, offsetY } =
calculateAdjustedDimensions();
const points = detection.bbox.map((point) => ({
x: point.x * scaleX + offsetX,
y: point.y * scaleY + offsetY,
}));

const pointsString = points
.map((point) => `${point.x},${point.y}`)
const { x1, y1, x2, y2 } = detection.bbox;
const pointsString = [
[x1, y1],
[x2, y1],
[x2, y2],
[x1, y2],
]
.map(([x, y]) => `${x * scaleX + offsetX},${y * scaleY + offsetY}`)
.join(' ');

return (
Expand Down
14 changes: 10 additions & 4 deletions apps/computer-vision/components/vision_camera/tasks/OCRTask.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -110,11 +110,17 @@ export default function OCRTask({
style={StyleSheet.absoluteFill}
>
{detections.map((det, i) => {
const pts = det.bbox
.map((p) => `${p.x * scale + offsetX},${p.y * scale + offsetY}`)
const { x1, y1, x2, y2 } = det.bbox;
const pts = [
[x1, y1],
[x2, y1],
[x2, y2],
[x1, y2],
]
.map(([x, y]) => `${x * scale + offsetX},${y * scale + offsetY}`)
.join(' ');
const labelX = det.bbox[0]!.x * scale + offsetX;
const labelY = det.bbox[0]!.y * scale + offsetY - 4;
const labelX = x1 * scale + offsetX;
const labelY = y1 * scale + offsetY - 4;
return (
<React.Fragment key={i}>
<Polygon points={pts} fill="none" stroke="cyan" strokeWidth={2} />
Expand Down
12 changes: 7 additions & 5 deletions docs/docs/03-hooks/02-computer-vision/useOCR.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,19 +61,21 @@ See the full guide: [VisionCamera Integration](./visioncamera-integration.md).
The detection object is specified as follows:

```typescript
interface Bbox {
  x1: number;
  y1: number;
  x2: number;
  y2: number;
}

interface OCRDetection {
  bbox: Bbox;
  text: string;
  score: number;
}
```
Comment thread
chmjkb marked this conversation as resolved.

The `bbox` property contains the axis-aligned bounding box of the detected text region. `x1`/`y1` is the top-left corner and `x2`/`y2` is the bottom-right corner.
The `text` property contains the text recognized within the detected text region. The `score` represents the confidence score of the recognized text.

## Example
Expand Down
12 changes: 7 additions & 5 deletions docs/docs/03-hooks/02-computer-vision/useVerticalOCR.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,19 +69,21 @@ See the full guide: [VisionCamera Integration](./visioncamera-integration.md).
The detection object is specified as follows:

```typescript
interface Bbox {
  x1: number;
  y1: number;
  x2: number;
  y2: number;
}

interface OCRDetection {
  bbox: Bbox;
  text: string;
  score: number;
}
```

The `bbox` property contains the axis-aligned bounding box of the detected text region. `x1`/`y1` is the top-left corner and `x2`/`y2` is the bottom-right corner.
The `text` property contains the text recognized within the detected text region. The `score` represents the confidence score of the recognized text.

## Example
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -462,10 +462,10 @@ getJsiValue(const std::unordered_map<std::string_view, float> &map,
/// Converts an axis-aligned bounding box into a plain JS object with
/// numeric `x1`/`y1` (top-left) and `x2`/`y2` (bottom-right) properties.
/// @param bbox    box stored as two corner points (p1 = top-left, p2 = bottom-right)
/// @param runtime the JSI runtime used to allocate the result object
/// @return a `jsi::Object` with properties x1, y1, x2, y2
inline jsi::Value getJsiValue(const utils::computer_vision::BBox &bbox,
                              jsi::Runtime &runtime) {
  jsi::Object obj(runtime);
  // BBox stores corner points p1/p2; each JS property is set exactly once
  // (the stale bbox.x1-style accesses referenced members BBox no longer has).
  obj.setProperty(runtime, "x1", bbox.p1.x);
  obj.setProperty(runtime, "y1", bbox.p1.y);
  obj.setProperty(runtime, "x2", bbox.p2.x);
  obj.setProperty(runtime, "y2", bbox.p2.y);
  return obj;
}

Expand Down Expand Up @@ -526,16 +526,8 @@ getJsiValue(const std::vector<models::ocr::types::OCRDetection> &detections,

auto jsiDetectionObject = jsi::Object(runtime);

auto jsiBboxArray = jsi::Array(runtime, 4);
#pragma unroll
for (size_t j = 0; j < 4u; ++j) {
auto jsiPointObject = jsi::Object(runtime);
jsiPointObject.setProperty(runtime, "x", detection.bbox[j].x);
jsiPointObject.setProperty(runtime, "y", detection.bbox[j].y);
jsiBboxArray.setValueAtIndex(runtime, j, jsiPointObject);
}

jsiDetectionObject.setProperty(runtime, "bbox", jsiBboxArray);
jsiDetectionObject.setProperty(runtime, "bbox",
getJsiValue(detection.bbox, runtime));
jsiDetectionObject.setProperty(
runtime, "text", jsi::String::createFromUtf8(runtime, detection.text));
jsiDetectionObject.setProperty(runtime, "score", detection.score);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -161,10 +161,10 @@ cv::Rect BaseInstanceSegmentation::computeMaskCropRect(
const utils::computer_vision::BBox &bboxModel, cv::Size modelInputSize,
cv::Size maskSize) {

float mx1F = bboxModel.x1 * maskSize.width / modelInputSize.width;
float my1F = bboxModel.y1 * maskSize.height / modelInputSize.height;
float mx2F = bboxModel.x2 * maskSize.width / modelInputSize.width;
float my2F = bboxModel.y2 * maskSize.height / modelInputSize.height;
float mx1F = bboxModel.p1.x * maskSize.width / modelInputSize.width;
float my1F = bboxModel.p1.y * maskSize.height / modelInputSize.height;
float mx2F = bboxModel.p2.x * maskSize.width / modelInputSize.width;
float my2F = bboxModel.p2.y * maskSize.height / modelInputSize.height;

int32_t mx1 = std::max(0, static_cast<int32_t>(std::floor(mx1F)));
int32_t my1 = std::max(0, static_cast<int32_t>(std::floor(my1F)));
Expand Down Expand Up @@ -193,8 +193,8 @@ cv::Mat BaseInstanceSegmentation::warpToOriginalResolution(
float scaleY = static_cast<float>(originalSize.height) / maskSize.height;

cv::Mat M = (cv::Mat_<float>(2, 3) << scaleX, 0,
(maskRect.x * scaleX - bboxOriginal.x1), 0, scaleY,
(maskRect.y * scaleY - bboxOriginal.y1));
(maskRect.x * scaleX - bboxOriginal.p1.x), 0, scaleY,
(maskRect.y * scaleY - bboxOriginal.p1.y));

cv::Size bboxSize(static_cast<int32_t>(std::round(bboxOriginal.width())),
static_cast<int32_t>(std::round(bboxOriginal.height())));
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "OCR.h"
#include "Constants.h"
#include <algorithm>
#include <rnexecutorch/Error.h>
#include <rnexecutorch/ErrorCodes.h>
#include <rnexecutorch/data_processing/ImageProcessing.h>
Expand Down Expand Up @@ -67,8 +68,12 @@ OCR::generateFromFrame(jsi::Runtime &runtime, const jsi::Value &frameData) {
cv::Mat rotated = ::rnexecutorch::utils::rotateFrameForModel(bgr, orient);
auto detections = runInference(rotated);
for (auto &det : detections) {
::rnexecutorch::utils::inverseRotatePoints(det.bbox, orient,
rotated.size());
std::array<types::Point, 2> corners = {det.bbox.p1, det.bbox.p2};
::rnexecutorch::utils::inverseRotatePoints(corners, orient, rotated.size());
det.bbox = {{std::min(corners[0].x, corners[1].x),
std::min(corners[0].y, corners[1].y)},
{std::max(corners[0].x, corners[1].x),
std::max(corners[0].y, corners[1].y)}};
}
return detections;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,17 @@ void RecognitionHandler::processBBox(std::vector<types::OCRDetection> &boxList,
/*
Since the boxes were corresponding to the image resized to 1280x1280,
we want to return the boxes shifted and rescaled to match the original
image dimensions.
image dimensions. Compute the axis-aligned bounding box (AABB) from the
four rotated corners and store only the top-left and bottom-right points.
*/
for (auto &point : box.bbox) {
point.x = (point.x - ratioAndPadding.left) * ratioAndPadding.resizeRatio;
point.y = (point.y - ratioAndPadding.top) * ratioAndPadding.resizeRatio;
}
const float ratio = ratioAndPadding.resizeRatio;
const float padLeft = static_cast<float>(ratioAndPadding.left);
const float padTop = static_cast<float>(ratioAndPadding.top);
types::BBox transformedBbox{
{(box.bbox.p1.x - padLeft) * ratio, (box.bbox.p1.y - padTop) * ratio},
{(box.bbox.p2.x - padLeft) * ratio, (box.bbox.p2.y - padTop) * ratio}};
boxList.emplace_back(
box.bbox,
transformedBbox,
converter.decodeGreedy(predictionIndices, predictionIndices.size())[0],
confidenceScore);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,22 +1,19 @@
#pragma once

#include <array>
#include <rnexecutorch/utils/computer_vision/Types.h>
#include <string>
#include <vector>

namespace rnexecutorch::models::ocr::types {
struct Point {
float x;
float y;
};
using namespace rnexecutorch::utils::computer_vision;

// Raw model output pair of values and their indices.
// NOTE(review): presumably parallel arrays (values[i] corresponds to
// indices[i]) — confirm against the call sites that fill this struct.
struct ValuesAndIndices {
std::vector<float> values;
std::vector<int32_t> indices;
};

// Output of the text detector: an axis-aligned bounding box plus the
// rotation angle of the detected text region. The duplicate legacy
// `std::array<Point, 4>` member was removed; `BBox` (two corner points)
// is the representation used throughout the rest of the pipeline.
struct DetectorBBox {
  BBox bbox;   // axis-aligned box; p1 = top-left, p2 = bottom-right
  float angle; // rotation angle of the detected text region
};

Expand All @@ -27,7 +24,7 @@ struct PaddingInfo {
};

// A single OCR result: where the text was found, what was read, and how
// confident the recognizer is. The duplicate legacy four-point member was
// removed; `BBox` (two corner points) is the representation exposed to JS.
struct OCRDetection {
  BBox bbox;        // axis-aligned bounding box of the detected text region
  std::string text; // recognized text
  float score;      // confidence score of the recognized text
};
Expand Down
Loading
Loading