From 2c95b20e85349bbd6788d66cf4dff3836dd54779 Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Tue, 5 May 2026 16:27:10 +0200 Subject: [PATCH 01/18] feat: implement initial FastSAM instance segmentation screen and model integration --- .cspell-wordlist.txt | 1 + apps/computer-vision/app/_layout.tsx | 8 + apps/computer-vision/app/fast_sam/index.tsx | 561 ++++++++++++++++++ apps/computer-vision/app/index.tsx | 6 + .../app/instance_segmentation/index.tsx | 4 + .../src/constants/commonVision.ts | 11 + .../src/constants/modelUrls.ts | 22 + packages/react-native-executorch/src/index.ts | 1 + .../InstanceSegmentationModule.ts | 16 + .../src/types/instanceSegmentation.ts | 4 +- .../src/utils/fastSAMPrompts.ts | 111 ++++ 11 files changed, 744 insertions(+), 1 deletion(-) create mode 100644 apps/computer-vision/app/fast_sam/index.tsx create mode 100644 packages/react-native-executorch/src/utils/fastSAMPrompts.ts diff --git a/.cspell-wordlist.txt b/.cspell-wordlist.txt index 84d006eefe..cbd5f6d67d 100644 --- a/.cspell-wordlist.txt +++ b/.cspell-wordlist.txt @@ -203,3 +203,4 @@ fishjam Fishjam deinitialize Deinitialize +fastsam diff --git a/apps/computer-vision/app/_layout.tsx b/apps/computer-vision/app/_layout.tsx index 03770c2720..7aed6af92f 100644 --- a/apps/computer-vision/app/_layout.tsx +++ b/apps/computer-vision/app/_layout.tsx @@ -189,6 +189,14 @@ export default function _layout() { headerTitleStyle: { color: ColorPalette.primary }, }} /> + ); diff --git a/apps/computer-vision/app/fast_sam/index.tsx b/apps/computer-vision/app/fast_sam/index.tsx new file mode 100644 index 0000000000..d47e538c8a --- /dev/null +++ b/apps/computer-vision/app/fast_sam/index.tsx @@ -0,0 +1,561 @@ +import React, { useContext, useMemo, useRef, useState } from 'react'; +import { + View, + StyleSheet, + Text, + TouchableOpacity, + GestureResponderEvent, + Image, +} from 'react-native'; +import { + Canvas, + Image as SkiaImage, + Rect, + Group, + useImage, + Skia, + AlphaType, + ColorType, +} from '@shopify/react-native-skia'; +import { + useInstanceSegmentation, + FASTSAM_S, + FASTSAM_X, + InstanceSegmentationModelSources, + SegmentedInstance, + FastSAMLabel, + selectByPoint, + selectByBox, + Bbox, +} from 'react-native-executorch'; +import { GeneratingContext } from '../../context'; +import { ModelPicker, ModelOption } from '../../components/ModelPicker'; +import { BottomBar } from '../../components/BottomBar'; +import { StatsBar } from '../../components/StatsBar'; +import Spinner from '../../components/Spinner'; +import ScreenWrapper from '../../ScreenWrapper'; +import { getImage } from '../../utils'; +import ColorPalette from '../../colors'; + +type PromptMode = 'point' | 'box'; + +const MODELS: ModelOption[] = [ + { label: 'FastSAM-s', value: FASTSAM_S }, + { label: 'FastSAM-x', value: FASTSAM_X }, +]; + +export default function FastSAMScreen() { + const { setGlobalGenerating } = useContext(GeneratingContext); + + const [selectedModel, setSelectedModel] = + useState(FASTSAM_S); + const [mode, setMode] = useState('point'); + const [inferenceTime, setInferenceTime] = useState(null); + + const [imageUri, setImageUri] = useState(''); + const [imageSize, setImageSize] = useState({ width: 0, height: 0 }); + + const rawInstancesRef = useRef[]>([]); + const [selection, setSelection] = useState | null>(null); + + const [draftBox, setDraftBox] = useState<{ + x1: number; + y1: number; + x2: number; + y2: number; + } | null>(null); + const boxStartRef = useRef<{ x: number; y: number } | null>(null); + + const sourceLayoutRef = 
useRef({ width: 0, height: 0 }); + const cutoutLayoutRef = useRef({ width: 0, height: 0 }); + const [cutoutLayout, setCutoutLayout] = useState({ width: 0, height: 0 }); + + const { isReady, isGenerating, downloadProgress, forward, error } = + useInstanceSegmentation({ model: selectedModel }); + + React.useEffect(() => { + setGlobalGenerating(isGenerating); + }, [isGenerating, setGlobalGenerating]); + + // ------------------------------------------------------------------------- + // Coordinate conversion (source image box) + // ------------------------------------------------------------------------- + + function touchToImageCoords(touchX: number, touchY: number) { + const { width: cw, height: ch } = sourceLayoutRef.current; + const { width: iw, height: ih } = imageSize; + if (iw === 0 || ih === 0) return null; + const scale = Math.min(cw / iw, ch / ih); + const offsetX = (cw - iw * scale) / 2; + const offsetY = (ch - ih * scale) / 2; + return { + x: (touchX - offsetX) / scale, + y: (touchY - offsetY) / scale, + }; + } + + // ------------------------------------------------------------------------- + // Point prompt + // ------------------------------------------------------------------------- + + function handleTap(e: GestureResponderEvent) { + if (mode !== 'point' || rawInstancesRef.current.length === 0) return; + const coords = touchToImageCoords( + e.nativeEvent.locationX, + e.nativeEvent.locationY + ); + if (!coords) return; + const match = selectByPoint( + rawInstancesRef.current, + Math.round(coords.x), + Math.round(coords.y) + ); + setSelection(match ?? null); + } + + // ------------------------------------------------------------------------- + // Box prompt + // ------------------------------------------------------------------------- + + function handleBoxStart(e: GestureResponderEvent) { + if (mode !== 'box') return; + const coords = touchToImageCoords( + e.nativeEvent.locationX, + e.nativeEvent.locationY + ); + if (!coords) return; + boxStartRef.current = coords; + setDraftBox({ x1: coords.x, y1: coords.y, x2: coords.x, y2: coords.y }); + } + + function handleBoxMove(e: GestureResponderEvent) { + if (mode !== 'box' || !boxStartRef.current) return; + const coords = touchToImageCoords( + e.nativeEvent.locationX, + e.nativeEvent.locationY + ); + if (!coords) return; + const s = boxStartRef.current; + setDraftBox({ + x1: Math.min(s.x, coords.x), + y1: Math.min(s.y, coords.y), + x2: Math.max(s.x, coords.x), + y2: Math.max(s.y, coords.y), + }); + } + + function handleBoxEnd(e: GestureResponderEvent) { + if ( + mode !== 'box' || + !boxStartRef.current || + rawInstancesRef.current.length === 0 + ) { + boxStartRef.current = null; + setDraftBox(null); + return; + } + const coords = touchToImageCoords( + e.nativeEvent.locationX, + e.nativeEvent.locationY + ); + const s = boxStartRef.current; + boxStartRef.current = null; + setDraftBox(null); + if (!coords) return; + const box: Bbox = { + x1: Math.min(s.x, coords.x), + y1: Math.min(s.y, coords.y), + x2: Math.max(s.x, coords.x), + y2: Math.max(s.y, coords.y), + }; + setSelection(selectByBox(rawInstancesRef.current, box) ?? null); + } + + // ------------------------------------------------------------------------- + // Image loading & inference + // ------------------------------------------------------------------------- + + const handleCameraPress = async (isCamera: boolean) => { + const image = await getImage(isCamera); + if (!image?.uri) return; + setImageUri(image.uri); + setImageSize({ width: image.width ?? 0, height: image.height ?? 
0 }); + rawInstancesRef.current = []; + setSelection(null); + setInferenceTime(null); + }; + + const runForward = async () => { + if (!imageUri) return; + try { + const start = Date.now(); + const output = await forward(imageUri, { + confidenceThreshold: 0.4, + iouThreshold: 0.9, + maxInstances: 100, + returnMaskAtOriginalResolution: true, + }); + setInferenceTime(Date.now() - start); + rawInstancesRef.current = output; + setSelection(null); + } catch (e) { + console.error(e); + } + }; + + // ------------------------------------------------------------------------- + // Cutout rendering + // ------------------------------------------------------------------------- + + const skiaSource = useImage(imageUri || null); + + const alphaMask = useMemo(() => { + if (!selection) return null; + return buildAlphaMask( + selection.mask, + selection.maskWidth, + selection.maskHeight, + selection.bbox.x1, + selection.bbox.y1, + imageSize.width, + imageSize.height + ); + }, [selection, imageSize]); + + const { width: cw, height: ch } = cutoutLayout; + const { width: iw, height: ih } = imageSize; + const cutoutScale = + cw > 0 && ch > 0 && iw > 0 && ih > 0 ? Math.min(cw / iw, ch / ih) : 1; + const cutoutOffsetX = (cw - iw * cutoutScale) / 2; + const cutoutOffsetY = (ch - ih * cutoutScale) / 2; + + // Draft box overlay coords (source box) + const { width: scw, height: sch } = sourceLayoutRef.current; + const srcScale = iw > 0 && ih > 0 ? Math.min(scw / iw, sch / ih) : 1; + const srcOffsetX = (scw - iw * srcScale) / 2; + const srcOffsetY = (sch - ih * srcScale) / 2; + + // ------------------------------------------------------------------------- + // Error / loading + // ------------------------------------------------------------------------- + + if (!isReady && error) { + return ( + + + Error Loading Model + {error.message} + + + ); + } + + if (!isReady) { + return ( + + ); + } + + return ( + + {/* ---- Source image box ---- */} + { + const { width, height } = e.nativeEvent.layout; + sourceLayoutRef.current = { width, height }; + }} + onTouchStart={(e) => { + if (mode === 'point') handleTap(e); + else handleBoxStart(e); + }} + onTouchMove={(e) => { + if (mode === 'box') handleBoxMove(e); + }} + onTouchEnd={(e) => { + if (mode === 'box') handleBoxEnd(e); + }} + > + + {!imageUri && ( + + Load an image to get started + + )} + {/* Draft box */} + {draftBox && iw > 0 && ( + + + + )} + + + {/* ---- Cutout box ---- */} + { + const { width, height } = e.nativeEvent.layout; + cutoutLayoutRef.current = { width, height }; + setCutoutLayout({ width, height }); + }} + > + {selection && skiaSource && alphaMask ? ( + + + + + + + + ) : ( + + + {rawInstancesRef.current.length > 0 + ? 'Tap or draw a box on the image above' + : imageUri + ? 'Run inference first' + : ''} + + + )} + + + {/* ---- Controls ---- */} + + + setMode('point')} + > + + Point + + + setMode('box')} + > + + Box + + + + + + { + setSelectedModel(m); + rawInstancesRef.current = []; + setSelection(null); + setInferenceTime(null); + }} + /> + + 0 + ? rawInstancesRef.current.length + : null + } + /> + + + + ); +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +// Builds a full-image alpha mask. `mask` is bbox-relative (maskWidth × maskHeight), +// positioned at (bboxX1, bboxY1) within an image of size (imgW × imgH). 
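+// The mask is nearest-neighbor resampled into a canvas capped at MAX_DIM px on
+// its longest side; covered pixels are written as opaque white RGBA, everything
+// else stays transparent, so the result can be used directly as a Skia cutout mask.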
+function buildAlphaMask( + mask: Uint8Array, + maskWidth: number, + maskHeight: number, + bboxX1: number, + bboxY1: number, + imgW: number, + imgH: number +) { + const MAX_DIM = 512; + const ds = Math.min(1, MAX_DIM / Math.max(imgW, imgH)); + const dstW = Math.max(1, Math.round(imgW * ds)); + const dstH = Math.max(1, Math.round(imgH * ds)); + + const pixels = new Uint8Array(dstW * dstH * 4); + + // Place the bbox-relative mask into the full-image canvas + const offX = Math.round(bboxX1 * ds); + const offY = Math.round(bboxY1 * ds); + const scaledMaskW = Math.max(1, Math.round(maskWidth * ds)); + const scaledMaskH = Math.max(1, Math.round(maskHeight * ds)); + + for (let dy = 0; dy < scaledMaskH; dy++) { + const sy = Math.min( + Math.floor((dy / scaledMaskH) * maskHeight), + maskHeight - 1 + ); + for (let dx = 0; dx < scaledMaskW; dx++) { + const sx = Math.min( + Math.floor((dx / scaledMaskW) * maskWidth), + maskWidth - 1 + ); + if (mask[sy * maskWidth + sx] > 0) { + const imgX = offX + dx; + const imgY = offY + dy; + if (imgX >= 0 && imgX < dstW && imgY >= 0 && imgY < dstH) { + const i = (imgY * dstW + imgX) * 4; + pixels[i] = 255; + pixels[i + 1] = 255; + pixels[i + 2] = 255; + pixels[i + 3] = 255; + } + } + } + } + + const data = Skia.Data.fromBytes(pixels); + const img = Skia.Image.MakeImage( + { + width: dstW, + height: dstH, + alphaType: AlphaType.Premul, + colorType: ColorType.RGBA_8888, + }, + data, + dstW * 4 + ); + data.dispose(); + return img; +} + +// --------------------------------------------------------------------------- +// Styles +// --------------------------------------------------------------------------- + +const styles = StyleSheet.create({ + imageBox: { + flex: 1, + width: '100%', + borderBottomWidth: 1, + borderBottomColor: '#e0e0e0', + }, + image: { + width: '100%', + height: '100%', + }, + hint: { + ...StyleSheet.absoluteFillObject, + justifyContent: 'center', + alignItems: 'center', + }, + hintText: { + fontSize: 14, + color: '#aaa', + }, + controls: { + flexDirection: 'row', + alignItems: 'center', + paddingHorizontal: 16, + paddingVertical: 10, + borderTopWidth: 1, + borderTopColor: '#e0e0e0', + }, + modeToggle: { + flexDirection: 'row', + borderRadius: 8, + overflow: 'hidden', + borderWidth: 1, + borderColor: ColorPalette.primary, + }, + modeBtn: { + paddingHorizontal: 20, + paddingVertical: 8, + backgroundColor: '#fff', + }, + modeBtnActive: { + backgroundColor: ColorPalette.primary, + }, + modeBtnText: { + fontSize: 14, + fontWeight: '600', + color: ColorPalette.primary, + }, + modeBtnTextActive: { + color: '#fff', + }, + errorContainer: { + flex: 1, + justifyContent: 'center', + alignItems: 'center', + padding: 32, + }, + errorTitle: { + fontSize: 20, + fontWeight: '700', + color: '#e74c3c', + marginBottom: 12, + }, + errorText: { + fontSize: 14, + color: '#555', + textAlign: 'center', + }, +}); diff --git a/apps/computer-vision/app/index.tsx b/apps/computer-vision/app/index.tsx index 15b9d8650b..690ebfb331 100644 --- a/apps/computer-vision/app/index.tsx +++ b/apps/computer-vision/app/index.tsx @@ -47,6 +47,12 @@ export default function Home() { > Pose Estimation + router.navigate('fast_sam/')} + > + FastSAM + router.navigate('ocr/')} diff --git a/apps/computer-vision/app/instance_segmentation/index.tsx b/apps/computer-vision/app/instance_segmentation/index.tsx index dba53875e5..f669c383d5 100644 --- a/apps/computer-vision/app/instance_segmentation/index.tsx +++ b/apps/computer-vision/app/instance_segmentation/index.tsx @@ -11,6 +11,8 @@ import { 
YOLO26X_SEG, RF_DETR_NANO_SEG, InstanceSegmentationModelSources, + FASTSAM_S, + FASTSAM_X, } from 'react-native-executorch'; import { View, @@ -35,6 +37,8 @@ const MODELS: ModelOption[] = [ { label: 'Yolo26L', value: YOLO26L_SEG }, { label: 'Yolo26X', value: YOLO26X_SEG }, { label: 'RF-DeTR Nano', value: RF_DETR_NANO_SEG }, + { label: 'FastSAM-S', value: FASTSAM_S }, + { label: 'FastSAM-X', value: FASTSAM_X }, ]; export default function InstanceSegmentationScreen() { diff --git a/packages/react-native-executorch/src/constants/commonVision.ts b/packages/react-native-executorch/src/constants/commonVision.ts index ecea0f8069..bac8e8c520 100644 --- a/packages/react-native-executorch/src/constants/commonVision.ts +++ b/packages/react-native-executorch/src/constants/commonVision.ts @@ -118,6 +118,17 @@ export enum CocoLabel { * @see {@link CocoLabel} for the RF-DETR / SSDLite variant * @category Types */ +/** + * Class label for FastSAM models. + * + * FastSAM is class-agnostic and produces a single "object" class for every + * detected region. Use this enum when working with `fastsam-s` or `fastsam-x`. + * @category Types + */ +export enum FastSAMLabel { + OBJECT = 0, +} + export enum CocoLabelYolo { PERSON = 0, BICYCLE = 1, diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts index 6895601e1e..6c24934176 100644 --- a/packages/react-native-executorch/src/constants/modelUrls.ts +++ b/packages/react-native-executorch/src/constants/modelUrls.ts @@ -1010,6 +1010,26 @@ export const SELFIE_SEGMENTATION = { modelSource: SELFIE_SEGMENTATION_MODEL, } as const; +// FastSAM Instance Segmentation +const FASTSAM_S_SEG_MODEL = `${URL_PREFIX}-fast-sam/${NEXT_VERSION_TAG}/fastsam-s/xnnpack/fastsam_s_xnnpack_fp32.pte`; +const FASTSAM_X_SEG_MODEL = `${URL_PREFIX}-fast-sam/${NEXT_VERSION_TAG}/fastsam-x/xnnpack/fastsam_x_xnnpack_fp32.pte`; + +/** + * @category Models - Instance Segmentation + */ +export const FASTSAM_S = { + modelName: 'fastsam-s', + modelSource: FASTSAM_S_SEG_MODEL, +} as const; + +/** + * @category Models - Instance Segmentation + */ +export const FASTSAM_X = { + modelName: 'fastsam-x', + modelSource: FASTSAM_X_SEG_MODEL, +} as const; + /** * @category Models - Instance Segmentation */ @@ -1352,6 +1372,8 @@ export const MODEL_REGISTRY = { YOLO26L_SEG, YOLO26X_SEG, RF_DETR_NANO_SEG, + FASTSAM_S, + FASTSAM_X, CLIP_VIT_BASE_PATCH32_IMAGE, CLIP_VIT_BASE_PATCH32_IMAGE_QUANTIZED, ALL_MINILM_L6_V2, diff --git a/packages/react-native-executorch/src/index.ts b/packages/react-native-executorch/src/index.ts index 96d167a7d2..6c88de830e 100644 --- a/packages/react-native-executorch/src/index.ts +++ b/packages/react-native-executorch/src/index.ts @@ -212,6 +212,7 @@ export * from './utils/BaseResourceFetcherClass'; export * from './utils/llm'; export * from './common/Logger'; export * from './utils/llms/context_strategy'; +export * from './utils/fastSAMPrompts'; // types export * from './types/objectDetection'; diff --git a/packages/react-native-executorch/src/modules/computer_vision/InstanceSegmentationModule.ts b/packages/react-native-executorch/src/modules/computer_vision/InstanceSegmentationModule.ts index 2e70e6bdec..e7e96f2deb 100644 --- a/packages/react-native-executorch/src/modules/computer_vision/InstanceSegmentationModule.ts +++ b/packages/react-native-executorch/src/modules/computer_vision/InstanceSegmentationModule.ts @@ -23,6 +23,7 @@ import { import { CocoLabel, CocoLabelYolo, + FastSAMLabel, IMAGENET1K_MEAN, 
IMAGENET1K_STD, } from '../../constants/commonVision'; @@ -39,6 +40,18 @@ const YOLO_SEG_CONFIG = { }, } satisfies InstanceSegmentationConfig; +const FASTSAM_CONFIG = { + preprocessorConfig: undefined, + labelMap: FastSAMLabel, + availableInputSizes: undefined, + defaultInputSize: undefined, + defaultConfidenceThreshold: 0.5, + defaultIouThreshold: 0.9, + postprocessorConfig: { + applyNMS: true, + }, +} satisfies InstanceSegmentationConfig; + const RF_DETR_NANO_SEG_CONFIG = { preprocessorConfig: { normMean: IMAGENET1K_MEAN, normStd: IMAGENET1K_STD }, labelMap: CocoLabel, @@ -81,10 +94,13 @@ const ModelConfigs = { 'yolo26l-seg': YOLO_SEG_CONFIG, 'yolo26x-seg': YOLO_SEG_CONFIG, 'rfdetr-nano-seg': RF_DETR_NANO_SEG_CONFIG, + 'fastsam-s': FASTSAM_CONFIG, + 'fastsam-x': FASTSAM_CONFIG, } as const satisfies Record< InstanceSegmentationModelName, | InstanceSegmentationConfig | InstanceSegmentationConfig + | InstanceSegmentationConfig >; /** @internal */ diff --git a/packages/react-native-executorch/src/types/instanceSegmentation.ts b/packages/react-native-executorch/src/types/instanceSegmentation.ts index 869f0cdcd7..ff7f4ae314 100644 --- a/packages/react-native-executorch/src/types/instanceSegmentation.ts +++ b/packages/react-native-executorch/src/types/instanceSegmentation.ts @@ -114,7 +114,9 @@ export type InstanceSegmentationModelSources = | { modelName: 'yolo26m-seg'; modelSource: ResourceSource } | { modelName: 'yolo26l-seg'; modelSource: ResourceSource } | { modelName: 'yolo26x-seg'; modelSource: ResourceSource } - | { modelName: 'rfdetr-nano-seg'; modelSource: ResourceSource }; + | { modelName: 'rfdetr-nano-seg'; modelSource: ResourceSource } + | { modelName: 'fastsam-s'; modelSource: ResourceSource } + | { modelName: 'fastsam-x'; modelSource: ResourceSource }; /** * Union of all built-in instance segmentation model names. diff --git a/packages/react-native-executorch/src/utils/fastSAMPrompts.ts b/packages/react-native-executorch/src/utils/fastSAMPrompts.ts new file mode 100644 index 0000000000..61f799ee40 --- /dev/null +++ b/packages/react-native-executorch/src/utils/fastSAMPrompts.ts @@ -0,0 +1,111 @@ +import { LabelEnum } from '../types/common'; +import { Bbox } from '../types/objectDetection'; +import { SegmentedInstance } from '../types/instanceSegmentation'; + +/** + * Selects the best matching instance for a given point prompt. + * + * Finds all instances whose mask covers the point (x, y), then returns the one + * with the smallest mask area (ties broken by box area, then confidence). This + * matches the behavior of FastSAM's point-prompt selection. + * @param instances - Array of segmented instances returned by `forward()`. + * @param x - X coordinate in original image space. + * @param y - Y coordinate in original image space. + * @returns The best matching instance, or `null` if no mask covers the point. 
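+ * @example
+ * A minimal usage sketch (the touch point below is a hypothetical coordinate in
+ * original image space; `model` is a loaded `useInstanceSegmentation` hook):
+ * ```ts
+ * const instances = await model.forward(imageUri);
+ * const hit = selectByPoint(instances, 240, 135);
+ * if (hit) console.log('selected bbox:', hit.bbox);
+ * ```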
+ */ +export function selectByPoint( + instances: SegmentedInstance[], + x: number, + y: number +): SegmentedInstance | null { + const px = Math.round(x); + const py = Math.round(y); + + const matches = instances.filter((inst) => { + const mx = px - Math.round(inst.bbox.x1); + const my = py - Math.round(inst.bbox.y1); + if (mx < 0 || my < 0 || mx >= inst.maskWidth || my >= inst.maskHeight) { + return false; + } + return inst.mask[my * inst.maskWidth + mx] === 1; + }); + + if (matches.length === 0) return null; + + return matches.reduce((best, inst) => { + const maskArea = countMaskPixels(inst.mask); + const bestMaskArea = countMaskPixels(best.mask); + if (maskArea !== bestMaskArea) return maskArea < bestMaskArea ? inst : best; + + const boxArea = bboxArea(inst.bbox); + const bestBoxArea = bboxArea(best.bbox); + if (boxArea !== bestBoxArea) return boxArea < bestBoxArea ? inst : best; + + return inst.score > best.score ? inst : best; + }); +} + +/** + * Selects the best matching instance for a given box prompt. + * + * Finds all instances that overlap with the prompt box, then returns the one + * with the highest IoU with that box (ties broken by smallest mask area, then + * highest confidence). This matches the behavior of FastSAM's box-prompt + * selection. + * @param instances - Array of segmented instances returned by `forward()`. + * @param box - The prompt bounding box in image coordinates. + * @returns The best matching instance, or `null` if no instance overlaps. + */ +export function selectByBox( + instances: SegmentedInstance[], + box: Bbox +): SegmentedInstance | null { + const { x1: px1, y1: py1, x2: px2, y2: py2 } = box; + const promptArea = Math.max(px2 - px1, 0) * Math.max(py2 - py1, 0); + + type Match = { + iou: number; + maskArea: number; + score: number; + inst: SegmentedInstance; + }; + let best: Match | null = null; + + for (const inst of instances) { + const { x1, y1, x2, y2 } = inst.bbox; + const interX1 = Math.max(px1, x1); + const interY1 = Math.max(py1, y1); + const interX2 = Math.min(px2, x2); + const interY2 = Math.min(py2, y2); + const interArea = + Math.max(interX2 - interX1, 0) * Math.max(interY2 - interY1, 0); + if (interArea <= 0) continue; + + const detArea = bboxArea(inst.bbox); + const iou = interArea / (promptArea + detArea - interArea + 1e-7); + const maskArea = countMaskPixels(inst.mask); + + if ( + best === null || + iou > best.iou || + (iou === best.iou && maskArea < best.maskArea) || + (iou === best.iou && + maskArea === best.maskArea && + inst.score > best.score) + ) { + best = { iou, maskArea, score: inst.score, inst }; + } + } + + return best?.inst ?? 
null; +} + +function countMaskPixels(mask: Uint8Array): number { + let count = 0; + for (let i = 0; i < mask.length; i++) count += mask[i]!; + return count; +} + +function bboxArea(bbox: Bbox): number { + return Math.max(bbox.x2 - bbox.x1, 0) * Math.max(bbox.y2 - bbox.y1, 0); +} From a8968324b4b7f710ceef745f2560e5bb03800bb5 Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Tue, 5 May 2026 17:40:14 +0200 Subject: [PATCH 02/18] feat: optimize FastSAM selection algorithms and improve performance logging --- apps/computer-vision/app/fast_sam/index.tsx | 24 ++++++++++++----- .../src/utils/fastSAMPrompts.ts | 26 +++---------------- 2 files changed, 22 insertions(+), 28 deletions(-) diff --git a/apps/computer-vision/app/fast_sam/index.tsx b/apps/computer-vision/app/fast_sam/index.tsx index d47e538c8a..b8fa556cb9 100644 --- a/apps/computer-vision/app/fast_sam/index.tsx +++ b/apps/computer-vision/app/fast_sam/index.tsx @@ -107,11 +107,13 @@ export default function FastSAMScreen() { e.nativeEvent.locationY ); if (!coords) return; + const t0 = Date.now(); const match = selectByPoint( rawInstancesRef.current, Math.round(coords.x), Math.round(coords.y) ); + console.log(`[FastSAM] selectByPoint(): ${Date.now() - t0}ms`); setSelection(match ?? null); } @@ -170,7 +172,10 @@ export default function FastSAMScreen() { x2: Math.max(s.x, coords.x), y2: Math.max(s.y, coords.y), }; - setSelection(selectByBox(rawInstancesRef.current, box) ?? null); + const t0 = Date.now(); + const match = selectByBox(rawInstancesRef.current, box); + console.log(`[FastSAM] selectByBox(): ${Date.now() - t0}ms`); + setSelection(match ?? null); } // ------------------------------------------------------------------------- @@ -190,14 +195,18 @@ export default function FastSAMScreen() { const runForward = async () => { if (!imageUri) return; try { - const start = Date.now(); + const t0 = Date.now(); const output = await forward(imageUri, { confidenceThreshold: 0.4, iouThreshold: 0.9, - maxInstances: 100, + maxInstances: 50, returnMaskAtOriginalResolution: true, }); - setInferenceTime(Date.now() - start); + const inferenceMs = Date.now() - t0; + console.log( + `[FastSAM] forward(): ${inferenceMs}ms, instances: ${output.length}` + ); + setInferenceTime(inferenceMs); rawInstancesRef.current = output; setSelection(null); } catch (e) { @@ -213,7 +222,8 @@ export default function FastSAMScreen() { const alphaMask = useMemo(() => { if (!selection) return null; - return buildAlphaMask( + const t0 = Date.now(); + const mask = buildAlphaMask( selection.mask, selection.maskWidth, selection.maskHeight, @@ -222,6 +232,8 @@ export default function FastSAMScreen() { imageSize.width, imageSize.height ); + console.log(`[FastSAM] buildAlphaMask(): ${Date.now() - t0}ms`); + return mask; }, [selection, imageSize]); const { width: cw, height: ch } = cutoutLayout; @@ -434,7 +446,7 @@ function buildAlphaMask( imgW: number, imgH: number ) { - const MAX_DIM = 512; + const MAX_DIM = 256; const ds = Math.min(1, MAX_DIM / Math.max(imgW, imgH)); const dstW = Math.max(1, Math.round(imgW * ds)); const dstH = Math.max(1, Math.round(imgH * ds)); diff --git a/packages/react-native-executorch/src/utils/fastSAMPrompts.ts b/packages/react-native-executorch/src/utils/fastSAMPrompts.ts index 61f799ee40..b2ee2f29c4 100644 --- a/packages/react-native-executorch/src/utils/fastSAMPrompts.ts +++ b/packages/react-native-executorch/src/utils/fastSAMPrompts.ts @@ -6,8 +6,7 @@ import { SegmentedInstance } from '../types/instanceSegmentation'; * Selects the best matching instance for 
a given point prompt. * * Finds all instances whose mask covers the point (x, y), then returns the one - * with the smallest mask area (ties broken by box area, then confidence). This - * matches the behavior of FastSAM's point-prompt selection. + * with the smallest bounding box area (ties broken by highest confidence). * @param instances - Array of segmented instances returned by `forward()`. * @param x - X coordinate in original image space. * @param y - Y coordinate in original image space. @@ -33,10 +32,6 @@ export function selectByPoint( if (matches.length === 0) return null; return matches.reduce((best, inst) => { - const maskArea = countMaskPixels(inst.mask); - const bestMaskArea = countMaskPixels(best.mask); - if (maskArea !== bestMaskArea) return maskArea < bestMaskArea ? inst : best; - const boxArea = bboxArea(inst.bbox); const bestBoxArea = bboxArea(best.bbox); if (boxArea !== bestBoxArea) return boxArea < bestBoxArea ? inst : best; @@ -49,9 +44,7 @@ export function selectByPoint( * Selects the best matching instance for a given box prompt. * * Finds all instances that overlap with the prompt box, then returns the one - * with the highest IoU with that box (ties broken by smallest mask area, then - * highest confidence). This matches the behavior of FastSAM's box-prompt - * selection. + * with the highest IoU with that box (ties broken by highest confidence). * @param instances - Array of segmented instances returned by `forward()`. * @param box - The prompt bounding box in image coordinates. * @returns The best matching instance, or `null` if no instance overlaps. @@ -65,7 +58,6 @@ export function selectByBox( type Match = { iou: number; - maskArea: number; score: number; inst: SegmentedInstance; }; @@ -83,29 +75,19 @@ export function selectByBox( const detArea = bboxArea(inst.bbox); const iou = interArea / (promptArea + detArea - interArea + 1e-7); - const maskArea = countMaskPixels(inst.mask); if ( best === null || iou > best.iou || - (iou === best.iou && maskArea < best.maskArea) || - (iou === best.iou && - maskArea === best.maskArea && - inst.score > best.score) + (iou === best.iou && inst.score > best.score) ) { - best = { iou, maskArea, score: inst.score, inst }; + best = { iou, score: inst.score, inst }; } } return best?.inst ?? null; } -function countMaskPixels(mask: Uint8Array): number { - let count = 0; - for (let i = 0; i < mask.length; i++) count += mask[i]!; - return count; -} - function bboxArea(bbox: Bbox): number { return Math.max(bbox.x2 - bbox.x1, 0) * Math.max(bbox.y2 - bbox.y1, 0); } From fe425df0d9454468400fae6b00c9a4dd35e0f2ff Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Tue, 5 May 2026 17:46:02 +0200 Subject: [PATCH 03/18] refactor: move FastSAMLabel enum definition to a more appropriate location --- .../src/constants/commonVision.ts | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/packages/react-native-executorch/src/constants/commonVision.ts b/packages/react-native-executorch/src/constants/commonVision.ts index bac8e8c520..6221d5701e 100644 --- a/packages/react-native-executorch/src/constants/commonVision.ts +++ b/packages/react-native-executorch/src/constants/commonVision.ts @@ -118,17 +118,6 @@ export enum CocoLabel { * @see {@link CocoLabel} for the RF-DETR / SSDLite variant * @category Types */ -/** - * Class label for FastSAM models. - * - * FastSAM is class-agnostic and produces a single "object" class for every - * detected region. Use this enum when working with `fastsam-s` or `fastsam-x`. 
- * @category Types - */ -export enum FastSAMLabel { - OBJECT = 0, -} - export enum CocoLabelYolo { PERSON = 0, BICYCLE = 1, @@ -211,3 +200,14 @@ export enum CocoLabelYolo { HAIR_DRIER = 78, TOOTHBRUSH = 79, } + +/** + * Class label for FastSAM models. + * + * FastSAM is class-agnostic and produces a single "object" class for every + * detected region. Use this enum when working with `fastsam-s` or `fastsam-x`. + * @category Types + */ +export enum FastSAMLabel { + OBJECT = 0, +} From fd94f6a8861efa090a01f7760bb55411d806ecb8 Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Wed, 6 May 2026 22:48:53 +0200 Subject: [PATCH 04/18] feat: add text prompts and modify the example app --- apps/computer-vision/app/_layout.tsx | 4 +- apps/computer-vision/app/fast_sam/index.tsx | 661 ++++++++---------- apps/computer-vision/app/index.tsx | 2 +- packages/react-native-executorch/src/index.ts | 2 +- ...AMPrompts.ts => segmentAnythingPrompts.ts} | 52 ++ 5 files changed, 365 insertions(+), 356 deletions(-) rename packages/react-native-executorch/src/utils/{fastSAMPrompts.ts => segmentAnythingPrompts.ts} (62%) diff --git a/apps/computer-vision/app/_layout.tsx b/apps/computer-vision/app/_layout.tsx index 7aed6af92f..bb7ab7165e 100644 --- a/apps/computer-vision/app/_layout.tsx +++ b/apps/computer-vision/app/_layout.tsx @@ -192,8 +192,8 @@ export default function _layout() { diff --git a/apps/computer-vision/app/fast_sam/index.tsx b/apps/computer-vision/app/fast_sam/index.tsx index b8fa556cb9..4118bf3419 100644 --- a/apps/computer-vision/app/fast_sam/index.tsx +++ b/apps/computer-vision/app/fast_sam/index.tsx @@ -1,31 +1,33 @@ -import React, { useContext, useMemo, useRef, useState } from 'react'; +import React, { useContext, useEffect, useRef, useState } from 'react'; import { View, StyleSheet, Text, + TextInput, TouchableOpacity, GestureResponderEvent, - Image, } from 'react-native'; import { Canvas, - Image as SkiaImage, Rect, - Group, - useImage, Skia, - AlphaType, - ColorType, + useImage, + type SkImage, } from '@shopify/react-native-skia'; import { useInstanceSegmentation, + useImageEmbeddings, + useTextEmbeddings, FASTSAM_S, FASTSAM_X, + CLIP_VIT_BASE_PATCH32_IMAGE_QUANTIZED, + CLIP_VIT_BASE_PATCH32_TEXT, InstanceSegmentationModelSources, SegmentedInstance, FastSAMLabel, selectByPoint, selectByBox, + selectByText, Bbox, } from 'react-native-executorch'; import { GeneratingContext } from '../../context'; @@ -34,17 +36,21 @@ import { BottomBar } from '../../components/BottomBar'; import { StatsBar } from '../../components/StatsBar'; import Spinner from '../../components/Spinner'; import ScreenWrapper from '../../ScreenWrapper'; +import ImageWithMasks, { + buildDisplayInstances, + DisplayInstance, +} from '../../components/ImageWithMasks'; import { getImage } from '../../utils'; import ColorPalette from '../../colors'; -type PromptMode = 'point' | 'box'; +type PromptMode = 'point' | 'box' | 'text'; const MODELS: ModelOption[] = [ - { label: 'FastSAM-s', value: FASTSAM_S }, - { label: 'FastSAM-x', value: FASTSAM_X }, + { label: 'FastSAM-S', value: FASTSAM_S }, + { label: 'FastSAM-X', value: FASTSAM_X }, ]; -export default function FastSAMScreen() { +export default function SegmentAnythingScreen() { const { setGlobalGenerating } = useContext(GeneratingContext); const [selectedModel, setSelectedModel] = @@ -56,131 +62,149 @@ export default function FastSAMScreen() { const [imageSize, setImageSize] = useState({ width: 0, height: 0 }); const rawInstancesRef = useRef[]>([]); - const [selection, setSelection] = 
useState | null>(null); - - const [draftBox, setDraftBox] = useState<{ - x1: number; - y1: number; - x2: number; - y2: number; - } | null>(null); - const boxStartRef = useRef<{ x: number; y: number } | null>(null); + const [selection, setSelection] = useState([]); - const sourceLayoutRef = useRef({ width: 0, height: 0 }); - const cutoutLayoutRef = useRef({ width: 0, height: 0 }); - const [cutoutLayout, setCutoutLayout] = useState({ width: 0, height: 0 }); + const [draftBox, setDraftBox] = useState(null); + const boxStartRef = useRef<{ x: number; y: number } | null>(null); + const layoutRef = useRef({ width: 0, height: 0 }); const { isReady, isGenerating, downloadProgress, forward, error } = useInstanceSegmentation({ model: selectedModel }); - React.useEffect(() => { + const clipImage = useImageEmbeddings({ + model: CLIP_VIT_BASE_PATCH32_IMAGE_QUANTIZED, + }); + const clipText = useTextEmbeddings({ model: CLIP_VIT_BASE_PATCH32_TEXT }); + const skiaSource = useImage(imageUri || null); + + const [textPrompt, setTextPrompt] = useState(''); + const [textBusy, setTextBusy] = useState(false); + const [embeddingProgress, setEmbeddingProgress] = useState<{ + done: number; + total: number; + } | null>(null); + const instanceEmbeddingsRef = useRef(null); + + useEffect(() => { setGlobalGenerating(isGenerating); }, [isGenerating, setGlobalGenerating]); - // ------------------------------------------------------------------------- - // Coordinate conversion (source image box) - // ------------------------------------------------------------------------- + function applyMatch( + match: SegmentedInstance | null + ): void { + setSelection(match ? buildDisplayInstances([match]) : []); + } function touchToImageCoords(touchX: number, touchY: number) { - const { width: cw, height: ch } = sourceLayoutRef.current; + const { width: cw, height: ch } = layoutRef.current; const { width: iw, height: ih } = imageSize; if (iw === 0 || ih === 0) return null; const scale = Math.min(cw / iw, ch / ih); - const offsetX = (cw - iw * scale) / 2; - const offsetY = (ch - ih * scale) / 2; return { - x: (touchX - offsetX) / scale, - y: (touchY - offsetY) / scale, + x: (touchX - (cw - iw * scale) / 2) / scale, + y: (touchY - (ch - ih * scale) / 2) / scale, }; } - // ------------------------------------------------------------------------- - // Point prompt - // ------------------------------------------------------------------------- - function handleTap(e: GestureResponderEvent) { if (mode !== 'point' || rawInstancesRef.current.length === 0) return; - const coords = touchToImageCoords( + const c = touchToImageCoords( e.nativeEvent.locationX, e.nativeEvent.locationY ); - if (!coords) return; - const t0 = Date.now(); - const match = selectByPoint( - rawInstancesRef.current, - Math.round(coords.x), - Math.round(coords.y) + if (!c) return; + applyMatch( + selectByPoint(rawInstancesRef.current, Math.round(c.x), Math.round(c.y)) ); - console.log(`[FastSAM] selectByPoint(): ${Date.now() - t0}ms`); - setSelection(match ?? 
null); } - // ------------------------------------------------------------------------- - // Box prompt - // ------------------------------------------------------------------------- - function handleBoxStart(e: GestureResponderEvent) { if (mode !== 'box') return; - const coords = touchToImageCoords( + const c = touchToImageCoords( e.nativeEvent.locationX, e.nativeEvent.locationY ); - if (!coords) return; - boxStartRef.current = coords; - setDraftBox({ x1: coords.x, y1: coords.y, x2: coords.x, y2: coords.y }); + if (!c) return; + boxStartRef.current = c; + setDraftBox({ x1: c.x, y1: c.y, x2: c.x, y2: c.y }); } function handleBoxMove(e: GestureResponderEvent) { if (mode !== 'box' || !boxStartRef.current) return; - const coords = touchToImageCoords( + const c = touchToImageCoords( e.nativeEvent.locationX, e.nativeEvent.locationY ); - if (!coords) return; + if (!c) return; const s = boxStartRef.current; setDraftBox({ - x1: Math.min(s.x, coords.x), - y1: Math.min(s.y, coords.y), - x2: Math.max(s.x, coords.x), - y2: Math.max(s.y, coords.y), + x1: Math.min(s.x, c.x), + y1: Math.min(s.y, c.y), + x2: Math.max(s.x, c.x), + y2: Math.max(s.y, c.y), }); } function handleBoxEnd(e: GestureResponderEvent) { - if ( - mode !== 'box' || - !boxStartRef.current || - rawInstancesRef.current.length === 0 - ) { - boxStartRef.current = null; - setDraftBox(null); - return; - } - const coords = touchToImageCoords( + if (mode !== 'box' || !boxStartRef.current) return; + const c = touchToImageCoords( e.nativeEvent.locationX, e.nativeEvent.locationY ); const s = boxStartRef.current; boxStartRef.current = null; setDraftBox(null); - if (!coords) return; - const box: Bbox = { - x1: Math.min(s.x, coords.x), - y1: Math.min(s.y, coords.y), - x2: Math.max(s.x, coords.x), - y2: Math.max(s.y, coords.y), - }; - const t0 = Date.now(); - const match = selectByBox(rawInstancesRef.current, box); - console.log(`[FastSAM] selectByBox(): ${Date.now() - t0}ms`); - setSelection(match ?? null); + if (!c || rawInstancesRef.current.length === 0) return; + applyMatch( + selectByBox(rawInstancesRef.current, { + x1: Math.min(s.x, c.x), + y1: Math.min(s.y, c.y), + x2: Math.max(s.x, c.x), + y2: Math.max(s.y, c.y), + }) + ); } - // ------------------------------------------------------------------------- - // Image loading & inference - // ------------------------------------------------------------------------- + async function runTextPrompt() { + const instances = rawInstancesRef.current; + if ( + !textPrompt.trim() || + instances.length === 0 || + !skiaSource || + !clipImage.isReady || + !clipText.isReady || + textBusy + ) { + return; + } + setTextBusy(true); + try { + if (!instanceEmbeddingsRef.current) { + setEmbeddingProgress({ done: 0, total: instances.length }); + const embeddings: Float32Array[] = []; + for (let i = 0; i < instances.length; i++) { + embeddings.push( + await cropAndEmbed( + skiaSource, + instances[i]!.bbox, + clipImage.forward + ) + ); + setEmbeddingProgress({ done: i + 1, total: instances.length }); + } + instanceEmbeddingsRef.current = embeddings; + setEmbeddingProgress(null); + } + const textEmb = await clipText.forward(textPrompt); + applyMatch( + selectByText(instances, instanceEmbeddingsRef.current, textEmb) + ); + } catch (e) { + console.error(e); + } finally { + setTextBusy(false); + } + } const handleCameraPress = async (isCamera: boolean) => { const image = await getImage(isCamera); @@ -188,71 +212,30 @@ export default function FastSAMScreen() { setImageUri(image.uri); setImageSize({ width: image.width ?? 
0, height: image.height ?? 0 }); rawInstancesRef.current = []; - setSelection(null); + instanceEmbeddingsRef.current = null; + setSelection([]); setInferenceTime(null); }; const runForward = async () => { if (!imageUri) return; try { - const t0 = Date.now(); + const start = Date.now(); const output = await forward(imageUri, { confidenceThreshold: 0.4, iouThreshold: 0.9, maxInstances: 50, returnMaskAtOriginalResolution: true, }); - const inferenceMs = Date.now() - t0; - console.log( - `[FastSAM] forward(): ${inferenceMs}ms, instances: ${output.length}` - ); - setInferenceTime(inferenceMs); + setInferenceTime(Date.now() - start); rawInstancesRef.current = output; - setSelection(null); + instanceEmbeddingsRef.current = null; + setSelection([]); } catch (e) { console.error(e); } }; - // ------------------------------------------------------------------------- - // Cutout rendering - // ------------------------------------------------------------------------- - - const skiaSource = useImage(imageUri || null); - - const alphaMask = useMemo(() => { - if (!selection) return null; - const t0 = Date.now(); - const mask = buildAlphaMask( - selection.mask, - selection.maskWidth, - selection.maskHeight, - selection.bbox.x1, - selection.bbox.y1, - imageSize.width, - imageSize.height - ); - console.log(`[FastSAM] buildAlphaMask(): ${Date.now() - t0}ms`); - return mask; - }, [selection, imageSize]); - - const { width: cw, height: ch } = cutoutLayout; - const { width: iw, height: ih } = imageSize; - const cutoutScale = - cw > 0 && ch > 0 && iw > 0 && ih > 0 ? Math.min(cw / iw, ch / ih) : 1; - const cutoutOffsetX = (cw - iw * cutoutScale) / 2; - const cutoutOffsetY = (ch - ih * cutoutScale) / 2; - - // Draft box overlay coords (source box) - const { width: scw, height: sch } = sourceLayoutRef.current; - const srcScale = iw > 0 && ih > 0 ? Math.min(scw / iw, sch / ih) : 1; - const srcOffsetX = (scw - iw * srcScale) / 2; - const srcOffsetY = (sch - ih * srcScale) / 2; - - // ------------------------------------------------------------------------- - // Error / loading - // ------------------------------------------------------------------------- - if (!isReady && error) { return ( @@ -268,137 +251,141 @@ export default function FastSAMScreen() { return ( ); } + const { width: cw, height: ch } = layoutRef.current; + const { width: iw, height: ih } = imageSize; + const drawScale = iw > 0 && ih > 0 ? Math.min(cw / iw, ch / ih) : 1; + const offsetX = (cw - iw * drawScale) / 2; + const offsetY = (ch - ih * drawScale) / 2; + + const stepHint = !imageUri + ? null + : inferenceTime === null + ? 'Tap Run to detect instances' + : rawInstancesRef.current.length === 0 + ? 'No instances detected — try another image' + : selection.length === 0 + ? 
'Tap a point, draw a box, or describe an object' + : null; + return ( - {/* ---- Source image box ---- */} - { - const { width, height } = e.nativeEvent.layout; - sourceLayoutRef.current = { width, height }; - }} - onTouchStart={(e) => { - if (mode === 'point') handleTap(e); - else handleBoxStart(e); - }} - onTouchMove={(e) => { - if (mode === 'box') handleBoxMove(e); - }} - onTouchEnd={(e) => { - if (mode === 'box') handleBoxEnd(e); - }} - > - - {!imageUri && ( - - Load an image to get started - - )} - {/* Draft box */} - {draftBox && iw > 0 && ( - - + + { + layoutRef.current = { + width: e.nativeEvent.layout.width, + height: e.nativeEvent.layout.height, + }; + }} + onTouchStart={(e) => { + if (mode === 'point') handleTap(e); + else if (mode === 'box') handleBoxStart(e); + }} + onTouchMove={handleBoxMove} + onTouchEnd={handleBoxEnd} + > + - - )} - - - {/* ---- Cutout box ---- */} - { - const { width, height } = e.nativeEvent.layout; - cutoutLayoutRef.current = { width, height }; - setCutoutLayout({ width, height }); - }} - > - {selection && skiaSource && alphaMask ? ( - - - - - - - - ) : ( - - - {rawInstancesRef.current.length > 0 - ? 'Tap or draw a box on the image above' - : imageUri - ? 'Run inference first' - : ''} - + {draftBox && iw > 0 && ( + + + + )} - )} + {!imageUri && ( + + Segment Anything + + Segment any object in an image. (1) Pick an image, (2) tap Run + to detect instances, (3) tap a point, draw a box, or describe an + object to segment it. + + + )} + - {/* ---- Controls ---- */} - - - setMode('point')} - > - {stepHint}} + + + {(['point', 'box', 'text'] as PromptMode[]).map((m) => { + const promptDisabled = rawInstancesRef.current.length === 0; + return ( + setMode(m)} + disabled={promptDisabled} > - Point - - + + {m[0]!.toUpperCase() + m.slice(1)} + + + ); + })} + + + {mode === 'text' && ( + + setMode('box')} + style={[styles.textBtn, textBusy && styles.textBtnDisabled]} + onPress={runTextPrompt} + disabled={ + !textPrompt.trim() || + textBusy || + rawInstancesRef.current.length === 0 || + !clipImage.isReady || + !clipText.isReady + } > - - Box - + {textBusy ? '…' : 'Find'} - + )} + {mode === 'text' && embeddingProgress && ( + + Embedding instances {embeddingProgress.done}/{embeddingProgress.total}{' '} + (subsequent text queries are instant) + + )} { setSelectedModel(m); rawInstancesRef.current = []; - setSelection(null); + instanceEmbeddingsRef.current = null; + setSelection([]); setInferenceTime(null); }} /> @@ -431,127 +419,100 @@ export default function FastSAMScreen() { ); } -// --------------------------------------------------------------------------- -// Helpers -// --------------------------------------------------------------------------- - -// Builds a full-image alpha mask. `mask` is bbox-relative (maskWidth × maskHeight), -// positioned at (bboxX1, bboxY1) within an image of size (imgW × imgH). 
-function buildAlphaMask( - mask: Uint8Array, - maskWidth: number, - maskHeight: number, - bboxX1: number, - bboxY1: number, - imgW: number, - imgH: number -) { - const MAX_DIM = 256; - const ds = Math.min(1, MAX_DIM / Math.max(imgW, imgH)); - const dstW = Math.max(1, Math.round(imgW * ds)); - const dstH = Math.max(1, Math.round(imgH * ds)); - - const pixels = new Uint8Array(dstW * dstH * 4); - - // Place the bbox-relative mask into the full-image canvas - const offX = Math.round(bboxX1 * ds); - const offY = Math.round(bboxY1 * ds); - const scaledMaskW = Math.max(1, Math.round(maskWidth * ds)); - const scaledMaskH = Math.max(1, Math.round(maskHeight * ds)); - - for (let dy = 0; dy < scaledMaskH; dy++) { - const sy = Math.min( - Math.floor((dy / scaledMaskH) * maskHeight), - maskHeight - 1 - ); - for (let dx = 0; dx < scaledMaskW; dx++) { - const sx = Math.min( - Math.floor((dx / scaledMaskW) * maskWidth), - maskWidth - 1 - ); - if (mask[sy * maskWidth + sx] > 0) { - const imgX = offX + dx; - const imgY = offY + dy; - if (imgX >= 0 && imgX < dstW && imgY >= 0 && imgY < dstH) { - const i = (imgY * dstW + imgX) * 4; - pixels[i] = 255; - pixels[i + 1] = 255; - pixels[i + 2] = 255; - pixels[i + 3] = 255; - } - } - } - } - - const data = Skia.Data.fromBytes(pixels); - const img = Skia.Image.MakeImage( +async function cropAndEmbed( + image: SkImage, + bbox: Bbox, + forward: (input: string) => Promise +): Promise { + const w = Math.max(1, Math.round(bbox.x2 - bbox.x1)); + const h = Math.max(1, Math.round(bbox.y2 - bbox.y1)); + const surface = Skia.Surface.MakeOffscreen(w, h); + if (!surface) throw new Error('Failed to create offscreen Skia surface'); + surface.getCanvas().drawImageRect( + image, { - width: dstW, - height: dstH, - alphaType: AlphaType.Premul, - colorType: ColorType.RGBA_8888, + x: bbox.x1, + y: bbox.y1, + width: bbox.x2 - bbox.x1, + height: bbox.y2 - bbox.y1, }, - data, - dstW * 4 + { x: 0, y: 0, width: w, height: h }, + Skia.Paint() ); - data.dispose(); - return img; + const base64 = surface.makeImageSnapshot().encodeToBase64(); + return forward(`data:image/png;base64,${base64}`); } -// --------------------------------------------------------------------------- -// Styles -// --------------------------------------------------------------------------- - const styles = StyleSheet.create({ - imageBox: { - flex: 1, - width: '100%', - borderBottomWidth: 1, - borderBottomColor: '#e0e0e0', - }, - image: { - width: '100%', - height: '100%', + container: { flex: 6, width: '100%' }, + imageContainer: { flex: 1, width: '100%', padding: 16 }, + imageTouchArea: { flex: 1, position: 'relative' }, + infoContainer: { alignItems: 'center', padding: 16, gap: 8 }, + infoTitle: { fontSize: 18, fontWeight: '600', color: 'navy' }, + infoText: { + fontSize: 14, + color: '#555', + textAlign: 'center', + lineHeight: 20, }, - hint: { - ...StyleSheet.absoluteFillObject, + modeRow: { + flexDirection: 'row', justifyContent: 'center', - alignItems: 'center', + paddingVertical: 8, + gap: 8, }, - hintText: { - fontSize: 14, - color: '#aaa', + modeBtn: { + paddingHorizontal: 18, + paddingVertical: 8, + borderRadius: 8, + borderWidth: 1, + borderColor: ColorPalette.primary, + backgroundColor: '#fff', }, - controls: { + modeBtnActive: { backgroundColor: ColorPalette.primary }, + modeBtnDisabled: { borderColor: '#cbd5e1', backgroundColor: '#f8fafc' }, + modeBtnText: { fontSize: 14, fontWeight: '600', color: ColorPalette.primary }, + modeBtnTextActive: { color: '#fff' }, + modeBtnTextDisabled: { color: '#cbd5e1' }, + 
textRow: { flexDirection: 'row', alignItems: 'center', paddingHorizontal: 16, - paddingVertical: 10, - borderTopWidth: 1, - borderTopColor: '#e0e0e0', + paddingBottom: 8, + gap: 8, }, - modeToggle: { - flexDirection: 'row', - borderRadius: 8, - overflow: 'hidden', + textInput: { + flex: 1, + backgroundColor: '#fff', borderWidth: 1, borderColor: ColorPalette.primary, + borderRadius: 12, + paddingHorizontal: 14, + paddingVertical: 12, + fontSize: 16, + color: '#0f172a', }, - modeBtn: { + textBtn: { + backgroundColor: ColorPalette.primary, + borderRadius: 12, paddingHorizontal: 20, - paddingVertical: 8, - backgroundColor: '#fff', + paddingVertical: 14, }, - modeBtnActive: { - backgroundColor: ColorPalette.primary, + textBtnDisabled: { backgroundColor: '#cbd5e1' }, + textBtnLabel: { color: '#fff', fontWeight: '700', fontSize: 16 }, + statusLine: { + paddingHorizontal: 16, + paddingBottom: 6, + fontSize: 12, + color: '#64748b', }, - modeBtnText: { - fontSize: 14, - fontWeight: '600', + stepHint: { + paddingHorizontal: 16, + paddingTop: 6, + fontSize: 13, + fontWeight: '500', color: ColorPalette.primary, - }, - modeBtnTextActive: { - color: '#fff', + textAlign: 'center', }, errorContainer: { flex: 1, @@ -565,9 +526,5 @@ const styles = StyleSheet.create({ color: '#e74c3c', marginBottom: 12, }, - errorText: { - fontSize: 14, - color: '#555', - textAlign: 'center', - }, + errorText: { fontSize: 14, color: '#555', textAlign: 'center' }, }); diff --git a/apps/computer-vision/app/index.tsx b/apps/computer-vision/app/index.tsx index 690ebfb331..f7c1dae5b7 100644 --- a/apps/computer-vision/app/index.tsx +++ b/apps/computer-vision/app/index.tsx @@ -51,7 +51,7 @@ export default function Home() { style={styles.button} onPress={() => router.navigate('fast_sam/')} > - FastSAM + Segment Anything ( function bboxArea(bbox: Bbox): number { return Math.max(bbox.x2 - bbox.x1, 0) * Math.max(bbox.y2 - bbox.y1, 0); } + +/** + * Selects the best matching instance for a text prompt. + * + * Returns the instance whose image embedding has the highest cosine similarity + * with the text embedding. The caller is responsible for producing the + * embeddings (e.g. with CLIP) and passing them in the same order as + * `instances`; embeddings do not need to be pre-normalized. + * @param instances - Array of segmented instances returned by `forward()`. + * @param instanceEmbeddings - Image embedding for each instance, in the same order as `instances`. + * @param textEmbedding - Embedding of the text prompt. + * @returns The best matching instance, or `null` if `instances` is empty. 
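+ * @example
+ * A minimal sketch, assuming CLIP hooks embed one bbox crop per instance
+ * (`cropToBbox` is a hypothetical helper; see the example app for a Skia-based one):
+ * ```ts
+ * const instanceEmbeddings = await Promise.all(
+ *   instances.map((i) => clipImage.forward(cropToBbox(imageUri, i.bbox)))
+ * );
+ * const textEmb = await clipText.forward('a red backpack');
+ * const match = selectByText(instances, instanceEmbeddings, textEmb);
+ * ```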
+ */ +export function selectByText( + instances: SegmentedInstance[], + instanceEmbeddings: Float32Array[], + textEmbedding: Float32Array +): SegmentedInstance | null { + if (instances.length === 0) return null; + if (instances.length !== instanceEmbeddings.length) { + throw new Error( + `selectByText: instances (${instances.length}) and instanceEmbeddings (${instanceEmbeddings.length}) must have the same length` + ); + } + + let textNormSq = 0; + for (let i = 0; i < textEmbedding.length; i++) { + const v = textEmbedding[i]!; + textNormSq += v * v; + } + const textNorm = Math.sqrt(textNormSq); + + let bestIdx = 0; + let bestScore = -Infinity; + for (let i = 0; i < instances.length; i++) { + const emb = instanceEmbeddings[i]!; + const n = Math.min(emb.length, textEmbedding.length); + let dot = 0; + let embNormSq = 0; + for (let j = 0; j < n; j++) { + const a = emb[j]!; + dot += a * textEmbedding[j]!; + embNormSq += a * a; + } + const score = dot / (Math.sqrt(embNormSq) * textNorm + 1e-7); + if (score > bestScore) { + bestScore = score; + bestIdx = i; + } + } + return instances[bestIdx]!; +} From c28dc716dfdffd1aee4cd0c2827f8d48884f240c Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Wed, 6 May 2026 23:37:22 +0200 Subject: [PATCH 05/18] docs: add initial docs generated --- .cspell-wordlist.txt | 1 + .../02-computer-vision/segment-anything.md | 143 ++++++++++++++++++ .../useInstanceSegmentation.md | 6 + 3 files changed, 150 insertions(+) create mode 100644 docs/docs/03-hooks/02-computer-vision/segment-anything.md diff --git a/.cspell-wordlist.txt b/.cspell-wordlist.txt index cbd5f6d67d..b9809a8734 100644 --- a/.cspell-wordlist.txt +++ b/.cspell-wordlist.txt @@ -204,3 +204,4 @@ Fishjam deinitialize Deinitialize fastsam +promptable diff --git a/docs/docs/03-hooks/02-computer-vision/segment-anything.md b/docs/docs/03-hooks/02-computer-vision/segment-anything.md new file mode 100644 index 0000000000..8cf974d034 --- /dev/null +++ b/docs/docs/03-hooks/02-computer-vision/segment-anything.md @@ -0,0 +1,143 @@ +--- +title: Segment Anything with FastSAM +--- + +[FastSAM](https://github.com/CASIA-IVA-Lab/FastSAM) is a class-agnostic, promptable segmentation model. Unlike YOLO or RF-DETR (which return labelled detections), FastSAM segments **every** instance in an image without classifying them — you then pick the one you want with a point, box, or text prompt. + +`FASTSAM_S` and `FASTSAM_X` are loaded with the regular [`useInstanceSegmentation`](./useInstanceSegmentation.md) hook. `react-native-executorch` ships three small selector utilities to pick an instance from the hook's output: `selectByPoint`, `selectByBox`, and `selectByText`. + +## API Reference + +- [`selectByPoint` API Reference](../../06-api-reference/functions/selectByPoint.md) +- [`selectByBox` API Reference](../../06-api-reference/functions/selectByBox.md) +- [`selectByText` API Reference](../../06-api-reference/functions/selectByText.md) + +## High Level Overview + +The workflow has three steps: + +1. Load `FASTSAM_S` (or `FASTSAM_X`) with `useInstanceSegmentation`. +2. Run `forward(image)` once — the result is every detected instance. +3. Use a selector to pick the one matching the user's prompt. Re-run a selector when the prompt changes; you don't need to call `forward` again. 
+ +```typescript +import { + useInstanceSegmentation, + selectByPoint, + selectByBox, + selectByText, + FASTSAM_S, +} from 'react-native-executorch'; + +const model = useInstanceSegmentation({ model: FASTSAM_S }); + +try { + const instances = await model.forward(imageUri); + + // Point: the smallest instance whose mask covers (x, y). + const a = selectByPoint(instances, x, y); + console.log('point match:', a?.bbox); + + // Box: the instance with highest IoU with the prompt box. + const b = selectByBox(instances, { x1, y1, x2, y2 }); + console.log('box match:', b?.bbox); + + // Text: highest cosine similarity between text and per-instance image + // embeddings (you must provide the embeddings, e.g. with CLIP). + const c = selectByText(instances, instanceEmbeddings, textEmbedding); + console.log('text match:', c?.bbox); +} catch (error) { + console.error(error); +} +``` + +The hook output is typed as [`SegmentedInstance`](../../06-api-reference/interfaces/SegmentedInstance.md). FastSAM emits a single label, [`FastSAMLabel.OBJECT`](../../06-api-reference/enumerations/FastSAMLabel.md) (`'OBJECT' = 0`). + +## Selecting by point + +`selectByPoint` returns the instance whose mask covers the point `(x, y)`. When several instances overlap (e.g. a small object inside a larger one), the one with the smallest bounding box wins; ties are broken by confidence. Returns `null` if no mask covers the point. + +It accepts three arguments: + +- `instances` (required) - The array of [`SegmentedInstance`](../../06-api-reference/interfaces/SegmentedInstance.md) returned by `forward()`. +- `x` (required) - X coordinate of the prompt point, in the **original image's** pixel space. +- `y` (required) - Y coordinate of the prompt point, in the **original image's** pixel space. + +:::info +`returnMaskAtOriginalResolution: true` (the default) is required for `selectByPoint` — masks must be in original image coordinates so they align with the touch coordinates passed in. +::: + +## Selecting by box + +`selectByBox` returns the instance with the highest IoU with the prompt box. Useful for "draw a box around what you want" UX. Returns `null` if no instance overlaps. + +It accepts two arguments: + +- `instances` (required) - The array of [`SegmentedInstance`](../../06-api-reference/interfaces/SegmentedInstance.md) returned by `forward()`. +- `box` (required) - A [`Bbox`](../../06-api-reference/interfaces/Bbox.md) (`{ x1, y1, x2, y2 }`) in the original image's pixel space. + +## Selecting by text + +`selectByText` returns the instance whose image embedding has the highest cosine similarity with the text embedding. The caller produces the embeddings — typically by cropping each instance's bbox and running [CLIP](./useImageEmbeddings.md) image encoder, plus running the [CLIP text encoder](../01-natural-language-processing/useTextEmbeddings.md) on the prompt. + +It accepts three arguments: + +- `instances` (required) - The array of [`SegmentedInstance`](../../06-api-reference/interfaces/SegmentedInstance.md) returned by `forward()`. +- `instanceEmbeddings` (required) - One `Float32Array` per instance, in the same order as `instances`. Throws if lengths differ. +- `textEmbedding` (required) - A `Float32Array` for the text prompt. + +Embeddings do not need to be pre-normalized. Returns `null` only when `instances` is empty. 
+ +### Example with CLIP + +```typescript +import { + useInstanceSegmentation, + useImageEmbeddings, + useTextEmbeddings, + selectByText, + FASTSAM_S, + CLIP_VIT_BASE_PATCH32_IMAGE_QUANTIZED, + CLIP_VIT_BASE_PATCH32_TEXT, +} from 'react-native-executorch'; + +function App() { + const sam = useInstanceSegmentation({ model: FASTSAM_S }); + const clipImage = useImageEmbeddings({ + model: CLIP_VIT_BASE_PATCH32_IMAGE_QUANTIZED, + }); + const clipText = useTextEmbeddings({ model: CLIP_VIT_BASE_PATCH32_TEXT }); + + const handlePrompt = async (imageUri: string, prompt: string) => { + if (!sam.isReady || !clipImage.isReady || !clipText.isReady) return; + + try { + const instances = await sam.forward(imageUri); + + // Embed each instance's bbox crop. Cropping is your responsibility — + // any image manipulator (e.g. expo-image-manipulator) works. + const instanceEmbeddings = await Promise.all( + instances.map((inst) => + clipImage.forward(cropToBbox(imageUri, inst.bbox)) + ) + ); + + const textEmb = await clipText.forward(prompt); + const match = selectByText(instances, instanceEmbeddings, textEmb); + console.log('match:', match?.bbox, match?.score); + } catch (error) { + console.error(error); + } + }; + + // ... +} +``` + +:::tip +Embedding all instances is the slow part of text prompts (one CLIP forward per instance). Cache `instanceEmbeddings` and reuse them across multiple text queries on the same image; only invalidate when you call `sam.forward` again. +::: + +## Example app + +The [`computer-vision`](https://github.com/software-mansion/react-native-executorch/tree/main/apps/computer-vision/app/fast_sam) example contains a working "Segment Anything" screen with all three prompt modes wired up. diff --git a/docs/docs/03-hooks/02-computer-vision/useInstanceSegmentation.md b/docs/docs/03-hooks/02-computer-vision/useInstanceSegmentation.md index 14e2ff8478..6b502348d9 100644 --- a/docs/docs/03-hooks/02-computer-vision/useInstanceSegmentation.md +++ b/docs/docs/03-hooks/02-computer-vision/useInstanceSegmentation.md @@ -132,3 +132,9 @@ YOLO models use the [`CocoLabelYolo`](../../06-api-reference/enumerations/CocoLa | yolo26l-seg | 80 | [COCO (YOLO)](../../06-api-reference/enumerations/CocoLabelYolo.md) | 384, 512, 640 | | yolo26x-seg | 80 | [COCO (YOLO)](../../06-api-reference/enumerations/CocoLabelYolo.md) | 384, 512, 640 | | rfdetr-nano-seg | 91 | [COCO](../../06-api-reference/enumerations/CocoLabel.md) | 312 (fixed) | +| fastsam-s | 1 | [FastSAMLabel](../../06-api-reference/enumerations/FastSAMLabel.md) | 640 (fixed) | +| fastsam-x | 1 | [FastSAMLabel](../../06-api-reference/enumerations/FastSAMLabel.md) | 640 (fixed) | + +:::tip +FastSAM models are class-agnostic — they segment every instance without classifying. To pick a specific instance from the output, use the [point/box/text selectors](./segment-anything.md). 
+::: From 52a035a9dd9b3c8a8aa1d45121de1d5a7db95345 Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Thu, 7 May 2026 00:58:33 +0200 Subject: [PATCH 06/18] feat: rename FastSAM screen and update documentation links to Segment Anything --- apps/computer-vision/app/_layout.tsx | 2 +- apps/computer-vision/app/index.tsx | 2 +- .../app/{fast_sam => segment_anything}/index.tsx | 0 docs/docs/03-hooks/02-computer-vision/segment-anything.md | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename apps/computer-vision/app/{fast_sam => segment_anything}/index.tsx (100%) diff --git a/apps/computer-vision/app/_layout.tsx b/apps/computer-vision/app/_layout.tsx index bb7ab7165e..a4868f92ae 100644 --- a/apps/computer-vision/app/_layout.tsx +++ b/apps/computer-vision/app/_layout.tsx @@ -190,7 +190,7 @@ export default function _layout() { }} /> router.navigate('fast_sam/')} + onPress={() => router.navigate('segment_anything/')} > Segment Anything diff --git a/apps/computer-vision/app/fast_sam/index.tsx b/apps/computer-vision/app/segment_anything/index.tsx similarity index 100% rename from apps/computer-vision/app/fast_sam/index.tsx rename to apps/computer-vision/app/segment_anything/index.tsx diff --git a/docs/docs/03-hooks/02-computer-vision/segment-anything.md b/docs/docs/03-hooks/02-computer-vision/segment-anything.md index 8cf974d034..50a7bd9536 100644 --- a/docs/docs/03-hooks/02-computer-vision/segment-anything.md +++ b/docs/docs/03-hooks/02-computer-vision/segment-anything.md @@ -140,4 +140,4 @@ Embedding all instances is the slow part of text prompts (one CLIP forward per i ## Example app -The [`computer-vision`](https://github.com/software-mansion/react-native-executorch/tree/main/apps/computer-vision/app/fast_sam) example contains a working "Segment Anything" screen with all three prompt modes wired up. +The [`computer-vision`](https://github.com/software-mansion/react-native-executorch/tree/main/apps/computer-vision/app/segment_anything) example contains a working "Segment Anything" screen with all three prompt modes wired up. 
From 376aca68a01359297995a9caed259c6383c7a5bc Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Thu, 7 May 2026 12:24:51 +0200 Subject: [PATCH 07/18] Update packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts Co-authored-by: Mateusz Sluszniak <56299341+msluszniak@users.noreply.github.com> --- .../src/utils/segmentAnythingPrompts.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts b/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts index b512bc2f66..b4263c1f36 100644 --- a/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts +++ b/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts @@ -117,8 +117,7 @@ export function selectByText( } let textNormSq = 0; - for (let i = 0; i < textEmbedding.length; i++) { - const v = textEmbedding[i]!; + for (const v of textEmbedding) { textNormSq += v * v; } const textNorm = Math.sqrt(textNormSq); From 20bd4ba5c24f4d1e3e22c2c2a6acfe67f5adf1ec Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Thu, 7 May 2026 16:53:10 +0200 Subject: [PATCH 08/18] feat: add CoreML models --- .../react-native-executorch/src/constants/modelUrls.ts | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts index 6c24934176..387dfdc3d8 100644 --- a/packages/react-native-executorch/src/constants/modelUrls.ts +++ b/packages/react-native-executorch/src/constants/modelUrls.ts @@ -1011,8 +1011,14 @@ export const SELFIE_SEGMENTATION = { } as const; // FastSAM Instance Segmentation -const FASTSAM_S_SEG_MODEL = `${URL_PREFIX}-fast-sam/${NEXT_VERSION_TAG}/fastsam-s/xnnpack/fastsam_s_xnnpack_fp32.pte`; -const FASTSAM_X_SEG_MODEL = `${URL_PREFIX}-fast-sam/${NEXT_VERSION_TAG}/fastsam-x/xnnpack/fastsam_x_xnnpack_fp32.pte`; +const FASTSAM_S_SEG_MODEL = + Platform.OS === 'ios' + ? `${URL_PREFIX}-fast-sam/${NEXT_VERSION_TAG}/fastsam-s/coreml/fastsam_s_coreml_fp16.pte` + : `${URL_PREFIX}-fast-sam/${NEXT_VERSION_TAG}/fastsam-s/xnnpack/fastsam_s_xnnpack_fp32.pte`; +const FASTSAM_X_SEG_MODEL = + Platform.OS === 'ios' + ? `${URL_PREFIX}-fast-sam/${NEXT_VERSION_TAG}/fastsam-x/coreml/fastsam_x_coreml_fp16.pte` + : `${URL_PREFIX}-fast-sam/${NEXT_VERSION_TAG}/fastsam-x/xnnpack/fastsam_x_xnnpack_fp32.pte`; /** * @category Models - Instance Segmentation From 16d6a0a1cf4f329da2718cd71738d6fe37e90fa6 Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Thu, 7 May 2026 16:53:39 +0200 Subject: [PATCH 09/18] fix: small fixes in segment anything example app --- .../app/segment_anything/index.tsx | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/apps/computer-vision/app/segment_anything/index.tsx b/apps/computer-vision/app/segment_anything/index.tsx index 4118bf3419..05cc14b558 100644 --- a/apps/computer-vision/app/segment_anything/index.tsx +++ b/apps/computer-vision/app/segment_anything/index.tsx @@ -363,21 +363,23 @@ export default function SegmentAnythingScreen() { onChangeText={setTextPrompt} onSubmitEditing={runTextPrompt} returnKeyType="search" - editable={!textBusy} /> - { + const findInactive = !textPrompt.trim() || - textBusy || rawInstancesRef.current.length === 0 || !clipImage.isReady || - !clipText.isReady - } - > - {textBusy ? 
'…' : 'Find'} - + !clipText.isReady; + return ( + + Find + + ); + })()} )} {mode === 'text' && embeddingProgress && ( @@ -392,6 +394,7 @@ export default function SegmentAnythingScreen() { selectedModel={selectedModel} disabled={isGenerating} onSelect={(m) => { + if (m.modelName === selectedModel.modelName) return; setSelectedModel(m); rawInstancesRef.current = []; instanceEmbeddingsRef.current = null; @@ -495,8 +498,9 @@ const styles = StyleSheet.create({ textBtn: { backgroundColor: ColorPalette.primary, borderRadius: 12, - paddingHorizontal: 20, paddingVertical: 14, + width: 80, + alignItems: 'center', }, textBtnDisabled: { backgroundColor: '#cbd5e1' }, textBtnLabel: { color: '#fff', fontWeight: '700', fontSize: 16 }, From f1202a5fa8d428a92d499a8d896feac8a370b0cd Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Thu, 7 May 2026 16:54:16 +0200 Subject: [PATCH 10/18] feat: add FastSAM to vision camera --- .../app/vision_camera/index.tsx | 6 +++++ .../tasks/InstanceSegmentationTask.tsx | 26 ++++++++++++++++--- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/apps/computer-vision/app/vision_camera/index.tsx b/apps/computer-vision/app/vision_camera/index.tsx index 4020d20023..7a399f443f 100644 --- a/apps/computer-vision/app/vision_camera/index.tsx +++ b/apps/computer-vision/app/vision_camera/index.tsx @@ -54,6 +54,8 @@ type ModelId = | 'segmentationSelfie' | 'instanceSegmentationYolo26n' | 'instanceSegmentationRfdetr' + | 'instanceSegmentationFastsamS' + | 'instanceSegmentationFastsamX' | 'poseEstimationYolo26n' | 'ocr' | 'styleTransferCandy' @@ -87,6 +89,8 @@ const TASKS: Task[] = [ variants: [ { id: 'instanceSegmentationYolo26n', label: 'YOLO26N Seg' }, { id: 'instanceSegmentationRfdetr', label: 'RF-DETR Nano Seg' }, + { id: 'instanceSegmentationFastsamS', label: 'FastSAM-S' }, + { id: 'instanceSegmentationFastsamX', label: 'FastSAM-X' }, ], }, { @@ -284,6 +288,8 @@ export default function VisionCameraScreen() { activeModel as | 'instanceSegmentationYolo26n' | 'instanceSegmentationRfdetr' + | 'instanceSegmentationFastsamS' + | 'instanceSegmentationFastsamX' } /> )} diff --git a/apps/computer-vision/components/vision_camera/tasks/InstanceSegmentationTask.tsx b/apps/computer-vision/components/vision_camera/tasks/InstanceSegmentationTask.tsx index 8bcdfb3844..51f892a0c7 100644 --- a/apps/computer-vision/components/vision_camera/tasks/InstanceSegmentationTask.tsx +++ b/apps/computer-vision/components/vision_camera/tasks/InstanceSegmentationTask.tsx @@ -6,9 +6,12 @@ import { SegmentedInstance, YOLO26N_SEG, RF_DETR_NANO_SEG, + FASTSAM_S, + FASTSAM_X, useInstanceSegmentation, CocoLabel, CocoLabelYolo, + FastSAMLabel, } from 'react-native-executorch'; import { Canvas, Image as SkiaImage } from '@shopify/react-native-skia'; import { labelColor, labelColorBg } from '../../utils/colors'; @@ -20,7 +23,9 @@ import { type InstSegModelId = | 'instanceSegmentationYolo26n' - | 'instanceSegmentationRfdetr'; + | 'instanceSegmentationRfdetr' + | 'instanceSegmentationFastsamS' + | 'instanceSegmentationFastsamX'; type Props = TaskProps & { activeModel: InstSegModelId }; @@ -44,9 +49,23 @@ export default function InstanceSegmentationTask({ model: RF_DETR_NANO_SEG, preventLoad: activeModel !== 'instanceSegmentationRfdetr', }); + const fastsamS = useInstanceSegmentation({ + model: FASTSAM_S, + preventLoad: activeModel !== 'instanceSegmentationFastsamS', + }); + const fastsamX = useInstanceSegmentation({ + model: FASTSAM_X, + preventLoad: activeModel !== 'instanceSegmentationFastsamX', + }); const 
active = - activeModel === 'instanceSegmentationYolo26n' ? yolo26n : rfdetr; + activeModel === 'instanceSegmentationYolo26n' + ? yolo26n + : activeModel === 'instanceSegmentationRfdetr' + ? rfdetr + : activeModel === 'instanceSegmentationFastsamS' + ? fastsamS + : fastsamX; const [instances, setInstances] = useState([]); const [imageSize, setImageSize] = useState({ width: 1, height: 1 }); @@ -74,7 +93,8 @@ export default function InstanceSegmentationTask({ (p: { results: | SegmentedInstance[] - | SegmentedInstance[]; + | SegmentedInstance[] + | SegmentedInstance[]; imageWidth: number; imageHeight: number; }) => { From decff98f4d1ef3441f9d7798e150a172ce4c79c2 Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Fri, 8 May 2026 12:09:43 +0200 Subject: [PATCH 11/18] refactor: simplify active model selection in InstanceSegmentationTask --- .../tasks/InstanceSegmentationTask.tsx | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/apps/computer-vision/components/vision_camera/tasks/InstanceSegmentationTask.tsx b/apps/computer-vision/components/vision_camera/tasks/InstanceSegmentationTask.tsx index 51f892a0c7..52251f6e3e 100644 --- a/apps/computer-vision/components/vision_camera/tasks/InstanceSegmentationTask.tsx +++ b/apps/computer-vision/components/vision_camera/tasks/InstanceSegmentationTask.tsx @@ -58,14 +58,12 @@ export default function InstanceSegmentationTask({ preventLoad: activeModel !== 'instanceSegmentationFastsamX', }); - const active = - activeModel === 'instanceSegmentationYolo26n' - ? yolo26n - : activeModel === 'instanceSegmentationRfdetr' - ? rfdetr - : activeModel === 'instanceSegmentationFastsamS' - ? fastsamS - : fastsamX; + const active = { + instanceSegmentationYolo26n: yolo26n, + instanceSegmentationRfdetr: rfdetr, + instanceSegmentationFastsamS: fastsamS, + instanceSegmentationFastsamX: fastsamX, + }[activeModel]; const [instances, setInstances] = useState([]); const [imageSize, setImageSize] = useState({ width: 1, height: 1 }); From 324e8d3c9961776dfbaa09ca1cd34dcd0f6206e6 Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Fri, 8 May 2026 12:39:18 +0200 Subject: [PATCH 12/18] fix: fix keyboard handling and layout for SegmentAnythingScreen --- .../app/segment_anything/index.tsx | 299 ++++++++++-------- 1 file changed, 162 insertions(+), 137 deletions(-) diff --git a/apps/computer-vision/app/segment_anything/index.tsx b/apps/computer-vision/app/segment_anything/index.tsx index 05cc14b558..d2b85b5463 100644 --- a/apps/computer-vision/app/segment_anything/index.tsx +++ b/apps/computer-vision/app/segment_anything/index.tsx @@ -5,7 +5,11 @@ import { Text, TextInput, TouchableOpacity, + TouchableWithoutFeedback, GestureResponderEvent, + Keyboard, + KeyboardAvoidingView, + Platform, } from 'react-native'; import { Canvas, @@ -166,6 +170,7 @@ export default function SegmentAnythingScreen() { } async function runTextPrompt() { + Keyboard.dismiss(); const instances = rawInstancesRef.current; if ( !textPrompt.trim() || @@ -207,6 +212,7 @@ export default function SegmentAnythingScreen() { } const handleCameraPress = async (isCamera: boolean) => { + Keyboard.dismiss(); const image = await getImage(isCamera); if (!image?.uri) return; setImageUri(image.uri); @@ -218,6 +224,7 @@ export default function SegmentAnythingScreen() { }; const runForward = async () => { + Keyboard.dismiss(); if (!imageUri) return; try { const start = Date.now(); @@ -274,150 +281,167 @@ export default function SegmentAnythingScreen() { return ( - - - { - layoutRef.current = { - width: 
e.nativeEvent.layout.width, - height: e.nativeEvent.layout.height, - }; - }} - onTouchStart={(e) => { - if (mode === 'point') handleTap(e); - else if (mode === 'box') handleBoxStart(e); - }} - onTouchMove={handleBoxMove} - onTouchEnd={handleBoxEnd} - > - - {draftBox && iw > 0 && ( - - + + + + { + layoutRef.current = { + width: e.nativeEvent.layout.width, + height: e.nativeEvent.layout.height, + }; + }} + onTouchStart={(e) => { + Keyboard.dismiss(); + if (mode === 'point') handleTap(e); + else if (mode === 'box') handleBoxStart(e); + }} + onTouchMove={handleBoxMove} + onTouchEnd={handleBoxEnd} + > + - - )} + {draftBox && iw > 0 && ( + + + + )} + + {!imageUri && ( + + Segment Anything + + Segment any object in an image. (1) Pick an image, (2) tap + Run to detect instances, (3) tap a point, draw a box, or + describe an object to segment it. + + + )} + + + + {stepHint && {stepHint}} + + + {(['point', 'box', 'text'] as PromptMode[]).map((m) => { + const promptDisabled = rawInstancesRef.current.length === 0; + return ( + { + if (m !== 'text') Keyboard.dismiss(); + setMode(m); + }} + disabled={promptDisabled} + > + + {m[0]!.toUpperCase() + m.slice(1)} + + + ); + })} - {!imageUri && ( - - Segment Anything - - Segment any object in an image. (1) Pick an image, (2) tap Run - to detect instances, (3) tap a point, draw a box, or describe an - object to segment it. - + + {mode === 'text' && ( + + + {(() => { + const findInactive = + !textPrompt.trim() || + rawInstancesRef.current.length === 0 || + !clipImage.isReady || + !clipText.isReady; + return ( + + Find + + ); + })()} )} - - - - {stepHint && {stepHint}} - - - {(['point', 'box', 'text'] as PromptMode[]).map((m) => { - const promptDisabled = rawInstancesRef.current.length === 0; - return ( - setMode(m)} - disabled={promptDisabled} - > - - {m[0]!.toUpperCase() + m.slice(1)} - - - ); - })} - - - {mode === 'text' && ( - - + Embedding instances {embeddingProgress.done}/ + {embeddingProgress.total} (subsequent text queries are instant) + + )} + + { + if (m.modelName === selectedModel.modelName) return; + setSelectedModel(m); + rawInstancesRef.current = []; + instanceEmbeddingsRef.current = null; + setSelection([]); + setInferenceTime(null); + }} /> - {(() => { - const findInactive = - !textPrompt.trim() || - rawInstancesRef.current.length === 0 || - !clipImage.isReady || - !clipText.isReady; - return ( - - Find - - ); - })()} - - )} - {mode === 'text' && embeddingProgress && ( - - Embedding instances {embeddingProgress.done}/{embeddingProgress.total}{' '} - (subsequent text queries are instant) - - )} - - { - if (m.modelName === selectedModel.modelName) return; - setSelectedModel(m); - rawInstancesRef.current = []; - instanceEmbeddingsRef.current = null; - setSelection([]); - setInferenceTime(null); - }} - /> - 0 - ? rawInstancesRef.current.length - : null - } - /> + 0 + ? 
rawInstancesRef.current.length + : null + } + /> - + + + ); } @@ -447,6 +471,7 @@ async function cropAndEmbed( } const styles = StyleSheet.create({ + flex: { flex: 1 }, container: { flex: 6, width: '100%' }, imageContainer: { flex: 1, width: '100%', padding: 16 }, imageTouchArea: { flex: 1, position: 'relative' }, From 69ce94d34ce1376acca60c1f7376a2852ab3c3aa Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Fri, 8 May 2026 17:05:34 +0200 Subject: [PATCH 13/18] fix: fix cropping logic in SegmentAnythingScreen for text prompts --- .../app/segment_anything/index.tsx | 89 ++++++++++++++++--- .../src/utils/segmentAnythingPrompts.ts | 38 ++++---- 2 files changed, 93 insertions(+), 34 deletions(-) diff --git a/apps/computer-vision/app/segment_anything/index.tsx b/apps/computer-vision/app/segment_anything/index.tsx index d2b85b5463..ae8087bd78 100644 --- a/apps/computer-vision/app/segment_anything/index.tsx +++ b/apps/computer-vision/app/segment_anything/index.tsx @@ -17,6 +17,8 @@ import { Skia, useImage, type SkImage, + ColorType, + AlphaType, } from '@shopify/react-native-skia'; import { useInstanceSegmentation, @@ -188,10 +190,14 @@ export default function SegmentAnythingScreen() { setEmbeddingProgress({ done: 0, total: instances.length }); const embeddings: Float32Array[] = []; for (let i = 0; i < instances.length; i++) { + const inst = instances[i]!; embeddings.push( await cropAndEmbed( skiaSource, - instances[i]!.bbox, + inst.bbox, + inst.mask, + inst.maskWidth, + inst.maskHeight, clipImage.forward ) ); @@ -201,9 +207,12 @@ export default function SegmentAnythingScreen() { setEmbeddingProgress(null); } const textEmb = await clipText.forward(textPrompt); - applyMatch( - selectByText(instances, instanceEmbeddingsRef.current, textEmb) + const match = selectByText( + instances, + instanceEmbeddingsRef.current, + textEmb ); + applyMatch(match); } catch (e) { console.error(e); } finally { @@ -449,24 +458,76 @@ export default function SegmentAnythingScreen() { async function cropAndEmbed( image: SkImage, bbox: Bbox, + mask: Uint8Array, + maskWidth: number, + maskHeight: number, forward: (input: string) => Promise ): Promise { - const w = Math.max(1, Math.round(bbox.x2 - bbox.x1)); - const h = Math.max(1, Math.round(bbox.y2 - bbox.y1)); - const surface = Skia.Surface.MakeOffscreen(w, h); + // FastSAM-style full-image white canvas, but with the mask applied: + // inside the bbox we keep image pixels where mask=1 and overwrite the + // rest with white. CLIP then sees a uniform white scene with only the + // segmented object visible at its original position/size. + const imgW = image.width(); + const imgH = image.height(); + const surface = Skia.Surface.MakeOffscreen(imgW, imgH); if (!surface) throw new Error('Failed to create offscreen Skia surface'); - surface.getCanvas().drawImageRect( - image, + const canvas = surface.getCanvas(); + canvas.clear(Skia.Color('white')); + + const x1 = Math.max(0, Math.round(bbox.x1)); + const y1 = Math.max(0, Math.round(bbox.y1)); + const x2 = Math.min(imgW, Math.round(bbox.x2)); + const y2 = Math.min(imgH, Math.round(bbox.y2)); + const w = x2 - x1; + const h = y2 - y1; + if (w > 0 && h > 0) { + canvas.drawImageRect( + image, + { x: x1, y: y1, width: w, height: h }, + { x: x1, y: y1, width: w, height: h }, + Skia.Paint() + ); + } + + // Inverse mask: opaque white where mask=0, transparent where mask=1. + // Drawn on top within the bbox, it overpaints non-mask pixels with white + // and leaves the segmented object intact. 
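+  // Concretely, with premultiplied alpha: mask=0 becomes RGBA
+  // (255, 255, 255, 255), opaque white; mask=1 becomes RGBA (0, 0, 0, 0),
+  // fully transparent, so the object underneath stays visible.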
+ const inversePixels = new Uint8Array(mask.length * 4); + for (let i = 0; i < mask.length; i++) { + const outside = mask[i]! === 0; + const idx = i * 4; + inversePixels[idx] = outside ? 255 : 0; + inversePixels[idx + 1] = outside ? 255 : 0; + inversePixels[idx + 2] = outside ? 255 : 0; + inversePixels[idx + 3] = outside ? 255 : 0; + } + const inverseData = Skia.Data.fromBytes(inversePixels); + const inverseMaskImg = Skia.Image.MakeImage( { - x: bbox.x1, - y: bbox.y1, - width: bbox.x2 - bbox.x1, - height: bbox.y2 - bbox.y1, + width: maskWidth, + height: maskHeight, + colorType: ColorType.RGBA_8888, + alphaType: AlphaType.Premul, }, - { x: 0, y: 0, width: w, height: h }, - Skia.Paint() + inverseData, + maskWidth * 4 ); + if (inverseMaskImg) { + canvas.drawImageRect( + inverseMaskImg, + { x: 0, y: 0, width: maskWidth, height: maskHeight }, + { + x: bbox.x1, + y: bbox.y1, + width: bbox.x2 - bbox.x1, + height: bbox.y2 - bbox.y1, + }, + Skia.Paint() + ); + } + const base64 = surface.makeImageSnapshot().encodeToBase64(); + inverseData.dispose(); return forward(`data:image/png;base64,${base64}`); } diff --git a/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts b/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts index b4263c1f36..9c8b69ba1f 100644 --- a/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts +++ b/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts @@ -98,7 +98,7 @@ function bboxArea(bbox: Bbox): number { * Returns the instance whose image embedding has the highest cosine similarity * with the text embedding. The caller is responsible for producing the * embeddings (e.g. with CLIP) and passing them in the same order as - * `instances`; embeddings do not need to be pre-normalized. + * `instances`. * @param instances - Array of segmented instances returned by `forward()`. * @param instanceEmbeddings - Image embedding for each instance, in the same order as `instances`. * @param textEmbedding - Embedding of the text prompt. @@ -116,29 +116,27 @@ export function selectByText( ); } - let textNormSq = 0; - for (const v of textEmbedding) { - textNormSq += v * v; - } - const textNorm = Math.sqrt(textNormSq); - + const scores = calculateDotProducts(instanceEmbeddings, textEmbedding); let bestIdx = 0; let bestScore = -Infinity; - for (let i = 0; i < instances.length; i++) { - const emb = instanceEmbeddings[i]!; - const n = Math.min(emb.length, textEmbedding.length); - let dot = 0; - let embNormSq = 0; - for (let j = 0; j < n; j++) { - const a = emb[j]!; - dot += a * textEmbedding[j]!; - embNormSq += a * a; - } - const score = dot / (Math.sqrt(embNormSq) * textNorm + 1e-7); - if (score > bestScore) { - bestScore = score; + for (let i = 0; i < scores.length; i++) { + if (scores[i]! > bestScore) { + bestScore = scores[i]!; bestIdx = i; } } return instances[bestIdx]!; } + +function calculateDotProducts( + instanceEmbeddings: Float32Array[], + textEmbedding: Float32Array +): number[] { + return instanceEmbeddings.map((emb) => { + let dot = 0; + for (let j = 0; j < emb.length; j++) { + dot += emb[j]! 
* textEmbedding[j]!; + } + return dot; + }); +} From 8f84b1761cbeafd9cb572104e830de578205a62c Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Fri, 8 May 2026 17:37:58 +0200 Subject: [PATCH 14/18] feat: add common vision utilities --- .../src/utils/commonVision.ts | 10 ++++++ .../src/utils/segmentAnythingPrompts.ts | 33 ++++++++----------- 2 files changed, 23 insertions(+), 20 deletions(-) create mode 100644 packages/react-native-executorch/src/utils/commonVision.ts diff --git a/packages/react-native-executorch/src/utils/commonVision.ts b/packages/react-native-executorch/src/utils/commonVision.ts new file mode 100644 index 0000000000..7cd9b2a44b --- /dev/null +++ b/packages/react-native-executorch/src/utils/commonVision.ts @@ -0,0 +1,10 @@ +import { Bbox } from '../types/objectDetection'; + +/** + * Calculates the area of a bounding box. + * @param bbox - Bounding box to calculate area for. + * @returns Area of the bounding box. + */ +export function bboxArea(bbox: Bbox): number { + return Math.max(bbox.x2 - bbox.x1, 0) * Math.max(bbox.y2 - bbox.y1, 0); +} diff --git a/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts b/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts index 9c8b69ba1f..f162de5e83 100644 --- a/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts +++ b/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts @@ -1,6 +1,7 @@ import { LabelEnum } from '../types/common'; import { Bbox } from '../types/objectDetection'; import { SegmentedInstance } from '../types/instanceSegmentation'; +import { bboxArea } from './commonVision'; /** * Selects the best matching instance for a given point prompt. @@ -54,7 +55,7 @@ export function selectByBox( box: Bbox ): SegmentedInstance | null { const { x1: px1, y1: py1, x2: px2, y2: py2 } = box; - const promptArea = Math.max(px2 - px1, 0) * Math.max(py2 - py1, 0); + const promptArea = bboxArea(box); type Match = { iou: number; @@ -88,10 +89,6 @@ export function selectByBox( return best?.inst ?? null; } -function bboxArea(bbox: Bbox): number { - return Math.max(bbox.x2 - bbox.x1, 0) * Math.max(bbox.y2 - bbox.y1, 0); -} - /** * Selects the best matching instance for a text prompt. * @@ -112,11 +109,20 @@ export function selectByText( if (instances.length === 0) return null; if (instances.length !== instanceEmbeddings.length) { throw new Error( - `selectByText: instances (${instances.length}) and instanceEmbeddings (${instanceEmbeddings.length}) must have the same length` + `selectByText: instances (${instances.length})` + + `and instanceEmbeddings (${instanceEmbeddings.length})` + + `must have the same length` ); } - const scores = calculateDotProducts(instanceEmbeddings, textEmbedding); + const scores = instanceEmbeddings.map((emb) => { + let dot = 0; + for (let j = 0; j < emb.length; j++) { + dot += emb[j]! * textEmbedding[j]!; + } + return dot; + }); + let bestIdx = 0; let bestScore = -Infinity; for (let i = 0; i < scores.length; i++) { @@ -127,16 +133,3 @@ export function selectByText( } return instances[bestIdx]!; } - -function calculateDotProducts( - instanceEmbeddings: Float32Array[], - textEmbedding: Float32Array -): number[] { - return instanceEmbeddings.map((emb) => { - let dot = 0; - for (let j = 0; j < emb.length; j++) { - dot += emb[j]! 
* textEmbedding[j]!; - } - return dot; - }); -} From 7dc462f02a8ae73eaf5cc67ee3e5c5abcc873c5a Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Mon, 11 May 2026 11:41:53 +0200 Subject: [PATCH 15/18] docs: update docs --- .../02-computer-vision/segment-anything.md | 143 ------------------ .../useInstanceSegmentation.md | 42 ++++- 2 files changed, 41 insertions(+), 144 deletions(-) delete mode 100644 docs/docs/03-hooks/02-computer-vision/segment-anything.md diff --git a/docs/docs/03-hooks/02-computer-vision/segment-anything.md b/docs/docs/03-hooks/02-computer-vision/segment-anything.md deleted file mode 100644 index 50a7bd9536..0000000000 --- a/docs/docs/03-hooks/02-computer-vision/segment-anything.md +++ /dev/null @@ -1,143 +0,0 @@ ---- -title: Segment Anything with FastSAM ---- - -[FastSAM](https://github.com/CASIA-IVA-Lab/FastSAM) is a class-agnostic, promptable segmentation model. Unlike YOLO or RF-DETR (which return labelled detections), FastSAM segments **every** instance in an image without classifying them — you then pick the one you want with a point, box, or text prompt. - -`FASTSAM_S` and `FASTSAM_X` are loaded with the regular [`useInstanceSegmentation`](./useInstanceSegmentation.md) hook. `react-native-executorch` ships three small selector utilities to pick an instance from the hook's output: `selectByPoint`, `selectByBox`, and `selectByText`. - -## API Reference - -- [`selectByPoint` API Reference](../../06-api-reference/functions/selectByPoint.md) -- [`selectByBox` API Reference](../../06-api-reference/functions/selectByBox.md) -- [`selectByText` API Reference](../../06-api-reference/functions/selectByText.md) - -## High Level Overview - -The workflow has three steps: - -1. Load `FASTSAM_S` (or `FASTSAM_X`) with `useInstanceSegmentation`. -2. Run `forward(image)` once — the result is every detected instance. -3. Use a selector to pick the one matching the user's prompt. Re-run a selector when the prompt changes; you don't need to call `forward` again. - -```typescript -import { - useInstanceSegmentation, - selectByPoint, - selectByBox, - selectByText, - FASTSAM_S, -} from 'react-native-executorch'; - -const model = useInstanceSegmentation({ model: FASTSAM_S }); - -try { - const instances = await model.forward(imageUri); - - // Point: the smallest instance whose mask covers (x, y). - const a = selectByPoint(instances, x, y); - console.log('point match:', a?.bbox); - - // Box: the instance with highest IoU with the prompt box. - const b = selectByBox(instances, { x1, y1, x2, y2 }); - console.log('box match:', b?.bbox); - - // Text: highest cosine similarity between text and per-instance image - // embeddings (you must provide the embeddings, e.g. with CLIP). - const c = selectByText(instances, instanceEmbeddings, textEmbedding); - console.log('text match:', c?.bbox); -} catch (error) { - console.error(error); -} -``` - -The hook output is typed as [`SegmentedInstance`](../../06-api-reference/interfaces/SegmentedInstance.md). FastSAM emits a single label, [`FastSAMLabel.OBJECT`](../../06-api-reference/enumerations/FastSAMLabel.md) (`'OBJECT' = 0`). - -## Selecting by point - -`selectByPoint` returns the instance whose mask covers the point `(x, y)`. When several instances overlap (e.g. a small object inside a larger one), the one with the smallest bounding box wins; ties are broken by confidence. Returns `null` if no mask covers the point. 
- -It accepts three arguments: - -- `instances` (required) - The array of [`SegmentedInstance`](../../06-api-reference/interfaces/SegmentedInstance.md) returned by `forward()`. -- `x` (required) - X coordinate of the prompt point, in the **original image's** pixel space. -- `y` (required) - Y coordinate of the prompt point, in the **original image's** pixel space. - -:::info -`returnMaskAtOriginalResolution: true` (the default) is required for `selectByPoint` — masks must be in original image coordinates so they align with the touch coordinates passed in. -::: - -## Selecting by box - -`selectByBox` returns the instance with the highest IoU with the prompt box. Useful for "draw a box around what you want" UX. Returns `null` if no instance overlaps. - -It accepts two arguments: - -- `instances` (required) - The array of [`SegmentedInstance`](../../06-api-reference/interfaces/SegmentedInstance.md) returned by `forward()`. -- `box` (required) - A [`Bbox`](../../06-api-reference/interfaces/Bbox.md) (`{ x1, y1, x2, y2 }`) in the original image's pixel space. - -## Selecting by text - -`selectByText` returns the instance whose image embedding has the highest cosine similarity with the text embedding. The caller produces the embeddings — typically by cropping each instance's bbox and running [CLIP](./useImageEmbeddings.md) image encoder, plus running the [CLIP text encoder](../01-natural-language-processing/useTextEmbeddings.md) on the prompt. - -It accepts three arguments: - -- `instances` (required) - The array of [`SegmentedInstance`](../../06-api-reference/interfaces/SegmentedInstance.md) returned by `forward()`. -- `instanceEmbeddings` (required) - One `Float32Array` per instance, in the same order as `instances`. Throws if lengths differ. -- `textEmbedding` (required) - A `Float32Array` for the text prompt. - -Embeddings do not need to be pre-normalized. Returns `null` only when `instances` is empty. - -### Example with CLIP - -```typescript -import { - useInstanceSegmentation, - useImageEmbeddings, - useTextEmbeddings, - selectByText, - FASTSAM_S, - CLIP_VIT_BASE_PATCH32_IMAGE_QUANTIZED, - CLIP_VIT_BASE_PATCH32_TEXT, -} from 'react-native-executorch'; - -function App() { - const sam = useInstanceSegmentation({ model: FASTSAM_S }); - const clipImage = useImageEmbeddings({ - model: CLIP_VIT_BASE_PATCH32_IMAGE_QUANTIZED, - }); - const clipText = useTextEmbeddings({ model: CLIP_VIT_BASE_PATCH32_TEXT }); - - const handlePrompt = async (imageUri: string, prompt: string) => { - if (!sam.isReady || !clipImage.isReady || !clipText.isReady) return; - - try { - const instances = await sam.forward(imageUri); - - // Embed each instance's bbox crop. Cropping is your responsibility — - // any image manipulator (e.g. expo-image-manipulator) works. - const instanceEmbeddings = await Promise.all( - instances.map((inst) => - clipImage.forward(cropToBbox(imageUri, inst.bbox)) - ) - ); - - const textEmb = await clipText.forward(prompt); - const match = selectByText(instances, instanceEmbeddings, textEmb); - console.log('match:', match?.bbox, match?.score); - } catch (error) { - console.error(error); - } - }; - - // ... -} -``` - -:::tip -Embedding all instances is the slow part of text prompts (one CLIP forward per instance). Cache `instanceEmbeddings` and reuse them across multiple text queries on the same image; only invalidate when you call `sam.forward` again. 
-:::
-
-## Example app
-
-The [`computer-vision`](https://github.com/software-mansion/react-native-executorch/tree/main/apps/computer-vision/app/segment_anything) example contains a working "Segment Anything" screen with all three prompt modes wired up.
diff --git a/docs/docs/03-hooks/02-computer-vision/useInstanceSegmentation.md b/docs/docs/03-hooks/02-computer-vision/useInstanceSegmentation.md
index 6b502348d9..b9bdaa2774 100644
--- a/docs/docs/03-hooks/02-computer-vision/useInstanceSegmentation.md
+++ b/docs/docs/03-hooks/02-computer-vision/useInstanceSegmentation.md
@@ -136,5 +136,45 @@ YOLO models use the [`CocoLabelYolo`](../../06-api-reference/enumerations/CocoLa
 | fastsam-x | 1 | [FastSAMLabel](../../06-api-reference/enumerations/FastSAMLabel.md) | 640 (fixed) |

 :::tip
-FastSAM models are class-agnostic — they segment every instance without classifying. To pick a specific instance from the output, use the [point/box/text selectors](./segment-anything.md).
+FastSAM models are class-agnostic, so they segment every instance without classifying it. That makes them a good fit for promptable selection workflows.
 :::
+
+## Promptable selection
+
+Instance segmentation models return a list of segmented instances. After `forward()`, you can use prompt-based selectors to pick the instance you want. Use point selection for tap-to-select or cutout tools, box selection for drag-to-outline workflows, and text selection for search or describe-it-in-words flows. For example, a photo-editing app can use point selection to isolate a person, a custom sticker or background-removal flow can use box selection, and a shopping app can use text selection to find a product by name or description:
+
+1. Load an instance segmentation model with `useInstanceSegmentation`.
+2. Run `forward(image)` once to get the detected instances.
+3. Use a selector to pick the instance or instances matching the user's prompt.
+4. Re-run the selector when the prompt changes; you do not need to call `forward` again unless the image changes.
+
+```typescript
+import {
+  useInstanceSegmentation,
+  selectByPoint,
+  selectByBox,
+  selectByText,
+  FASTSAM_X,
+} from 'react-native-executorch';
+
+const model = useInstanceSegmentation({ model: FASTSAM_X });
+
+try {
+  const instances = await model.forward(imageUri);
+
+  // Point: the smallest instance whose mask covers (x, y).
+  const pointMatch = selectByPoint(instances, x, y);
+  console.log('point match:', pointMatch?.bbox);
+
+  // Box: the instance with highest IoU with the prompt box.
+  const boxMatch = selectByBox(instances, { x1, y1, x2, y2 });
+  console.log('box match:', boxMatch?.bbox);
+
+  // Text: highest cosine similarity between text and per-instance image
+  // embeddings (you must provide the embeddings, e.g. with CLIP).
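+  // A typical recipe (illustrative, not part of the API): crop each
+  // instance's bbox, embed every crop with a CLIP image encoder, and embed
+  // the prompt with the matching CLIP text encoder, keeping the same order
+  // as `instances`.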
+ const textMatch = selectByText(instances, instanceEmbeddings, textEmbedding); + console.log('text match:', textMatch?.bbox); +} catch (error) { + console.error(error); +} +``` From ee0403cf9e447ff6b9a16b0367fa77e3a414551d Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Mon, 11 May 2026 12:11:11 +0200 Subject: [PATCH 16/18] feat: enhance selectByText function to support multiple top matches --- .cspell-wordlist.txt | 1 + .../app/segment_anything/index.tsx | 7 --- .../src/utils/segmentAnythingPrompts.ts | 52 +++++++++++++------ 3 files changed, 38 insertions(+), 22 deletions(-) diff --git a/.cspell-wordlist.txt b/.cspell-wordlist.txt index b9809a8734..de81c01b40 100644 --- a/.cspell-wordlist.txt +++ b/.cspell-wordlist.txt @@ -205,3 +205,4 @@ deinitialize Deinitialize fastsam promptable +topk diff --git a/apps/computer-vision/app/segment_anything/index.tsx b/apps/computer-vision/app/segment_anything/index.tsx index ae8087bd78..037a988327 100644 --- a/apps/computer-vision/app/segment_anything/index.tsx +++ b/apps/computer-vision/app/segment_anything/index.tsx @@ -463,10 +463,6 @@ async function cropAndEmbed( maskHeight: number, forward: (input: string) => Promise ): Promise { - // FastSAM-style full-image white canvas, but with the mask applied: - // inside the bbox we keep image pixels where mask=1 and overwrite the - // rest with white. CLIP then sees a uniform white scene with only the - // segmented object visible at its original position/size. const imgW = image.width(); const imgH = image.height(); const surface = Skia.Surface.MakeOffscreen(imgW, imgH); @@ -489,9 +485,6 @@ async function cropAndEmbed( ); } - // Inverse mask: opaque white where mask=0, transparent where mask=1. - // Drawn on top within the bbox, it overpaints non-mask pixels with white - // and leaves the segmented object intact. const inversePixels = new Uint8Array(mask.length * 4); for (let i = 0; i < mask.length; i++) { const outside = mask[i]! === 0; diff --git a/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts b/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts index f162de5e83..db854705c7 100644 --- a/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts +++ b/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts @@ -90,27 +90,41 @@ export function selectByBox( } /** - * Selects the best matching instance for a text prompt. + * Selects the best matching instance(s) for a text prompt. * - * Returns the instance whose image embedding has the highest cosine similarity + * Returns the instance(s) whose image embedding has the highest cosine similarity * with the text embedding. The caller is responsible for producing the * embeddings (e.g. with CLIP) and passing them in the same order as * `instances`. * @param instances - Array of segmented instances returned by `forward()`. * @param instanceEmbeddings - Image embedding for each instance, in the same order as `instances`. * @param textEmbedding - Embedding of the text prompt. - * @returns The best matching instance, or `null` if `instances` is empty. + * @param topk - Number of top matches to return (defaults to 1). + * @returns The best matching instance (or null) if topk is 1, otherwise an array of the topk matching instances. 
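+ * @example
+ * // Sketch: get the top-3 candidates for a text prompt (embeddings are
+ * // produced by the caller, e.g. with CLIP).
+ * const top3 = selectByText(instances, instanceEmbeddings, textEmbedding, 3);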
*/ export function selectByText( instances: SegmentedInstance[], instanceEmbeddings: Float32Array[], - textEmbedding: Float32Array -): SegmentedInstance | null { - if (instances.length === 0) return null; + textEmbedding: Float32Array, + topk?: 1 +): SegmentedInstance | null; +export function selectByText( + instances: SegmentedInstance[], + instanceEmbeddings: Float32Array[], + textEmbedding: Float32Array, + topk: number +): SegmentedInstance[]; +export function selectByText( + instances: SegmentedInstance[], + instanceEmbeddings: Float32Array[], + textEmbedding: Float32Array, + topk = 1 +): SegmentedInstance | null | SegmentedInstance[] { + if (instances.length === 0) return topk === 1 ? null : []; if (instances.length !== instanceEmbeddings.length) { throw new Error( - `selectByText: instances (${instances.length})` + - `and instanceEmbeddings (${instanceEmbeddings.length})` + + `selectByText: instances (${instances.length}) ` + + `and instanceEmbeddings (${instanceEmbeddings.length}) ` + `must have the same length` ); } @@ -123,13 +137,21 @@ export function selectByText( return dot; }); - let bestIdx = 0; - let bestScore = -Infinity; - for (let i = 0; i < scores.length; i++) { - if (scores[i]! > bestScore) { - bestScore = scores[i]!; - bestIdx = i; + if (topk === 1) { + let bestIdx = 0; + let bestScore = -Infinity; + for (let i = 0; i < scores.length; i++) { + if (scores[i]! > bestScore) { + bestScore = scores[i]!; + bestIdx = i; + } } + return instances[bestIdx]!; } - return instances[bestIdx]!; + + return instances + .map((instance, index) => ({ instance, score: scores[index]! })) + .sort((a, b) => b.score - a.score) + .slice(0, topk) + .map((item) => item.instance); } From 49563f08943135fe13c53969d4bdcb1396094239 Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Mon, 11 May 2026 13:46:26 +0200 Subject: [PATCH 17/18] fix: add pointerEvents="none" to overlay view in ImageWithMasks component --- apps/computer-vision/components/ImageWithMasks.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/computer-vision/components/ImageWithMasks.tsx b/apps/computer-vision/components/ImageWithMasks.tsx index bd768909b2..8bb435f47a 100644 --- a/apps/computer-vision/components/ImageWithMasks.tsx +++ b/apps/computer-vision/components/ImageWithMasks.tsx @@ -156,7 +156,7 @@ export default function ImageWithMasks({ /> {instances.length > 0 && ( - + {instances.map((inst, idx) => { const mx = inst.bbox.x1 * scale + offsetX; From 80bf79d92c991d0973fe8ca87951d5e330c961c2 Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Mon, 11 May 2026 15:12:19 +0200 Subject: [PATCH 18/18] docs: update inference time and model size documentation; add FastSAM usage tips --- docs/docs/02-benchmarks/inference-time.md | 24 ++++++++++++------- docs/docs/02-benchmarks/model-size.md | 18 +++++++------- .../useInstanceSegmentation.md | 6 +++++ 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/docs/docs/02-benchmarks/inference-time.md b/docs/docs/02-benchmarks/inference-time.md index faef5c603d..8ef213238f 100644 --- a/docs/docs/02-benchmarks/inference-time.md +++ b/docs/docs/02-benchmarks/inference-time.md @@ -230,17 +230,23 @@ slower for very large images, which can increase total time. ## Instance Segmentation :::note -Times presented in the tables are measured for YOLO models with input size equal to 512. Other input sizes may yield slower or faster inference times. RF-DETR Nano Seg uses a fixed resolution of 312×312. 
+Times presented in the tables are measured for YOLO models with input size equal
+to 512. Other input sizes may yield slower or faster inference times. RF-DETR
+Nano Seg uses a fixed resolution of 312×312.
 :::

-| Model            | Samsung Galaxy S24 (XNNPACK) [ms] | Iphone 17 pro (XNNPACK) [ms] |
-| ---------------- | --------------------------------- | ---------------------------- |
-| YOLO26N_SEG      | 92                                | 90                           |
-| YOLO26S_SEG      | 220                               | 188                          |
-| YOLO26M_SEG      | 570                               | 550                          |
-| YOLO26L_SEG      | 680                               | 608                          |
-| YOLO26X_SEG      | 1410                              | 1338                         |
-| RF_DETR_NANO_SEG | 549                               | 330                          |
+| Model                      | Samsung Galaxy S24 [ms] | iPhone 17 Pro [ms] | Pixel 10 [ms] |
+| :------------------------- | :---------------------: | :----------------: | :-----------: |
+| YOLO26N_SEG (XNNPACK)      | 92                      | 90                 | 93            |
+| YOLO26S_SEG (XNNPACK)      | 220                     | 188                | 193           |
+| YOLO26M_SEG (XNNPACK)      | 570                     | 550                | 481           |
+| YOLO26L_SEG (XNNPACK)      | 680                     | 608                | 582           |
+| YOLO26X_SEG (XNNPACK)      | 1410                    | 1338               | 1191          |
+| RF_DETR_NANO_SEG (XNNPACK) | 549                     | 330                | 428           |
+| FASTSAM_S (XNNPACK)        | -                       | 30                 | 286           |
+| FASTSAM_X (XNNPACK)        | -                       | 2520               | 1993          |
+| FASTSAM_S (Core ML)        | -                       | 51                 | -             |
+| FASTSAM_X (Core ML)        | -                       | 72                 | -             |

 ## Text to image

diff --git a/docs/docs/02-benchmarks/model-size.md b/docs/docs/02-benchmarks/model-size.md
index 8dea094839..6d7f7cb753 100644
--- a/docs/docs/02-benchmarks/model-size.md
+++ b/docs/docs/02-benchmarks/model-size.md
@@ -22,14 +22,16 @@ title: Model Size

 ## Instance Segmentation

-| Model            | XNNPACK [MB] |
-| ---------------- | :----------: |
-| YOLO26N_SEG      | 11.6         |
-| YOLO26S_SEG      | 42.3         |
-| YOLO26M_SEG      | 95.4         |
-| YOLO26L_SEG      | 113          |
-| YOLO26X_SEG      | 252          |
-| RF_DETR_NANO_SEG | 124          |
+| Model            | XNNPACK [MB] | Core ML FP32 [MB] | Core ML FP16 [MB] |
+| ---------------- | :----------: | :---------------: | :---------------: |
+| YOLO26N_SEG      | 11.6         | -                 | -                 |
+| YOLO26S_SEG      | 42.3         | -                 | -                 |
+| YOLO26M_SEG      | 95.4         | -                 | -                 |
+| YOLO26L_SEG      | 113          | -                 | -                 |
+| YOLO26X_SEG      | 252          | -                 | -                 |
+| RF_DETR_NANO_SEG | 124          | -                 | -                 |
+| FASTSAM_S        | 47.3         | 47.8              | 24.2              |
+| FASTSAM_X        | 289          | 290               | 145               |

 ## Style Transfer

diff --git a/docs/docs/03-hooks/02-computer-vision/useInstanceSegmentation.md b/docs/docs/03-hooks/02-computer-vision/useInstanceSegmentation.md
index b9bdaa2774..6835262a6a 100644
--- a/docs/docs/03-hooks/02-computer-vision/useInstanceSegmentation.md
+++ b/docs/docs/03-hooks/02-computer-vision/useInstanceSegmentation.md
@@ -178,3 +178,9 @@ try {
   console.error(error);
 }
 ```
+
+:::tip
+Use FastSAM-S for faster performance on simple images with non-overlapping
+instances and FastSAM-X for better accuracy on complex scenes with many
+overlapping objects.
+:::
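+
+Point and box prompts are interpreted in the original image's pixel space, while touch events arrive in the preview view's coordinate space. A minimal sketch of the conversion, assuming a "contain"-fitted preview (the helper and its names are illustrative, not part of the API):
+
+```typescript
+function viewToImage(
+  viewX: number,
+  viewY: number,
+  view: { width: number; height: number },
+  image: { width: number; height: number }
+) {
+  // "contain" fit: uniform scale with the image centered inside the view.
+  const scale = Math.min(view.width / image.width, view.height / image.height);
+  const offsetX = (view.width - image.width * scale) / 2;
+  const offsetY = (view.height - image.height * scale) / 2;
+  return {
+    x: (viewX - offsetX) / scale,
+    y: (viewY - offsetY) / scale,
+  };
+}
+```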