diff --git a/.cspell-wordlist.txt b/.cspell-wordlist.txt
index 84d006eefe..de81c01b40 100644
--- a/.cspell-wordlist.txt
+++ b/.cspell-wordlist.txt
@@ -203,3 +203,6 @@ fishjam
Fishjam
deinitialize
Deinitialize
+fastsam
+promptable
+topk
diff --git a/apps/computer-vision/app/_layout.tsx b/apps/computer-vision/app/_layout.tsx
index 03770c2720..a4868f92ae 100644
--- a/apps/computer-vision/app/_layout.tsx
+++ b/apps/computer-vision/app/_layout.tsx
@@ -189,6 +189,14 @@ export default function _layout() {
headerTitleStyle: { color: ColorPalette.primary },
}}
/>
+
);
diff --git a/apps/computer-vision/app/index.tsx b/apps/computer-vision/app/index.tsx
index 15b9d8650b..e67e7eb5cb 100644
--- a/apps/computer-vision/app/index.tsx
+++ b/apps/computer-vision/app/index.tsx
@@ -47,6 +47,12 @@ export default function Home() {
>
Pose Estimation
+ <TouchableOpacity
+ style={styles.button}
+ onPress={() => router.navigate('segment_anything/')}
+ >
+ <Text style={styles.buttonText}>Segment Anything</Text>
+ </TouchableOpacity>
router.navigate('ocr/')}
diff --git a/apps/computer-vision/app/instance_segmentation/index.tsx b/apps/computer-vision/app/instance_segmentation/index.tsx
index dba53875e5..f669c383d5 100644
--- a/apps/computer-vision/app/instance_segmentation/index.tsx
+++ b/apps/computer-vision/app/instance_segmentation/index.tsx
@@ -11,6 +11,8 @@ import {
YOLO26X_SEG,
RF_DETR_NANO_SEG,
InstanceSegmentationModelSources,
+ FASTSAM_S,
+ FASTSAM_X,
} from 'react-native-executorch';
import {
View,
@@ -35,6 +37,8 @@ const MODELS: ModelOption[] = [
{ label: 'Yolo26L', value: YOLO26L_SEG },
{ label: 'Yolo26X', value: YOLO26X_SEG },
{ label: 'RF-DeTR Nano', value: RF_DETR_NANO_SEG },
+ { label: 'FastSAM-S', value: FASTSAM_S },
+ { label: 'FastSAM-X', value: FASTSAM_X },
];
export default function InstanceSegmentationScreen() {
diff --git a/apps/computer-vision/app/segment_anything/index.tsx b/apps/computer-vision/app/segment_anything/index.tsx
new file mode 100644
index 0000000000..037a988327
--- /dev/null
+++ b/apps/computer-vision/app/segment_anything/index.tsx
@@ -0,0 +1,613 @@
+import React, { useContext, useEffect, useRef, useState } from 'react';
+import {
+ View,
+ StyleSheet,
+ Text,
+ TextInput,
+ TouchableOpacity,
+ TouchableWithoutFeedback,
+ GestureResponderEvent,
+ Keyboard,
+ KeyboardAvoidingView,
+ Platform,
+} from 'react-native';
+import {
+ Canvas,
+ Rect,
+ Skia,
+ useImage,
+ type SkImage,
+ ColorType,
+ AlphaType,
+} from '@shopify/react-native-skia';
+import {
+ useInstanceSegmentation,
+ useImageEmbeddings,
+ useTextEmbeddings,
+ FASTSAM_S,
+ FASTSAM_X,
+ CLIP_VIT_BASE_PATCH32_IMAGE_QUANTIZED,
+ CLIP_VIT_BASE_PATCH32_TEXT,
+ InstanceSegmentationModelSources,
+ SegmentedInstance,
+ FastSAMLabel,
+ selectByPoint,
+ selectByBox,
+ selectByText,
+ Bbox,
+} from 'react-native-executorch';
+import { GeneratingContext } from '../../context';
+import { ModelPicker, ModelOption } from '../../components/ModelPicker';
+import { BottomBar } from '../../components/BottomBar';
+import { StatsBar } from '../../components/StatsBar';
+import Spinner from '../../components/Spinner';
+import ScreenWrapper from '../../ScreenWrapper';
+import ImageWithMasks, {
+ buildDisplayInstances,
+ DisplayInstance,
+} from '../../components/ImageWithMasks';
+import { getImage } from '../../utils';
+import ColorPalette from '../../colors';
+
+type PromptMode = 'point' | 'box' | 'text';
+
+const MODELS: ModelOption[] = [
+ { label: 'FastSAM-S', value: FASTSAM_S },
+ { label: 'FastSAM-X', value: FASTSAM_X },
+];
+
+export default function SegmentAnythingScreen() {
+ const { setGlobalGenerating } = useContext(GeneratingContext);
+
+ const [selectedModel, setSelectedModel] =
+ useState<InstanceSegmentationModelSources>(FASTSAM_S);
+ const [mode, setMode] = useState<PromptMode>('point');
+ const [inferenceTime, setInferenceTime] = useState<number | null>(null);
+
+ const [imageUri, setImageUri] = useState('');
+ const [imageSize, setImageSize] = useState({ width: 0, height: 0 });
+
+ const rawInstancesRef = useRef<SegmentedInstance<FastSAMLabel>[]>([]);
+ const [selection, setSelection] = useState<DisplayInstance[]>([]);
+
+ const [draftBox, setDraftBox] = useState<Bbox | null>(null);
+ const boxStartRef = useRef<{ x: number; y: number } | null>(null);
+ const layoutRef = useRef({ width: 0, height: 0 });
+
+ const { isReady, isGenerating, downloadProgress, forward, error } =
+ useInstanceSegmentation({ model: selectedModel });
+
+ const clipImage = useImageEmbeddings({
+ model: CLIP_VIT_BASE_PATCH32_IMAGE_QUANTIZED,
+ });
+ const clipText = useTextEmbeddings({ model: CLIP_VIT_BASE_PATCH32_TEXT });
+ const skiaSource = useImage(imageUri || null);
+
+ const [textPrompt, setTextPrompt] = useState('');
+ const [textBusy, setTextBusy] = useState(false);
+ const [embeddingProgress, setEmbeddingProgress] = useState<{
+ done: number;
+ total: number;
+ } | null>(null);
+ const instanceEmbeddingsRef = useRef<Float32Array[] | null>(null);
+
+ useEffect(() => {
+ setGlobalGenerating(isGenerating);
+ }, [isGenerating, setGlobalGenerating]);
+
+ function applyMatch(
+ match: SegmentedInstance<FastSAMLabel> | null
+ ): void {
+ setSelection(match ? buildDisplayInstances([match]) : []);
+ }
+
+ function touchToImageCoords(touchX: number, touchY: number) {
+ const { width: cw, height: ch } = layoutRef.current;
+ const { width: iw, height: ih } = imageSize;
+ if (iw === 0 || ih === 0) return null;
+ const scale = Math.min(cw / iw, ch / ih);
+ return {
+ x: (touchX - (cw - iw * scale) / 2) / scale,
+ y: (touchY - (ch - ih * scale) / 2) / scale,
+ };
+ }
+
+ function handleTap(e: GestureResponderEvent) {
+ if (mode !== 'point' || rawInstancesRef.current.length === 0) return;
+ const c = touchToImageCoords(
+ e.nativeEvent.locationX,
+ e.nativeEvent.locationY
+ );
+ if (!c) return;
+ applyMatch(
+ selectByPoint(rawInstancesRef.current, Math.round(c.x), Math.round(c.y))
+ );
+ }
+
+ function handleBoxStart(e: GestureResponderEvent) {
+ if (mode !== 'box') return;
+ const c = touchToImageCoords(
+ e.nativeEvent.locationX,
+ e.nativeEvent.locationY
+ );
+ if (!c) return;
+ boxStartRef.current = c;
+ setDraftBox({ x1: c.x, y1: c.y, x2: c.x, y2: c.y });
+ }
+
+ function handleBoxMove(e: GestureResponderEvent) {
+ if (mode !== 'box' || !boxStartRef.current) return;
+ const c = touchToImageCoords(
+ e.nativeEvent.locationX,
+ e.nativeEvent.locationY
+ );
+ if (!c) return;
+ const s = boxStartRef.current;
+ setDraftBox({
+ x1: Math.min(s.x, c.x),
+ y1: Math.min(s.y, c.y),
+ x2: Math.max(s.x, c.x),
+ y2: Math.max(s.y, c.y),
+ });
+ }
+
+ function handleBoxEnd(e: GestureResponderEvent) {
+ if (mode !== 'box' || !boxStartRef.current) return;
+ const c = touchToImageCoords(
+ e.nativeEvent.locationX,
+ e.nativeEvent.locationY
+ );
+ const s = boxStartRef.current;
+ boxStartRef.current = null;
+ setDraftBox(null);
+ if (!c || rawInstancesRef.current.length === 0) return;
+ applyMatch(
+ selectByBox(rawInstancesRef.current, {
+ x1: Math.min(s.x, c.x),
+ y1: Math.min(s.y, c.y),
+ x2: Math.max(s.x, c.x),
+ y2: Math.max(s.y, c.y),
+ })
+ );
+ }
+
+ async function runTextPrompt() {
+ Keyboard.dismiss();
+ const instances = rawInstancesRef.current;
+ if (
+ !textPrompt.trim() ||
+ instances.length === 0 ||
+ !skiaSource ||
+ !clipImage.isReady ||
+ !clipText.isReady ||
+ textBusy
+ ) {
+ return;
+ }
+ setTextBusy(true);
+ try {
+ if (!instanceEmbeddingsRef.current) {
+ setEmbeddingProgress({ done: 0, total: instances.length });
+ const embeddings: Float32Array[] = [];
+ for (let i = 0; i < instances.length; i++) {
+ const inst = instances[i]!;
+ embeddings.push(
+ await cropAndEmbed(
+ skiaSource,
+ inst.bbox,
+ inst.mask,
+ inst.maskWidth,
+ inst.maskHeight,
+ clipImage.forward
+ )
+ );
+ setEmbeddingProgress({ done: i + 1, total: instances.length });
+ }
+ instanceEmbeddingsRef.current = embeddings;
+ setEmbeddingProgress(null);
+ }
+ const textEmb = await clipText.forward(textPrompt);
+ const match = selectByText(
+ instances,
+ instanceEmbeddingsRef.current,
+ textEmb
+ );
+ applyMatch(match);
+ } catch (e) {
+ console.error(e);
+ } finally {
+ setTextBusy(false);
+ }
+ }
+
+ const handleCameraPress = async (isCamera: boolean) => {
+ Keyboard.dismiss();
+ const image = await getImage(isCamera);
+ if (!image?.uri) return;
+ setImageUri(image.uri);
+ setImageSize({ width: image.width ?? 0, height: image.height ?? 0 });
+ rawInstancesRef.current = [];
+ instanceEmbeddingsRef.current = null;
+ setSelection([]);
+ setInferenceTime(null);
+ };
+
+ const runForward = async () => {
+ Keyboard.dismiss();
+ if (!imageUri) return;
+ try {
+ const start = Date.now();
+ const output = await forward(imageUri, {
+ confidenceThreshold: 0.4,
+ iouThreshold: 0.9,
+ maxInstances: 50,
+ returnMaskAtOriginalResolution: true,
+ });
+ setInferenceTime(Date.now() - start);
+ rawInstancesRef.current = output;
+ instanceEmbeddingsRef.current = null;
+ setSelection([]);
+ } catch (e) {
+ console.error(e);
+ }
+ };
+
+ if (!isReady && error) {
+ return (
+
+
+ Error Loading Model
+ {error.message}
+
+
+ );
+ }
+
+ if (!isReady) {
+ return (
+
+ );
+ }
+
+ const { width: cw, height: ch } = layoutRef.current;
+ const { width: iw, height: ih } = imageSize;
+ const drawScale = iw > 0 && ih > 0 ? Math.min(cw / iw, ch / ih) : 1;
+ const offsetX = (cw - iw * drawScale) / 2;
+ const offsetY = (ch - ih * drawScale) / 2;
+
+ const stepHint = !imageUri
+ ? null
+ : inferenceTime === null
+ ? 'Tap Run to detect instances'
+ : rawInstancesRef.current.length === 0
+ ? 'No instances detected — try another image'
+ : selection.length === 0
+ ? 'Tap a point, draw a box, or describe an object'
+ : null;
+
+ return (
+
+
+
+
+
+ {
+ layoutRef.current = {
+ width: e.nativeEvent.layout.width,
+ height: e.nativeEvent.layout.height,
+ };
+ }}
+ onTouchStart={(e) => {
+ Keyboard.dismiss();
+ if (mode === 'point') handleTap(e);
+ else if (mode === 'box') handleBoxStart(e);
+ }}
+ onTouchMove={handleBoxMove}
+ onTouchEnd={handleBoxEnd}
+ >
+
+ {draftBox && iw > 0 && (
+
+ )}
+
+ {!imageUri && (
+
+ Segment Anything
+
+ Segment any object in an image. (1) Pick an image, (2) tap
+ Run to detect instances, (3) tap a point, draw a box, or
+ describe an object to segment it.
+
+
+ )}
+
+
+
+ {stepHint && {stepHint}}
+
+
+ {(['point', 'box', 'text'] as PromptMode[]).map((m) => {
+ const promptDisabled = rawInstancesRef.current.length === 0;
+ return (
+ {
+ if (m !== 'text') Keyboard.dismiss();
+ setMode(m);
+ }}
+ disabled={promptDisabled}
+ >
+
+ {m[0]!.toUpperCase() + m.slice(1)}
+
+
+ );
+ })}
+
+
+ {mode === 'text' && (
+
+
+ {(() => {
+ const findInactive =
+ !textPrompt.trim() ||
+ rawInstancesRef.current.length === 0 ||
+ !clipImage.isReady ||
+ !clipText.isReady;
+ return (
+
+ Find
+
+ );
+ })()}
+
+ )}
+ {mode === 'text' && embeddingProgress && (
+
+ Embedding instances {embeddingProgress.done}/
+ {embeddingProgress.total} (subsequent text queries are instant)
+
+ )}
+
+ {
+ if (m.modelName === selectedModel.modelName) return;
+ setSelectedModel(m);
+ rawInstancesRef.current = [];
+ instanceEmbeddingsRef.current = null;
+ setSelection([]);
+ setInferenceTime(null);
+ }}
+ />
+
+ 0
+ ? rawInstancesRef.current.length
+ : null
+ }
+ />
+
+
+
+
+
+ );
+}
+
+async function cropAndEmbed(
+ image: SkImage,
+ bbox: Bbox,
+ mask: Uint8Array,
+ maskWidth: number,
+ maskHeight: number,
+ forward: (input: string) => Promise<Float32Array>
+ ): Promise<Float32Array> {
+ const imgW = image.width();
+ const imgH = image.height();
+ const surface = Skia.Surface.MakeOffscreen(imgW, imgH);
+ if (!surface) throw new Error('Failed to create offscreen Skia surface');
+ const canvas = surface.getCanvas();
+ canvas.clear(Skia.Color('white'));
+
+ const x1 = Math.max(0, Math.round(bbox.x1));
+ const y1 = Math.max(0, Math.round(bbox.y1));
+ const x2 = Math.min(imgW, Math.round(bbox.x2));
+ const y2 = Math.min(imgH, Math.round(bbox.y2));
+ const w = x2 - x1;
+ const h = y2 - y1;
+ if (w > 0 && h > 0) {
+ canvas.drawImageRect(
+ image,
+ { x: x1, y: y1, width: w, height: h },
+ { x: x1, y: y1, width: w, height: h },
+ Skia.Paint()
+ );
+ }
+
+ const inversePixels = new Uint8Array(mask.length * 4);
+ for (let i = 0; i < mask.length; i++) {
+ const outside = mask[i]! === 0;
+ const idx = i * 4;
+ inversePixels[idx] = outside ? 255 : 0;
+ inversePixels[idx + 1] = outside ? 255 : 0;
+ inversePixels[idx + 2] = outside ? 255 : 0;
+ inversePixels[idx + 3] = outside ? 255 : 0;
+ }
+ const inverseData = Skia.Data.fromBytes(inversePixels);
+ const inverseMaskImg = Skia.Image.MakeImage(
+ {
+ width: maskWidth,
+ height: maskHeight,
+ colorType: ColorType.RGBA_8888,
+ alphaType: AlphaType.Premul,
+ },
+ inverseData,
+ maskWidth * 4
+ );
+ if (inverseMaskImg) {
+ canvas.drawImageRect(
+ inverseMaskImg,
+ { x: 0, y: 0, width: maskWidth, height: maskHeight },
+ {
+ x: bbox.x1,
+ y: bbox.y1,
+ width: bbox.x2 - bbox.x1,
+ height: bbox.y2 - bbox.y1,
+ },
+ Skia.Paint()
+ );
+ }
+
+ const base64 = surface.makeImageSnapshot().encodeToBase64();
+ inverseData.dispose();
+ return forward(`data:image/png;base64,${base64}`);
+}
+
+const styles = StyleSheet.create({
+ flex: { flex: 1 },
+ container: { flex: 6, width: '100%' },
+ imageContainer: { flex: 1, width: '100%', padding: 16 },
+ imageTouchArea: { flex: 1, position: 'relative' },
+ infoContainer: { alignItems: 'center', padding: 16, gap: 8 },
+ infoTitle: { fontSize: 18, fontWeight: '600', color: 'navy' },
+ infoText: {
+ fontSize: 14,
+ color: '#555',
+ textAlign: 'center',
+ lineHeight: 20,
+ },
+ modeRow: {
+ flexDirection: 'row',
+ justifyContent: 'center',
+ paddingVertical: 8,
+ gap: 8,
+ },
+ modeBtn: {
+ paddingHorizontal: 18,
+ paddingVertical: 8,
+ borderRadius: 8,
+ borderWidth: 1,
+ borderColor: ColorPalette.primary,
+ backgroundColor: '#fff',
+ },
+ modeBtnActive: { backgroundColor: ColorPalette.primary },
+ modeBtnDisabled: { borderColor: '#cbd5e1', backgroundColor: '#f8fafc' },
+ modeBtnText: { fontSize: 14, fontWeight: '600', color: ColorPalette.primary },
+ modeBtnTextActive: { color: '#fff' },
+ modeBtnTextDisabled: { color: '#cbd5e1' },
+ textRow: {
+ flexDirection: 'row',
+ alignItems: 'center',
+ paddingHorizontal: 16,
+ paddingBottom: 8,
+ gap: 8,
+ },
+ textInput: {
+ flex: 1,
+ backgroundColor: '#fff',
+ borderWidth: 1,
+ borderColor: ColorPalette.primary,
+ borderRadius: 12,
+ paddingHorizontal: 14,
+ paddingVertical: 12,
+ fontSize: 16,
+ color: '#0f172a',
+ },
+ textBtn: {
+ backgroundColor: ColorPalette.primary,
+ borderRadius: 12,
+ paddingVertical: 14,
+ width: 80,
+ alignItems: 'center',
+ },
+ textBtnDisabled: { backgroundColor: '#cbd5e1' },
+ textBtnLabel: { color: '#fff', fontWeight: '700', fontSize: 16 },
+ statusLine: {
+ paddingHorizontal: 16,
+ paddingBottom: 6,
+ fontSize: 12,
+ color: '#64748b',
+ },
+ stepHint: {
+ paddingHorizontal: 16,
+ paddingTop: 6,
+ fontSize: 13,
+ fontWeight: '500',
+ color: ColorPalette.primary,
+ textAlign: 'center',
+ },
+ errorContainer: {
+ flex: 1,
+ justifyContent: 'center',
+ alignItems: 'center',
+ padding: 32,
+ },
+ errorTitle: {
+ fontSize: 20,
+ fontWeight: '700',
+ color: '#e74c3c',
+ marginBottom: 12,
+ },
+ errorText: { fontSize: 14, color: '#555', textAlign: 'center' },
+});
diff --git a/apps/computer-vision/app/vision_camera/index.tsx b/apps/computer-vision/app/vision_camera/index.tsx
index 4020d20023..7a399f443f 100644
--- a/apps/computer-vision/app/vision_camera/index.tsx
+++ b/apps/computer-vision/app/vision_camera/index.tsx
@@ -54,6 +54,8 @@ type ModelId =
| 'segmentationSelfie'
| 'instanceSegmentationYolo26n'
| 'instanceSegmentationRfdetr'
+ | 'instanceSegmentationFastsamS'
+ | 'instanceSegmentationFastsamX'
| 'poseEstimationYolo26n'
| 'ocr'
| 'styleTransferCandy'
@@ -87,6 +89,8 @@ const TASKS: Task[] = [
variants: [
{ id: 'instanceSegmentationYolo26n', label: 'YOLO26N Seg' },
{ id: 'instanceSegmentationRfdetr', label: 'RF-DETR Nano Seg' },
+ { id: 'instanceSegmentationFastsamS', label: 'FastSAM-S' },
+ { id: 'instanceSegmentationFastsamX', label: 'FastSAM-X' },
],
},
{
@@ -284,6 +288,8 @@ export default function VisionCameraScreen() {
activeModel as
| 'instanceSegmentationYolo26n'
| 'instanceSegmentationRfdetr'
+ | 'instanceSegmentationFastsamS'
+ | 'instanceSegmentationFastsamX'
}
/>
)}
diff --git a/apps/computer-vision/components/ImageWithMasks.tsx b/apps/computer-vision/components/ImageWithMasks.tsx
index bd768909b2..8bb435f47a 100644
--- a/apps/computer-vision/components/ImageWithMasks.tsx
+++ b/apps/computer-vision/components/ImageWithMasks.tsx
@@ -156,7 +156,7 @@ export default function ImageWithMasks({
/>
{instances.length > 0 && (
-
+