From 2c95b20e85349bbd6788d66cf4dff3836dd54779 Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Tue, 5 May 2026 16:27:10 +0200 Subject: [PATCH 01/18] feat: implement initial FastSAM instance segmentation screen and model integration --- .cspell-wordlist.txt | 1 + apps/computer-vision/app/_layout.tsx | 8 + apps/computer-vision/app/fast_sam/index.tsx | 561 ++++++++++++++++++ apps/computer-vision/app/index.tsx | 6 + .../app/instance_segmentation/index.tsx | 4 + .../src/constants/commonVision.ts | 11 + .../src/constants/modelUrls.ts | 22 + packages/react-native-executorch/src/index.ts | 1 + .../InstanceSegmentationModule.ts | 16 + .../src/types/instanceSegmentation.ts | 4 +- .../src/utils/fastSAMPrompts.ts | 111 ++++ 11 files changed, 744 insertions(+), 1 deletion(-) create mode 100644 apps/computer-vision/app/fast_sam/index.tsx create mode 100644 packages/react-native-executorch/src/utils/fastSAMPrompts.ts diff --git a/.cspell-wordlist.txt b/.cspell-wordlist.txt index 84d006eefe..cbd5f6d67d 100644 --- a/.cspell-wordlist.txt +++ b/.cspell-wordlist.txt @@ -203,3 +203,4 @@ fishjam Fishjam deinitialize Deinitialize +fastsam diff --git a/apps/computer-vision/app/_layout.tsx b/apps/computer-vision/app/_layout.tsx index 03770c2720..7aed6af92f 100644 --- a/apps/computer-vision/app/_layout.tsx +++ b/apps/computer-vision/app/_layout.tsx @@ -189,6 +189,14 @@ export default function _layout() { headerTitleStyle: { color: ColorPalette.primary }, }} /> + ); diff --git a/apps/computer-vision/app/fast_sam/index.tsx b/apps/computer-vision/app/fast_sam/index.tsx new file mode 100644 index 0000000000..d47e538c8a --- /dev/null +++ b/apps/computer-vision/app/fast_sam/index.tsx @@ -0,0 +1,561 @@ +import React, { useContext, useMemo, useRef, useState } from 'react'; +import { + View, + StyleSheet, + Text, + TouchableOpacity, + GestureResponderEvent, + Image, +} from 'react-native'; +import { + Canvas, + Image as SkiaImage, + Rect, + Group, + useImage, + Skia, + AlphaType, + ColorType, +} from '@shopify/react-native-skia'; +import { + useInstanceSegmentation, + FASTSAM_S, + FASTSAM_X, + InstanceSegmentationModelSources, + SegmentedInstance, + FastSAMLabel, + selectByPoint, + selectByBox, + Bbox, +} from 'react-native-executorch'; +import { GeneratingContext } from '../../context'; +import { ModelPicker, ModelOption } from '../../components/ModelPicker'; +import { BottomBar } from '../../components/BottomBar'; +import { StatsBar } from '../../components/StatsBar'; +import Spinner from '../../components/Spinner'; +import ScreenWrapper from '../../ScreenWrapper'; +import { getImage } from '../../utils'; +import ColorPalette from '../../colors'; + +type PromptMode = 'point' | 'box'; + +const MODELS: ModelOption[] = [ + { label: 'FastSAM-s', value: FASTSAM_S }, + { label: 'FastSAM-x', value: FASTSAM_X }, +]; + +export default function FastSAMScreen() { + const { setGlobalGenerating } = useContext(GeneratingContext); + + const [selectedModel, setSelectedModel] = + useState(FASTSAM_S); + const [mode, setMode] = useState('point'); + const [inferenceTime, setInferenceTime] = useState(null); + + const [imageUri, setImageUri] = useState(''); + const [imageSize, setImageSize] = useState({ width: 0, height: 0 }); + + const rawInstancesRef = useRef[]>([]); + const [selection, setSelection] = useState | null>(null); + + const [draftBox, setDraftBox] = useState<{ + x1: number; + y1: number; + x2: number; + y2: number; + } | null>(null); + const boxStartRef = useRef<{ x: number; y: number } | null>(null); + + const sourceLayoutRef = 
useRef({ width: 0, height: 0 }); + const cutoutLayoutRef = useRef({ width: 0, height: 0 }); + const [cutoutLayout, setCutoutLayout] = useState({ width: 0, height: 0 }); + + const { isReady, isGenerating, downloadProgress, forward, error } = + useInstanceSegmentation({ model: selectedModel }); + + React.useEffect(() => { + setGlobalGenerating(isGenerating); + }, [isGenerating, setGlobalGenerating]); + + // ------------------------------------------------------------------------- + // Coordinate conversion (source image box) + // ------------------------------------------------------------------------- + + function touchToImageCoords(touchX: number, touchY: number) { + const { width: cw, height: ch } = sourceLayoutRef.current; + const { width: iw, height: ih } = imageSize; + if (iw === 0 || ih === 0) return null; + const scale = Math.min(cw / iw, ch / ih); + const offsetX = (cw - iw * scale) / 2; + const offsetY = (ch - ih * scale) / 2; + return { + x: (touchX - offsetX) / scale, + y: (touchY - offsetY) / scale, + }; + } + + // ------------------------------------------------------------------------- + // Point prompt + // ------------------------------------------------------------------------- + + function handleTap(e: GestureResponderEvent) { + if (mode !== 'point' || rawInstancesRef.current.length === 0) return; + const coords = touchToImageCoords( + e.nativeEvent.locationX, + e.nativeEvent.locationY + ); + if (!coords) return; + const match = selectByPoint( + rawInstancesRef.current, + Math.round(coords.x), + Math.round(coords.y) + ); + setSelection(match ?? null); + } + + // ------------------------------------------------------------------------- + // Box prompt + // ------------------------------------------------------------------------- + + function handleBoxStart(e: GestureResponderEvent) { + if (mode !== 'box') return; + const coords = touchToImageCoords( + e.nativeEvent.locationX, + e.nativeEvent.locationY + ); + if (!coords) return; + boxStartRef.current = coords; + setDraftBox({ x1: coords.x, y1: coords.y, x2: coords.x, y2: coords.y }); + } + + function handleBoxMove(e: GestureResponderEvent) { + if (mode !== 'box' || !boxStartRef.current) return; + const coords = touchToImageCoords( + e.nativeEvent.locationX, + e.nativeEvent.locationY + ); + if (!coords) return; + const s = boxStartRef.current; + setDraftBox({ + x1: Math.min(s.x, coords.x), + y1: Math.min(s.y, coords.y), + x2: Math.max(s.x, coords.x), + y2: Math.max(s.y, coords.y), + }); + } + + function handleBoxEnd(e: GestureResponderEvent) { + if ( + mode !== 'box' || + !boxStartRef.current || + rawInstancesRef.current.length === 0 + ) { + boxStartRef.current = null; + setDraftBox(null); + return; + } + const coords = touchToImageCoords( + e.nativeEvent.locationX, + e.nativeEvent.locationY + ); + const s = boxStartRef.current; + boxStartRef.current = null; + setDraftBox(null); + if (!coords) return; + const box: Bbox = { + x1: Math.min(s.x, coords.x), + y1: Math.min(s.y, coords.y), + x2: Math.max(s.x, coords.x), + y2: Math.max(s.y, coords.y), + }; + setSelection(selectByBox(rawInstancesRef.current, box) ?? null); + } + + // ------------------------------------------------------------------------- + // Image loading & inference + // ------------------------------------------------------------------------- + + const handleCameraPress = async (isCamera: boolean) => { + const image = await getImage(isCamera); + if (!image?.uri) return; + setImageUri(image.uri); + setImageSize({ width: image.width ?? 0, height: image.height ?? 
0 }); + rawInstancesRef.current = []; + setSelection(null); + setInferenceTime(null); + }; + + const runForward = async () => { + if (!imageUri) return; + try { + const start = Date.now(); + const output = await forward(imageUri, { + confidenceThreshold: 0.4, + iouThreshold: 0.9, + maxInstances: 100, + returnMaskAtOriginalResolution: true, + }); + setInferenceTime(Date.now() - start); + rawInstancesRef.current = output; + setSelection(null); + } catch (e) { + console.error(e); + } + }; + + // ------------------------------------------------------------------------- + // Cutout rendering + // ------------------------------------------------------------------------- + + const skiaSource = useImage(imageUri || null); + + const alphaMask = useMemo(() => { + if (!selection) return null; + return buildAlphaMask( + selection.mask, + selection.maskWidth, + selection.maskHeight, + selection.bbox.x1, + selection.bbox.y1, + imageSize.width, + imageSize.height + ); + }, [selection, imageSize]); + + const { width: cw, height: ch } = cutoutLayout; + const { width: iw, height: ih } = imageSize; + const cutoutScale = + cw > 0 && ch > 0 && iw > 0 && ih > 0 ? Math.min(cw / iw, ch / ih) : 1; + const cutoutOffsetX = (cw - iw * cutoutScale) / 2; + const cutoutOffsetY = (ch - ih * cutoutScale) / 2; + + // Draft box overlay coords (source box) + const { width: scw, height: sch } = sourceLayoutRef.current; + const srcScale = iw > 0 && ih > 0 ? Math.min(scw / iw, sch / ih) : 1; + const srcOffsetX = (scw - iw * srcScale) / 2; + const srcOffsetY = (sch - ih * srcScale) / 2; + + // ------------------------------------------------------------------------- + // Error / loading + // ------------------------------------------------------------------------- + + if (!isReady && error) { + return ( + + + Error Loading Model + {error.message} + + + ); + } + + if (!isReady) { + return ( + + ); + } + + return ( + + {/* ---- Source image box ---- */} + { + const { width, height } = e.nativeEvent.layout; + sourceLayoutRef.current = { width, height }; + }} + onTouchStart={(e) => { + if (mode === 'point') handleTap(e); + else handleBoxStart(e); + }} + onTouchMove={(e) => { + if (mode === 'box') handleBoxMove(e); + }} + onTouchEnd={(e) => { + if (mode === 'box') handleBoxEnd(e); + }} + > + + {!imageUri && ( + + Load an image to get started + + )} + {/* Draft box */} + {draftBox && iw > 0 && ( + + + + )} + + + {/* ---- Cutout box ---- */} + { + const { width, height } = e.nativeEvent.layout; + cutoutLayoutRef.current = { width, height }; + setCutoutLayout({ width, height }); + }} + > + {selection && skiaSource && alphaMask ? ( + + + + + + + + ) : ( + + + {rawInstancesRef.current.length > 0 + ? 'Tap or draw a box on the image above' + : imageUri + ? 'Run inference first' + : ''} + + + )} + + + {/* ---- Controls ---- */} + + + setMode('point')} + > + + Point + + + setMode('box')} + > + + Box + + + + + + { + setSelectedModel(m); + rawInstancesRef.current = []; + setSelection(null); + setInferenceTime(null); + }} + /> + + 0 + ? rawInstancesRef.current.length + : null + } + /> + + + + ); +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +// Builds a full-image alpha mask. `mask` is bbox-relative (maskWidth × maskHeight), +// positioned at (bboxX1, bboxY1) within an image of size (imgW × imgH). 
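+// The mask is nearest-neighbor resampled into a canvas capped at MAX_DIM px on
+// its longest side; covered pixels are written as opaque white RGBA, everything
+// else stays transparent, so the result can be used directly as a Skia cutout mask.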
+function buildAlphaMask( + mask: Uint8Array, + maskWidth: number, + maskHeight: number, + bboxX1: number, + bboxY1: number, + imgW: number, + imgH: number +) { + const MAX_DIM = 512; + const ds = Math.min(1, MAX_DIM / Math.max(imgW, imgH)); + const dstW = Math.max(1, Math.round(imgW * ds)); + const dstH = Math.max(1, Math.round(imgH * ds)); + + const pixels = new Uint8Array(dstW * dstH * 4); + + // Place the bbox-relative mask into the full-image canvas + const offX = Math.round(bboxX1 * ds); + const offY = Math.round(bboxY1 * ds); + const scaledMaskW = Math.max(1, Math.round(maskWidth * ds)); + const scaledMaskH = Math.max(1, Math.round(maskHeight * ds)); + + for (let dy = 0; dy < scaledMaskH; dy++) { + const sy = Math.min( + Math.floor((dy / scaledMaskH) * maskHeight), + maskHeight - 1 + ); + for (let dx = 0; dx < scaledMaskW; dx++) { + const sx = Math.min( + Math.floor((dx / scaledMaskW) * maskWidth), + maskWidth - 1 + ); + if (mask[sy * maskWidth + sx] > 0) { + const imgX = offX + dx; + const imgY = offY + dy; + if (imgX >= 0 && imgX < dstW && imgY >= 0 && imgY < dstH) { + const i = (imgY * dstW + imgX) * 4; + pixels[i] = 255; + pixels[i + 1] = 255; + pixels[i + 2] = 255; + pixels[i + 3] = 255; + } + } + } + } + + const data = Skia.Data.fromBytes(pixels); + const img = Skia.Image.MakeImage( + { + width: dstW, + height: dstH, + alphaType: AlphaType.Premul, + colorType: ColorType.RGBA_8888, + }, + data, + dstW * 4 + ); + data.dispose(); + return img; +} + +// --------------------------------------------------------------------------- +// Styles +// --------------------------------------------------------------------------- + +const styles = StyleSheet.create({ + imageBox: { + flex: 1, + width: '100%', + borderBottomWidth: 1, + borderBottomColor: '#e0e0e0', + }, + image: { + width: '100%', + height: '100%', + }, + hint: { + ...StyleSheet.absoluteFillObject, + justifyContent: 'center', + alignItems: 'center', + }, + hintText: { + fontSize: 14, + color: '#aaa', + }, + controls: { + flexDirection: 'row', + alignItems: 'center', + paddingHorizontal: 16, + paddingVertical: 10, + borderTopWidth: 1, + borderTopColor: '#e0e0e0', + }, + modeToggle: { + flexDirection: 'row', + borderRadius: 8, + overflow: 'hidden', + borderWidth: 1, + borderColor: ColorPalette.primary, + }, + modeBtn: { + paddingHorizontal: 20, + paddingVertical: 8, + backgroundColor: '#fff', + }, + modeBtnActive: { + backgroundColor: ColorPalette.primary, + }, + modeBtnText: { + fontSize: 14, + fontWeight: '600', + color: ColorPalette.primary, + }, + modeBtnTextActive: { + color: '#fff', + }, + errorContainer: { + flex: 1, + justifyContent: 'center', + alignItems: 'center', + padding: 32, + }, + errorTitle: { + fontSize: 20, + fontWeight: '700', + color: '#e74c3c', + marginBottom: 12, + }, + errorText: { + fontSize: 14, + color: '#555', + textAlign: 'center', + }, +}); diff --git a/apps/computer-vision/app/index.tsx b/apps/computer-vision/app/index.tsx index 15b9d8650b..690ebfb331 100644 --- a/apps/computer-vision/app/index.tsx +++ b/apps/computer-vision/app/index.tsx @@ -47,6 +47,12 @@ export default function Home() { > Pose Estimation + router.navigate('fast_sam/')} + > + FastSAM + router.navigate('ocr/')} diff --git a/apps/computer-vision/app/instance_segmentation/index.tsx b/apps/computer-vision/app/instance_segmentation/index.tsx index dba53875e5..f669c383d5 100644 --- a/apps/computer-vision/app/instance_segmentation/index.tsx +++ b/apps/computer-vision/app/instance_segmentation/index.tsx @@ -11,6 +11,8 @@ import { 
YOLO26X_SEG, RF_DETR_NANO_SEG, InstanceSegmentationModelSources, + FASTSAM_S, + FASTSAM_X, } from 'react-native-executorch'; import { View, @@ -35,6 +37,8 @@ const MODELS: ModelOption[] = [ { label: 'Yolo26L', value: YOLO26L_SEG }, { label: 'Yolo26X', value: YOLO26X_SEG }, { label: 'RF-DeTR Nano', value: RF_DETR_NANO_SEG }, + { label: 'FastSAM-S', value: FASTSAM_S }, + { label: 'FastSAM-X', value: FASTSAM_X }, ]; export default function InstanceSegmentationScreen() { diff --git a/packages/react-native-executorch/src/constants/commonVision.ts b/packages/react-native-executorch/src/constants/commonVision.ts index ecea0f8069..bac8e8c520 100644 --- a/packages/react-native-executorch/src/constants/commonVision.ts +++ b/packages/react-native-executorch/src/constants/commonVision.ts @@ -118,6 +118,17 @@ export enum CocoLabel { * @see {@link CocoLabel} for the RF-DETR / SSDLite variant * @category Types */ +/** + * Class label for FastSAM models. + * + * FastSAM is class-agnostic and produces a single "object" class for every + * detected region. Use this enum when working with `fastsam-s` or `fastsam-x`. + * @category Types + */ +export enum FastSAMLabel { + OBJECT = 0, +} + export enum CocoLabelYolo { PERSON = 0, BICYCLE = 1, diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts index 6895601e1e..6c24934176 100644 --- a/packages/react-native-executorch/src/constants/modelUrls.ts +++ b/packages/react-native-executorch/src/constants/modelUrls.ts @@ -1010,6 +1010,26 @@ export const SELFIE_SEGMENTATION = { modelSource: SELFIE_SEGMENTATION_MODEL, } as const; +// FastSAM Instance Segmentation +const FASTSAM_S_SEG_MODEL = `${URL_PREFIX}-fast-sam/${NEXT_VERSION_TAG}/fastsam-s/xnnpack/fastsam_s_xnnpack_fp32.pte`; +const FASTSAM_X_SEG_MODEL = `${URL_PREFIX}-fast-sam/${NEXT_VERSION_TAG}/fastsam-x/xnnpack/fastsam_x_xnnpack_fp32.pte`; + +/** + * @category Models - Instance Segmentation + */ +export const FASTSAM_S = { + modelName: 'fastsam-s', + modelSource: FASTSAM_S_SEG_MODEL, +} as const; + +/** + * @category Models - Instance Segmentation + */ +export const FASTSAM_X = { + modelName: 'fastsam-x', + modelSource: FASTSAM_X_SEG_MODEL, +} as const; + /** * @category Models - Instance Segmentation */ @@ -1352,6 +1372,8 @@ export const MODEL_REGISTRY = { YOLO26L_SEG, YOLO26X_SEG, RF_DETR_NANO_SEG, + FASTSAM_S, + FASTSAM_X, CLIP_VIT_BASE_PATCH32_IMAGE, CLIP_VIT_BASE_PATCH32_IMAGE_QUANTIZED, ALL_MINILM_L6_V2, diff --git a/packages/react-native-executorch/src/index.ts b/packages/react-native-executorch/src/index.ts index 96d167a7d2..6c88de830e 100644 --- a/packages/react-native-executorch/src/index.ts +++ b/packages/react-native-executorch/src/index.ts @@ -212,6 +212,7 @@ export * from './utils/BaseResourceFetcherClass'; export * from './utils/llm'; export * from './common/Logger'; export * from './utils/llms/context_strategy'; +export * from './utils/fastSAMPrompts'; // types export * from './types/objectDetection'; diff --git a/packages/react-native-executorch/src/modules/computer_vision/InstanceSegmentationModule.ts b/packages/react-native-executorch/src/modules/computer_vision/InstanceSegmentationModule.ts index 2e70e6bdec..e7e96f2deb 100644 --- a/packages/react-native-executorch/src/modules/computer_vision/InstanceSegmentationModule.ts +++ b/packages/react-native-executorch/src/modules/computer_vision/InstanceSegmentationModule.ts @@ -23,6 +23,7 @@ import { import { CocoLabel, CocoLabelYolo, + FastSAMLabel, IMAGENET1K_MEAN, 
IMAGENET1K_STD, } from '../../constants/commonVision'; @@ -39,6 +40,18 @@ const YOLO_SEG_CONFIG = { }, } satisfies InstanceSegmentationConfig; +const FASTSAM_CONFIG = { + preprocessorConfig: undefined, + labelMap: FastSAMLabel, + availableInputSizes: undefined, + defaultInputSize: undefined, + defaultConfidenceThreshold: 0.5, + defaultIouThreshold: 0.9, + postprocessorConfig: { + applyNMS: true, + }, +} satisfies InstanceSegmentationConfig; + const RF_DETR_NANO_SEG_CONFIG = { preprocessorConfig: { normMean: IMAGENET1K_MEAN, normStd: IMAGENET1K_STD }, labelMap: CocoLabel, @@ -81,10 +94,13 @@ const ModelConfigs = { 'yolo26l-seg': YOLO_SEG_CONFIG, 'yolo26x-seg': YOLO_SEG_CONFIG, 'rfdetr-nano-seg': RF_DETR_NANO_SEG_CONFIG, + 'fastsam-s': FASTSAM_CONFIG, + 'fastsam-x': FASTSAM_CONFIG, } as const satisfies Record< InstanceSegmentationModelName, | InstanceSegmentationConfig | InstanceSegmentationConfig + | InstanceSegmentationConfig >; /** @internal */ diff --git a/packages/react-native-executorch/src/types/instanceSegmentation.ts b/packages/react-native-executorch/src/types/instanceSegmentation.ts index 869f0cdcd7..ff7f4ae314 100644 --- a/packages/react-native-executorch/src/types/instanceSegmentation.ts +++ b/packages/react-native-executorch/src/types/instanceSegmentation.ts @@ -114,7 +114,9 @@ export type InstanceSegmentationModelSources = | { modelName: 'yolo26m-seg'; modelSource: ResourceSource } | { modelName: 'yolo26l-seg'; modelSource: ResourceSource } | { modelName: 'yolo26x-seg'; modelSource: ResourceSource } - | { modelName: 'rfdetr-nano-seg'; modelSource: ResourceSource }; + | { modelName: 'rfdetr-nano-seg'; modelSource: ResourceSource } + | { modelName: 'fastsam-s'; modelSource: ResourceSource } + | { modelName: 'fastsam-x'; modelSource: ResourceSource }; /** * Union of all built-in instance segmentation model names. diff --git a/packages/react-native-executorch/src/utils/fastSAMPrompts.ts b/packages/react-native-executorch/src/utils/fastSAMPrompts.ts new file mode 100644 index 0000000000..61f799ee40 --- /dev/null +++ b/packages/react-native-executorch/src/utils/fastSAMPrompts.ts @@ -0,0 +1,111 @@ +import { LabelEnum } from '../types/common'; +import { Bbox } from '../types/objectDetection'; +import { SegmentedInstance } from '../types/instanceSegmentation'; + +/** + * Selects the best matching instance for a given point prompt. + * + * Finds all instances whose mask covers the point (x, y), then returns the one + * with the smallest mask area (ties broken by box area, then confidence). This + * matches the behavior of FastSAM's point-prompt selection. + * @param instances - Array of segmented instances returned by `forward()`. + * @param x - X coordinate in original image space. + * @param y - Y coordinate in original image space. + * @returns The best matching instance, or `null` if no mask covers the point. 
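+ * @example
+ * A minimal usage sketch (the touch point below is a hypothetical coordinate in
+ * original image space; `model` is a loaded `useInstanceSegmentation` hook):
+ * ```ts
+ * const instances = await model.forward(imageUri);
+ * const hit = selectByPoint(instances, 240, 135);
+ * if (hit) console.log('selected bbox:', hit.bbox);
+ * ```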
+ */ +export function selectByPoint( + instances: SegmentedInstance[], + x: number, + y: number +): SegmentedInstance | null { + const px = Math.round(x); + const py = Math.round(y); + + const matches = instances.filter((inst) => { + const mx = px - Math.round(inst.bbox.x1); + const my = py - Math.round(inst.bbox.y1); + if (mx < 0 || my < 0 || mx >= inst.maskWidth || my >= inst.maskHeight) { + return false; + } + return inst.mask[my * inst.maskWidth + mx] === 1; + }); + + if (matches.length === 0) return null; + + return matches.reduce((best, inst) => { + const maskArea = countMaskPixels(inst.mask); + const bestMaskArea = countMaskPixels(best.mask); + if (maskArea !== bestMaskArea) return maskArea < bestMaskArea ? inst : best; + + const boxArea = bboxArea(inst.bbox); + const bestBoxArea = bboxArea(best.bbox); + if (boxArea !== bestBoxArea) return boxArea < bestBoxArea ? inst : best; + + return inst.score > best.score ? inst : best; + }); +} + +/** + * Selects the best matching instance for a given box prompt. + * + * Finds all instances that overlap with the prompt box, then returns the one + * with the highest IoU with that box (ties broken by smallest mask area, then + * highest confidence). This matches the behavior of FastSAM's box-prompt + * selection. + * @param instances - Array of segmented instances returned by `forward()`. + * @param box - The prompt bounding box in image coordinates. + * @returns The best matching instance, or `null` if no instance overlaps. + */ +export function selectByBox( + instances: SegmentedInstance[], + box: Bbox +): SegmentedInstance | null { + const { x1: px1, y1: py1, x2: px2, y2: py2 } = box; + const promptArea = Math.max(px2 - px1, 0) * Math.max(py2 - py1, 0); + + type Match = { + iou: number; + maskArea: number; + score: number; + inst: SegmentedInstance; + }; + let best: Match | null = null; + + for (const inst of instances) { + const { x1, y1, x2, y2 } = inst.bbox; + const interX1 = Math.max(px1, x1); + const interY1 = Math.max(py1, y1); + const interX2 = Math.min(px2, x2); + const interY2 = Math.min(py2, y2); + const interArea = + Math.max(interX2 - interX1, 0) * Math.max(interY2 - interY1, 0); + if (interArea <= 0) continue; + + const detArea = bboxArea(inst.bbox); + const iou = interArea / (promptArea + detArea - interArea + 1e-7); + const maskArea = countMaskPixels(inst.mask); + + if ( + best === null || + iou > best.iou || + (iou === best.iou && maskArea < best.maskArea) || + (iou === best.iou && + maskArea === best.maskArea && + inst.score > best.score) + ) { + best = { iou, maskArea, score: inst.score, inst }; + } + } + + return best?.inst ?? 
null; +} + +function countMaskPixels(mask: Uint8Array): number { + let count = 0; + for (let i = 0; i < mask.length; i++) count += mask[i]!; + return count; +} + +function bboxArea(bbox: Bbox): number { + return Math.max(bbox.x2 - bbox.x1, 0) * Math.max(bbox.y2 - bbox.y1, 0); +} From a8968324b4b7f710ceef745f2560e5bb03800bb5 Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Tue, 5 May 2026 17:40:14 +0200 Subject: [PATCH 02/18] feat: optimize FastSAM selection algorithms and improve performance logging --- apps/computer-vision/app/fast_sam/index.tsx | 24 ++++++++++++----- .../src/utils/fastSAMPrompts.ts | 26 +++---------------- 2 files changed, 22 insertions(+), 28 deletions(-) diff --git a/apps/computer-vision/app/fast_sam/index.tsx b/apps/computer-vision/app/fast_sam/index.tsx index d47e538c8a..b8fa556cb9 100644 --- a/apps/computer-vision/app/fast_sam/index.tsx +++ b/apps/computer-vision/app/fast_sam/index.tsx @@ -107,11 +107,13 @@ export default function FastSAMScreen() { e.nativeEvent.locationY ); if (!coords) return; + const t0 = Date.now(); const match = selectByPoint( rawInstancesRef.current, Math.round(coords.x), Math.round(coords.y) ); + console.log(`[FastSAM] selectByPoint(): ${Date.now() - t0}ms`); setSelection(match ?? null); } @@ -170,7 +172,10 @@ export default function FastSAMScreen() { x2: Math.max(s.x, coords.x), y2: Math.max(s.y, coords.y), }; - setSelection(selectByBox(rawInstancesRef.current, box) ?? null); + const t0 = Date.now(); + const match = selectByBox(rawInstancesRef.current, box); + console.log(`[FastSAM] selectByBox(): ${Date.now() - t0}ms`); + setSelection(match ?? null); } // ------------------------------------------------------------------------- @@ -190,14 +195,18 @@ export default function FastSAMScreen() { const runForward = async () => { if (!imageUri) return; try { - const start = Date.now(); + const t0 = Date.now(); const output = await forward(imageUri, { confidenceThreshold: 0.4, iouThreshold: 0.9, - maxInstances: 100, + maxInstances: 50, returnMaskAtOriginalResolution: true, }); - setInferenceTime(Date.now() - start); + const inferenceMs = Date.now() - t0; + console.log( + `[FastSAM] forward(): ${inferenceMs}ms, instances: ${output.length}` + ); + setInferenceTime(inferenceMs); rawInstancesRef.current = output; setSelection(null); } catch (e) { @@ -213,7 +222,8 @@ export default function FastSAMScreen() { const alphaMask = useMemo(() => { if (!selection) return null; - return buildAlphaMask( + const t0 = Date.now(); + const mask = buildAlphaMask( selection.mask, selection.maskWidth, selection.maskHeight, @@ -222,6 +232,8 @@ export default function FastSAMScreen() { imageSize.width, imageSize.height ); + console.log(`[FastSAM] buildAlphaMask(): ${Date.now() - t0}ms`); + return mask; }, [selection, imageSize]); const { width: cw, height: ch } = cutoutLayout; @@ -434,7 +446,7 @@ function buildAlphaMask( imgW: number, imgH: number ) { - const MAX_DIM = 512; + const MAX_DIM = 256; const ds = Math.min(1, MAX_DIM / Math.max(imgW, imgH)); const dstW = Math.max(1, Math.round(imgW * ds)); const dstH = Math.max(1, Math.round(imgH * ds)); diff --git a/packages/react-native-executorch/src/utils/fastSAMPrompts.ts b/packages/react-native-executorch/src/utils/fastSAMPrompts.ts index 61f799ee40..b2ee2f29c4 100644 --- a/packages/react-native-executorch/src/utils/fastSAMPrompts.ts +++ b/packages/react-native-executorch/src/utils/fastSAMPrompts.ts @@ -6,8 +6,7 @@ import { SegmentedInstance } from '../types/instanceSegmentation'; * Selects the best matching instance for 
a given point prompt. * * Finds all instances whose mask covers the point (x, y), then returns the one - * with the smallest mask area (ties broken by box area, then confidence). This - * matches the behavior of FastSAM's point-prompt selection. + * with the smallest bounding box area (ties broken by highest confidence). * @param instances - Array of segmented instances returned by `forward()`. * @param x - X coordinate in original image space. * @param y - Y coordinate in original image space. @@ -33,10 +32,6 @@ export function selectByPoint( if (matches.length === 0) return null; return matches.reduce((best, inst) => { - const maskArea = countMaskPixels(inst.mask); - const bestMaskArea = countMaskPixels(best.mask); - if (maskArea !== bestMaskArea) return maskArea < bestMaskArea ? inst : best; - const boxArea = bboxArea(inst.bbox); const bestBoxArea = bboxArea(best.bbox); if (boxArea !== bestBoxArea) return boxArea < bestBoxArea ? inst : best; @@ -49,9 +44,7 @@ export function selectByPoint( * Selects the best matching instance for a given box prompt. * * Finds all instances that overlap with the prompt box, then returns the one - * with the highest IoU with that box (ties broken by smallest mask area, then - * highest confidence). This matches the behavior of FastSAM's box-prompt - * selection. + * with the highest IoU with that box (ties broken by highest confidence). * @param instances - Array of segmented instances returned by `forward()`. * @param box - The prompt bounding box in image coordinates. * @returns The best matching instance, or `null` if no instance overlaps. @@ -65,7 +58,6 @@ export function selectByBox( type Match = { iou: number; - maskArea: number; score: number; inst: SegmentedInstance; }; @@ -83,29 +75,19 @@ export function selectByBox( const detArea = bboxArea(inst.bbox); const iou = interArea / (promptArea + detArea - interArea + 1e-7); - const maskArea = countMaskPixels(inst.mask); if ( best === null || iou > best.iou || - (iou === best.iou && maskArea < best.maskArea) || - (iou === best.iou && - maskArea === best.maskArea && - inst.score > best.score) + (iou === best.iou && inst.score > best.score) ) { - best = { iou, maskArea, score: inst.score, inst }; + best = { iou, score: inst.score, inst }; } } return best?.inst ?? null; } -function countMaskPixels(mask: Uint8Array): number { - let count = 0; - for (let i = 0; i < mask.length; i++) count += mask[i]!; - return count; -} - function bboxArea(bbox: Bbox): number { return Math.max(bbox.x2 - bbox.x1, 0) * Math.max(bbox.y2 - bbox.y1, 0); } From fe425df0d9454468400fae6b00c9a4dd35e0f2ff Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Tue, 5 May 2026 17:46:02 +0200 Subject: [PATCH 03/18] refactor: move FastSAMLabel enum definition to a more appropriate location --- .../src/constants/commonVision.ts | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/packages/react-native-executorch/src/constants/commonVision.ts b/packages/react-native-executorch/src/constants/commonVision.ts index bac8e8c520..6221d5701e 100644 --- a/packages/react-native-executorch/src/constants/commonVision.ts +++ b/packages/react-native-executorch/src/constants/commonVision.ts @@ -118,17 +118,6 @@ export enum CocoLabel { * @see {@link CocoLabel} for the RF-DETR / SSDLite variant * @category Types */ -/** - * Class label for FastSAM models. - * - * FastSAM is class-agnostic and produces a single "object" class for every - * detected region. Use this enum when working with `fastsam-s` or `fastsam-x`. 
- * @category Types - */ -export enum FastSAMLabel { - OBJECT = 0, -} - export enum CocoLabelYolo { PERSON = 0, BICYCLE = 1, @@ -211,3 +200,14 @@ export enum CocoLabelYolo { HAIR_DRIER = 78, TOOTHBRUSH = 79, } + +/** + * Class label for FastSAM models. + * + * FastSAM is class-agnostic and produces a single "object" class for every + * detected region. Use this enum when working with `fastsam-s` or `fastsam-x`. + * @category Types + */ +export enum FastSAMLabel { + OBJECT = 0, +} From fd94f6a8861efa090a01f7760bb55411d806ecb8 Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Wed, 6 May 2026 22:48:53 +0200 Subject: [PATCH 04/18] feat: add text prompts and modify the example app --- apps/computer-vision/app/_layout.tsx | 4 +- apps/computer-vision/app/fast_sam/index.tsx | 661 ++++++++---------- apps/computer-vision/app/index.tsx | 2 +- packages/react-native-executorch/src/index.ts | 2 +- ...AMPrompts.ts => segmentAnythingPrompts.ts} | 52 ++ 5 files changed, 365 insertions(+), 356 deletions(-) rename packages/react-native-executorch/src/utils/{fastSAMPrompts.ts => segmentAnythingPrompts.ts} (62%) diff --git a/apps/computer-vision/app/_layout.tsx b/apps/computer-vision/app/_layout.tsx index 7aed6af92f..bb7ab7165e 100644 --- a/apps/computer-vision/app/_layout.tsx +++ b/apps/computer-vision/app/_layout.tsx @@ -192,8 +192,8 @@ export default function _layout() { diff --git a/apps/computer-vision/app/fast_sam/index.tsx b/apps/computer-vision/app/fast_sam/index.tsx index b8fa556cb9..4118bf3419 100644 --- a/apps/computer-vision/app/fast_sam/index.tsx +++ b/apps/computer-vision/app/fast_sam/index.tsx @@ -1,31 +1,33 @@ -import React, { useContext, useMemo, useRef, useState } from 'react'; +import React, { useContext, useEffect, useRef, useState } from 'react'; import { View, StyleSheet, Text, + TextInput, TouchableOpacity, GestureResponderEvent, - Image, } from 'react-native'; import { Canvas, - Image as SkiaImage, Rect, - Group, - useImage, Skia, - AlphaType, - ColorType, + useImage, + type SkImage, } from '@shopify/react-native-skia'; import { useInstanceSegmentation, + useImageEmbeddings, + useTextEmbeddings, FASTSAM_S, FASTSAM_X, + CLIP_VIT_BASE_PATCH32_IMAGE_QUANTIZED, + CLIP_VIT_BASE_PATCH32_TEXT, InstanceSegmentationModelSources, SegmentedInstance, FastSAMLabel, selectByPoint, selectByBox, + selectByText, Bbox, } from 'react-native-executorch'; import { GeneratingContext } from '../../context'; @@ -34,17 +36,21 @@ import { BottomBar } from '../../components/BottomBar'; import { StatsBar } from '../../components/StatsBar'; import Spinner from '../../components/Spinner'; import ScreenWrapper from '../../ScreenWrapper'; +import ImageWithMasks, { + buildDisplayInstances, + DisplayInstance, +} from '../../components/ImageWithMasks'; import { getImage } from '../../utils'; import ColorPalette from '../../colors'; -type PromptMode = 'point' | 'box'; +type PromptMode = 'point' | 'box' | 'text'; const MODELS: ModelOption[] = [ - { label: 'FastSAM-s', value: FASTSAM_S }, - { label: 'FastSAM-x', value: FASTSAM_X }, + { label: 'FastSAM-S', value: FASTSAM_S }, + { label: 'FastSAM-X', value: FASTSAM_X }, ]; -export default function FastSAMScreen() { +export default function SegmentAnythingScreen() { const { setGlobalGenerating } = useContext(GeneratingContext); const [selectedModel, setSelectedModel] = @@ -56,131 +62,149 @@ export default function FastSAMScreen() { const [imageSize, setImageSize] = useState({ width: 0, height: 0 }); const rawInstancesRef = useRef[]>([]); - const [selection, setSelection] = 
useState | null>(null); - - const [draftBox, setDraftBox] = useState<{ - x1: number; - y1: number; - x2: number; - y2: number; - } | null>(null); - const boxStartRef = useRef<{ x: number; y: number } | null>(null); + const [selection, setSelection] = useState([]); - const sourceLayoutRef = useRef({ width: 0, height: 0 }); - const cutoutLayoutRef = useRef({ width: 0, height: 0 }); - const [cutoutLayout, setCutoutLayout] = useState({ width: 0, height: 0 }); + const [draftBox, setDraftBox] = useState(null); + const boxStartRef = useRef<{ x: number; y: number } | null>(null); + const layoutRef = useRef({ width: 0, height: 0 }); const { isReady, isGenerating, downloadProgress, forward, error } = useInstanceSegmentation({ model: selectedModel }); - React.useEffect(() => { + const clipImage = useImageEmbeddings({ + model: CLIP_VIT_BASE_PATCH32_IMAGE_QUANTIZED, + }); + const clipText = useTextEmbeddings({ model: CLIP_VIT_BASE_PATCH32_TEXT }); + const skiaSource = useImage(imageUri || null); + + const [textPrompt, setTextPrompt] = useState(''); + const [textBusy, setTextBusy] = useState(false); + const [embeddingProgress, setEmbeddingProgress] = useState<{ + done: number; + total: number; + } | null>(null); + const instanceEmbeddingsRef = useRef(null); + + useEffect(() => { setGlobalGenerating(isGenerating); }, [isGenerating, setGlobalGenerating]); - // ------------------------------------------------------------------------- - // Coordinate conversion (source image box) - // ------------------------------------------------------------------------- + function applyMatch( + match: SegmentedInstance | null + ): void { + setSelection(match ? buildDisplayInstances([match]) : []); + } function touchToImageCoords(touchX: number, touchY: number) { - const { width: cw, height: ch } = sourceLayoutRef.current; + const { width: cw, height: ch } = layoutRef.current; const { width: iw, height: ih } = imageSize; if (iw === 0 || ih === 0) return null; const scale = Math.min(cw / iw, ch / ih); - const offsetX = (cw - iw * scale) / 2; - const offsetY = (ch - ih * scale) / 2; return { - x: (touchX - offsetX) / scale, - y: (touchY - offsetY) / scale, + x: (touchX - (cw - iw * scale) / 2) / scale, + y: (touchY - (ch - ih * scale) / 2) / scale, }; } - // ------------------------------------------------------------------------- - // Point prompt - // ------------------------------------------------------------------------- - function handleTap(e: GestureResponderEvent) { if (mode !== 'point' || rawInstancesRef.current.length === 0) return; - const coords = touchToImageCoords( + const c = touchToImageCoords( e.nativeEvent.locationX, e.nativeEvent.locationY ); - if (!coords) return; - const t0 = Date.now(); - const match = selectByPoint( - rawInstancesRef.current, - Math.round(coords.x), - Math.round(coords.y) + if (!c) return; + applyMatch( + selectByPoint(rawInstancesRef.current, Math.round(c.x), Math.round(c.y)) ); - console.log(`[FastSAM] selectByPoint(): ${Date.now() - t0}ms`); - setSelection(match ?? 
null); } - // ------------------------------------------------------------------------- - // Box prompt - // ------------------------------------------------------------------------- - function handleBoxStart(e: GestureResponderEvent) { if (mode !== 'box') return; - const coords = touchToImageCoords( + const c = touchToImageCoords( e.nativeEvent.locationX, e.nativeEvent.locationY ); - if (!coords) return; - boxStartRef.current = coords; - setDraftBox({ x1: coords.x, y1: coords.y, x2: coords.x, y2: coords.y }); + if (!c) return; + boxStartRef.current = c; + setDraftBox({ x1: c.x, y1: c.y, x2: c.x, y2: c.y }); } function handleBoxMove(e: GestureResponderEvent) { if (mode !== 'box' || !boxStartRef.current) return; - const coords = touchToImageCoords( + const c = touchToImageCoords( e.nativeEvent.locationX, e.nativeEvent.locationY ); - if (!coords) return; + if (!c) return; const s = boxStartRef.current; setDraftBox({ - x1: Math.min(s.x, coords.x), - y1: Math.min(s.y, coords.y), - x2: Math.max(s.x, coords.x), - y2: Math.max(s.y, coords.y), + x1: Math.min(s.x, c.x), + y1: Math.min(s.y, c.y), + x2: Math.max(s.x, c.x), + y2: Math.max(s.y, c.y), }); } function handleBoxEnd(e: GestureResponderEvent) { - if ( - mode !== 'box' || - !boxStartRef.current || - rawInstancesRef.current.length === 0 - ) { - boxStartRef.current = null; - setDraftBox(null); - return; - } - const coords = touchToImageCoords( + if (mode !== 'box' || !boxStartRef.current) return; + const c = touchToImageCoords( e.nativeEvent.locationX, e.nativeEvent.locationY ); const s = boxStartRef.current; boxStartRef.current = null; setDraftBox(null); - if (!coords) return; - const box: Bbox = { - x1: Math.min(s.x, coords.x), - y1: Math.min(s.y, coords.y), - x2: Math.max(s.x, coords.x), - y2: Math.max(s.y, coords.y), - }; - const t0 = Date.now(); - const match = selectByBox(rawInstancesRef.current, box); - console.log(`[FastSAM] selectByBox(): ${Date.now() - t0}ms`); - setSelection(match ?? null); + if (!c || rawInstancesRef.current.length === 0) return; + applyMatch( + selectByBox(rawInstancesRef.current, { + x1: Math.min(s.x, c.x), + y1: Math.min(s.y, c.y), + x2: Math.max(s.x, c.x), + y2: Math.max(s.y, c.y), + }) + ); } - // ------------------------------------------------------------------------- - // Image loading & inference - // ------------------------------------------------------------------------- + async function runTextPrompt() { + const instances = rawInstancesRef.current; + if ( + !textPrompt.trim() || + instances.length === 0 || + !skiaSource || + !clipImage.isReady || + !clipText.isReady || + textBusy + ) { + return; + } + setTextBusy(true); + try { + if (!instanceEmbeddingsRef.current) { + setEmbeddingProgress({ done: 0, total: instances.length }); + const embeddings: Float32Array[] = []; + for (let i = 0; i < instances.length; i++) { + embeddings.push( + await cropAndEmbed( + skiaSource, + instances[i]!.bbox, + clipImage.forward + ) + ); + setEmbeddingProgress({ done: i + 1, total: instances.length }); + } + instanceEmbeddingsRef.current = embeddings; + setEmbeddingProgress(null); + } + const textEmb = await clipText.forward(textPrompt); + applyMatch( + selectByText(instances, instanceEmbeddingsRef.current, textEmb) + ); + } catch (e) { + console.error(e); + } finally { + setTextBusy(false); + } + } const handleCameraPress = async (isCamera: boolean) => { const image = await getImage(isCamera); @@ -188,71 +212,30 @@ export default function FastSAMScreen() { setImageUri(image.uri); setImageSize({ width: image.width ?? 
0, height: image.height ?? 0 }); rawInstancesRef.current = []; - setSelection(null); + instanceEmbeddingsRef.current = null; + setSelection([]); setInferenceTime(null); }; const runForward = async () => { if (!imageUri) return; try { - const t0 = Date.now(); + const start = Date.now(); const output = await forward(imageUri, { confidenceThreshold: 0.4, iouThreshold: 0.9, maxInstances: 50, returnMaskAtOriginalResolution: true, }); - const inferenceMs = Date.now() - t0; - console.log( - `[FastSAM] forward(): ${inferenceMs}ms, instances: ${output.length}` - ); - setInferenceTime(inferenceMs); + setInferenceTime(Date.now() - start); rawInstancesRef.current = output; - setSelection(null); + instanceEmbeddingsRef.current = null; + setSelection([]); } catch (e) { console.error(e); } }; - // ------------------------------------------------------------------------- - // Cutout rendering - // ------------------------------------------------------------------------- - - const skiaSource = useImage(imageUri || null); - - const alphaMask = useMemo(() => { - if (!selection) return null; - const t0 = Date.now(); - const mask = buildAlphaMask( - selection.mask, - selection.maskWidth, - selection.maskHeight, - selection.bbox.x1, - selection.bbox.y1, - imageSize.width, - imageSize.height - ); - console.log(`[FastSAM] buildAlphaMask(): ${Date.now() - t0}ms`); - return mask; - }, [selection, imageSize]); - - const { width: cw, height: ch } = cutoutLayout; - const { width: iw, height: ih } = imageSize; - const cutoutScale = - cw > 0 && ch > 0 && iw > 0 && ih > 0 ? Math.min(cw / iw, ch / ih) : 1; - const cutoutOffsetX = (cw - iw * cutoutScale) / 2; - const cutoutOffsetY = (ch - ih * cutoutScale) / 2; - - // Draft box overlay coords (source box) - const { width: scw, height: sch } = sourceLayoutRef.current; - const srcScale = iw > 0 && ih > 0 ? Math.min(scw / iw, sch / ih) : 1; - const srcOffsetX = (scw - iw * srcScale) / 2; - const srcOffsetY = (sch - ih * srcScale) / 2; - - // ------------------------------------------------------------------------- - // Error / loading - // ------------------------------------------------------------------------- - if (!isReady && error) { return ( @@ -268,137 +251,141 @@ export default function FastSAMScreen() { return ( ); } + const { width: cw, height: ch } = layoutRef.current; + const { width: iw, height: ih } = imageSize; + const drawScale = iw > 0 && ih > 0 ? Math.min(cw / iw, ch / ih) : 1; + const offsetX = (cw - iw * drawScale) / 2; + const offsetY = (ch - ih * drawScale) / 2; + + const stepHint = !imageUri + ? null + : inferenceTime === null + ? 'Tap Run to detect instances' + : rawInstancesRef.current.length === 0 + ? 'No instances detected — try another image' + : selection.length === 0 + ? 
'Tap a point, draw a box, or describe an object' + : null; + return ( - {/* ---- Source image box ---- */} - { - const { width, height } = e.nativeEvent.layout; - sourceLayoutRef.current = { width, height }; - }} - onTouchStart={(e) => { - if (mode === 'point') handleTap(e); - else handleBoxStart(e); - }} - onTouchMove={(e) => { - if (mode === 'box') handleBoxMove(e); - }} - onTouchEnd={(e) => { - if (mode === 'box') handleBoxEnd(e); - }} - > - - {!imageUri && ( - - Load an image to get started - - )} - {/* Draft box */} - {draftBox && iw > 0 && ( - - + + { + layoutRef.current = { + width: e.nativeEvent.layout.width, + height: e.nativeEvent.layout.height, + }; + }} + onTouchStart={(e) => { + if (mode === 'point') handleTap(e); + else if (mode === 'box') handleBoxStart(e); + }} + onTouchMove={handleBoxMove} + onTouchEnd={handleBoxEnd} + > + - - )} - - - {/* ---- Cutout box ---- */} - { - const { width, height } = e.nativeEvent.layout; - cutoutLayoutRef.current = { width, height }; - setCutoutLayout({ width, height }); - }} - > - {selection && skiaSource && alphaMask ? ( - - - - - - - - ) : ( - - - {rawInstancesRef.current.length > 0 - ? 'Tap or draw a box on the image above' - : imageUri - ? 'Run inference first' - : ''} - + {draftBox && iw > 0 && ( + + + + )} - )} + {!imageUri && ( + + Segment Anything + + Segment any object in an image. (1) Pick an image, (2) tap Run + to detect instances, (3) tap a point, draw a box, or describe an + object to segment it. + + + )} + - {/* ---- Controls ---- */} - - - setMode('point')} - > - {stepHint}} + + + {(['point', 'box', 'text'] as PromptMode[]).map((m) => { + const promptDisabled = rawInstancesRef.current.length === 0; + return ( + setMode(m)} + disabled={promptDisabled} > - Point - - + + {m[0]!.toUpperCase() + m.slice(1)} + + + ); + })} + + + {mode === 'text' && ( + + setMode('box')} + style={[styles.textBtn, textBusy && styles.textBtnDisabled]} + onPress={runTextPrompt} + disabled={ + !textPrompt.trim() || + textBusy || + rawInstancesRef.current.length === 0 || + !clipImage.isReady || + !clipText.isReady + } > - - Box - + {textBusy ? '…' : 'Find'} - + )} + {mode === 'text' && embeddingProgress && ( + + Embedding instances {embeddingProgress.done}/{embeddingProgress.total}{' '} + (subsequent text queries are instant) + + )} { setSelectedModel(m); rawInstancesRef.current = []; - setSelection(null); + instanceEmbeddingsRef.current = null; + setSelection([]); setInferenceTime(null); }} /> @@ -431,127 +419,100 @@ export default function FastSAMScreen() { ); } -// --------------------------------------------------------------------------- -// Helpers -// --------------------------------------------------------------------------- - -// Builds a full-image alpha mask. `mask` is bbox-relative (maskWidth × maskHeight), -// positioned at (bboxX1, bboxY1) within an image of size (imgW × imgH). 
-function buildAlphaMask( - mask: Uint8Array, - maskWidth: number, - maskHeight: number, - bboxX1: number, - bboxY1: number, - imgW: number, - imgH: number -) { - const MAX_DIM = 256; - const ds = Math.min(1, MAX_DIM / Math.max(imgW, imgH)); - const dstW = Math.max(1, Math.round(imgW * ds)); - const dstH = Math.max(1, Math.round(imgH * ds)); - - const pixels = new Uint8Array(dstW * dstH * 4); - - // Place the bbox-relative mask into the full-image canvas - const offX = Math.round(bboxX1 * ds); - const offY = Math.round(bboxY1 * ds); - const scaledMaskW = Math.max(1, Math.round(maskWidth * ds)); - const scaledMaskH = Math.max(1, Math.round(maskHeight * ds)); - - for (let dy = 0; dy < scaledMaskH; dy++) { - const sy = Math.min( - Math.floor((dy / scaledMaskH) * maskHeight), - maskHeight - 1 - ); - for (let dx = 0; dx < scaledMaskW; dx++) { - const sx = Math.min( - Math.floor((dx / scaledMaskW) * maskWidth), - maskWidth - 1 - ); - if (mask[sy * maskWidth + sx] > 0) { - const imgX = offX + dx; - const imgY = offY + dy; - if (imgX >= 0 && imgX < dstW && imgY >= 0 && imgY < dstH) { - const i = (imgY * dstW + imgX) * 4; - pixels[i] = 255; - pixels[i + 1] = 255; - pixels[i + 2] = 255; - pixels[i + 3] = 255; - } - } - } - } - - const data = Skia.Data.fromBytes(pixels); - const img = Skia.Image.MakeImage( +async function cropAndEmbed( + image: SkImage, + bbox: Bbox, + forward: (input: string) => Promise +): Promise { + const w = Math.max(1, Math.round(bbox.x2 - bbox.x1)); + const h = Math.max(1, Math.round(bbox.y2 - bbox.y1)); + const surface = Skia.Surface.MakeOffscreen(w, h); + if (!surface) throw new Error('Failed to create offscreen Skia surface'); + surface.getCanvas().drawImageRect( + image, { - width: dstW, - height: dstH, - alphaType: AlphaType.Premul, - colorType: ColorType.RGBA_8888, + x: bbox.x1, + y: bbox.y1, + width: bbox.x2 - bbox.x1, + height: bbox.y2 - bbox.y1, }, - data, - dstW * 4 + { x: 0, y: 0, width: w, height: h }, + Skia.Paint() ); - data.dispose(); - return img; + const base64 = surface.makeImageSnapshot().encodeToBase64(); + return forward(`data:image/png;base64,${base64}`); } -// --------------------------------------------------------------------------- -// Styles -// --------------------------------------------------------------------------- - const styles = StyleSheet.create({ - imageBox: { - flex: 1, - width: '100%', - borderBottomWidth: 1, - borderBottomColor: '#e0e0e0', - }, - image: { - width: '100%', - height: '100%', + container: { flex: 6, width: '100%' }, + imageContainer: { flex: 1, width: '100%', padding: 16 }, + imageTouchArea: { flex: 1, position: 'relative' }, + infoContainer: { alignItems: 'center', padding: 16, gap: 8 }, + infoTitle: { fontSize: 18, fontWeight: '600', color: 'navy' }, + infoText: { + fontSize: 14, + color: '#555', + textAlign: 'center', + lineHeight: 20, }, - hint: { - ...StyleSheet.absoluteFillObject, + modeRow: { + flexDirection: 'row', justifyContent: 'center', - alignItems: 'center', + paddingVertical: 8, + gap: 8, }, - hintText: { - fontSize: 14, - color: '#aaa', + modeBtn: { + paddingHorizontal: 18, + paddingVertical: 8, + borderRadius: 8, + borderWidth: 1, + borderColor: ColorPalette.primary, + backgroundColor: '#fff', }, - controls: { + modeBtnActive: { backgroundColor: ColorPalette.primary }, + modeBtnDisabled: { borderColor: '#cbd5e1', backgroundColor: '#f8fafc' }, + modeBtnText: { fontSize: 14, fontWeight: '600', color: ColorPalette.primary }, + modeBtnTextActive: { color: '#fff' }, + modeBtnTextDisabled: { color: '#cbd5e1' }, + 
textRow: { flexDirection: 'row', alignItems: 'center', paddingHorizontal: 16, - paddingVertical: 10, - borderTopWidth: 1, - borderTopColor: '#e0e0e0', + paddingBottom: 8, + gap: 8, }, - modeToggle: { - flexDirection: 'row', - borderRadius: 8, - overflow: 'hidden', + textInput: { + flex: 1, + backgroundColor: '#fff', borderWidth: 1, borderColor: ColorPalette.primary, + borderRadius: 12, + paddingHorizontal: 14, + paddingVertical: 12, + fontSize: 16, + color: '#0f172a', }, - modeBtn: { + textBtn: { + backgroundColor: ColorPalette.primary, + borderRadius: 12, paddingHorizontal: 20, - paddingVertical: 8, - backgroundColor: '#fff', + paddingVertical: 14, }, - modeBtnActive: { - backgroundColor: ColorPalette.primary, + textBtnDisabled: { backgroundColor: '#cbd5e1' }, + textBtnLabel: { color: '#fff', fontWeight: '700', fontSize: 16 }, + statusLine: { + paddingHorizontal: 16, + paddingBottom: 6, + fontSize: 12, + color: '#64748b', }, - modeBtnText: { - fontSize: 14, - fontWeight: '600', + stepHint: { + paddingHorizontal: 16, + paddingTop: 6, + fontSize: 13, + fontWeight: '500', color: ColorPalette.primary, - }, - modeBtnTextActive: { - color: '#fff', + textAlign: 'center', }, errorContainer: { flex: 1, @@ -565,9 +526,5 @@ const styles = StyleSheet.create({ color: '#e74c3c', marginBottom: 12, }, - errorText: { - fontSize: 14, - color: '#555', - textAlign: 'center', - }, + errorText: { fontSize: 14, color: '#555', textAlign: 'center' }, }); diff --git a/apps/computer-vision/app/index.tsx b/apps/computer-vision/app/index.tsx index 690ebfb331..f7c1dae5b7 100644 --- a/apps/computer-vision/app/index.tsx +++ b/apps/computer-vision/app/index.tsx @@ -51,7 +51,7 @@ export default function Home() { style={styles.button} onPress={() => router.navigate('fast_sam/')} > - FastSAM + Segment Anything ( function bboxArea(bbox: Bbox): number { return Math.max(bbox.x2 - bbox.x1, 0) * Math.max(bbox.y2 - bbox.y1, 0); } + +/** + * Selects the best matching instance for a text prompt. + * + * Returns the instance whose image embedding has the highest cosine similarity + * with the text embedding. The caller is responsible for producing the + * embeddings (e.g. with CLIP) and passing them in the same order as + * `instances`; embeddings do not need to be pre-normalized. + * @param instances - Array of segmented instances returned by `forward()`. + * @param instanceEmbeddings - Image embedding for each instance, in the same order as `instances`. + * @param textEmbedding - Embedding of the text prompt. + * @returns The best matching instance, or `null` if `instances` is empty. 
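+ * @example
+ * A minimal sketch, assuming CLIP hooks embed one bbox crop per instance
+ * (`cropToBbox` is a hypothetical helper; see the example app for a Skia-based one):
+ * ```ts
+ * const instanceEmbeddings = await Promise.all(
+ *   instances.map((i) => clipImage.forward(cropToBbox(imageUri, i.bbox)))
+ * );
+ * const textEmb = await clipText.forward('a red backpack');
+ * const match = selectByText(instances, instanceEmbeddings, textEmb);
+ * ```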
+ */ +export function selectByText( + instances: SegmentedInstance[], + instanceEmbeddings: Float32Array[], + textEmbedding: Float32Array +): SegmentedInstance | null { + if (instances.length === 0) return null; + if (instances.length !== instanceEmbeddings.length) { + throw new Error( + `selectByText: instances (${instances.length}) and instanceEmbeddings (${instanceEmbeddings.length}) must have the same length` + ); + } + + let textNormSq = 0; + for (let i = 0; i < textEmbedding.length; i++) { + const v = textEmbedding[i]!; + textNormSq += v * v; + } + const textNorm = Math.sqrt(textNormSq); + + let bestIdx = 0; + let bestScore = -Infinity; + for (let i = 0; i < instances.length; i++) { + const emb = instanceEmbeddings[i]!; + const n = Math.min(emb.length, textEmbedding.length); + let dot = 0; + let embNormSq = 0; + for (let j = 0; j < n; j++) { + const a = emb[j]!; + dot += a * textEmbedding[j]!; + embNormSq += a * a; + } + const score = dot / (Math.sqrt(embNormSq) * textNorm + 1e-7); + if (score > bestScore) { + bestScore = score; + bestIdx = i; + } + } + return instances[bestIdx]!; +} From c28dc716dfdffd1aee4cd0c2827f8d48884f240c Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Wed, 6 May 2026 23:37:22 +0200 Subject: [PATCH 05/18] docs: add initial docs generated --- .cspell-wordlist.txt | 1 + .../02-computer-vision/segment-anything.md | 143 ++++++++++++++++++ .../useInstanceSegmentation.md | 6 + 3 files changed, 150 insertions(+) create mode 100644 docs/docs/03-hooks/02-computer-vision/segment-anything.md diff --git a/.cspell-wordlist.txt b/.cspell-wordlist.txt index cbd5f6d67d..b9809a8734 100644 --- a/.cspell-wordlist.txt +++ b/.cspell-wordlist.txt @@ -204,3 +204,4 @@ Fishjam deinitialize Deinitialize fastsam +promptable diff --git a/docs/docs/03-hooks/02-computer-vision/segment-anything.md b/docs/docs/03-hooks/02-computer-vision/segment-anything.md new file mode 100644 index 0000000000..8cf974d034 --- /dev/null +++ b/docs/docs/03-hooks/02-computer-vision/segment-anything.md @@ -0,0 +1,143 @@ +--- +title: Segment Anything with FastSAM +--- + +[FastSAM](https://github.com/CASIA-IVA-Lab/FastSAM) is a class-agnostic, promptable segmentation model. Unlike YOLO or RF-DETR (which return labelled detections), FastSAM segments **every** instance in an image without classifying them — you then pick the one you want with a point, box, or text prompt. + +`FASTSAM_S` and `FASTSAM_X` are loaded with the regular [`useInstanceSegmentation`](./useInstanceSegmentation.md) hook. `react-native-executorch` ships three small selector utilities to pick an instance from the hook's output: `selectByPoint`, `selectByBox`, and `selectByText`. + +## API Reference + +- [`selectByPoint` API Reference](../../06-api-reference/functions/selectByPoint.md) +- [`selectByBox` API Reference](../../06-api-reference/functions/selectByBox.md) +- [`selectByText` API Reference](../../06-api-reference/functions/selectByText.md) + +## High Level Overview + +The workflow has three steps: + +1. Load `FASTSAM_S` (or `FASTSAM_X`) with `useInstanceSegmentation`. +2. Run `forward(image)` once — the result is every detected instance. +3. Use a selector to pick the one matching the user's prompt. Re-run a selector when the prompt changes; you don't need to call `forward` again. 
+ +```typescript +import { + useInstanceSegmentation, + selectByPoint, + selectByBox, + selectByText, + FASTSAM_S, +} from 'react-native-executorch'; + +const model = useInstanceSegmentation({ model: FASTSAM_S }); + +try { + const instances = await model.forward(imageUri); + + // Point: the smallest instance whose mask covers (x, y). + const a = selectByPoint(instances, x, y); + console.log('point match:', a?.bbox); + + // Box: the instance with highest IoU with the prompt box. + const b = selectByBox(instances, { x1, y1, x2, y2 }); + console.log('box match:', b?.bbox); + + // Text: highest cosine similarity between text and per-instance image + // embeddings (you must provide the embeddings, e.g. with CLIP). + const c = selectByText(instances, instanceEmbeddings, textEmbedding); + console.log('text match:', c?.bbox); +} catch (error) { + console.error(error); +} +``` + +The hook output is typed as [`SegmentedInstance`](../../06-api-reference/interfaces/SegmentedInstance.md). FastSAM emits a single label, [`FastSAMLabel.OBJECT`](../../06-api-reference/enumerations/FastSAMLabel.md) (`'OBJECT' = 0`). + +## Selecting by point + +`selectByPoint` returns the instance whose mask covers the point `(x, y)`. When several instances overlap (e.g. a small object inside a larger one), the one with the smallest bounding box wins; ties are broken by confidence. Returns `null` if no mask covers the point. + +It accepts three arguments: + +- `instances` (required) - The array of [`SegmentedInstance`](../../06-api-reference/interfaces/SegmentedInstance.md) returned by `forward()`. +- `x` (required) - X coordinate of the prompt point, in the **original image's** pixel space. +- `y` (required) - Y coordinate of the prompt point, in the **original image's** pixel space. + +:::info +`returnMaskAtOriginalResolution: true` (the default) is required for `selectByPoint` — masks must be in original image coordinates so they align with the touch coordinates passed in. +::: + +## Selecting by box + +`selectByBox` returns the instance with the highest IoU with the prompt box. Useful for "draw a box around what you want" UX. Returns `null` if no instance overlaps. + +It accepts two arguments: + +- `instances` (required) - The array of [`SegmentedInstance`](../../06-api-reference/interfaces/SegmentedInstance.md) returned by `forward()`. +- `box` (required) - A [`Bbox`](../../06-api-reference/interfaces/Bbox.md) (`{ x1, y1, x2, y2 }`) in the original image's pixel space. + +## Selecting by text + +`selectByText` returns the instance whose image embedding has the highest cosine similarity with the text embedding. The caller produces the embeddings — typically by cropping each instance's bbox and running [CLIP](./useImageEmbeddings.md) image encoder, plus running the [CLIP text encoder](../01-natural-language-processing/useTextEmbeddings.md) on the prompt. + +It accepts three arguments: + +- `instances` (required) - The array of [`SegmentedInstance`](../../06-api-reference/interfaces/SegmentedInstance.md) returned by `forward()`. +- `instanceEmbeddings` (required) - One `Float32Array` per instance, in the same order as `instances`. Throws if lengths differ. +- `textEmbedding` (required) - A `Float32Array` for the text prompt. + +Embeddings do not need to be pre-normalized. Returns `null` only when `instances` is empty. 
+ +### Example with CLIP + +```typescript +import { + useInstanceSegmentation, + useImageEmbeddings, + useTextEmbeddings, + selectByText, + FASTSAM_S, + CLIP_VIT_BASE_PATCH32_IMAGE_QUANTIZED, + CLIP_VIT_BASE_PATCH32_TEXT, +} from 'react-native-executorch'; + +function App() { + const sam = useInstanceSegmentation({ model: FASTSAM_S }); + const clipImage = useImageEmbeddings({ + model: CLIP_VIT_BASE_PATCH32_IMAGE_QUANTIZED, + }); + const clipText = useTextEmbeddings({ model: CLIP_VIT_BASE_PATCH32_TEXT }); + + const handlePrompt = async (imageUri: string, prompt: string) => { + if (!sam.isReady || !clipImage.isReady || !clipText.isReady) return; + + try { + const instances = await sam.forward(imageUri); + + // Embed each instance's bbox crop. Cropping is your responsibility — + // any image manipulator (e.g. expo-image-manipulator) works. + const instanceEmbeddings = await Promise.all( + instances.map((inst) => + clipImage.forward(cropToBbox(imageUri, inst.bbox)) + ) + ); + + const textEmb = await clipText.forward(prompt); + const match = selectByText(instances, instanceEmbeddings, textEmb); + console.log('match:', match?.bbox, match?.score); + } catch (error) { + console.error(error); + } + }; + + // ... +} +``` + +:::tip +Embedding all instances is the slow part of text prompts (one CLIP forward per instance). Cache `instanceEmbeddings` and reuse them across multiple text queries on the same image; only invalidate when you call `sam.forward` again. +::: + +## Example app + +The [`computer-vision`](https://github.com/software-mansion/react-native-executorch/tree/main/apps/computer-vision/app/fast_sam) example contains a working "Segment Anything" screen with all three prompt modes wired up. diff --git a/docs/docs/03-hooks/02-computer-vision/useInstanceSegmentation.md b/docs/docs/03-hooks/02-computer-vision/useInstanceSegmentation.md index 14e2ff8478..6b502348d9 100644 --- a/docs/docs/03-hooks/02-computer-vision/useInstanceSegmentation.md +++ b/docs/docs/03-hooks/02-computer-vision/useInstanceSegmentation.md @@ -132,3 +132,9 @@ YOLO models use the [`CocoLabelYolo`](../../06-api-reference/enumerations/CocoLa | yolo26l-seg | 80 | [COCO (YOLO)](../../06-api-reference/enumerations/CocoLabelYolo.md) | 384, 512, 640 | | yolo26x-seg | 80 | [COCO (YOLO)](../../06-api-reference/enumerations/CocoLabelYolo.md) | 384, 512, 640 | | rfdetr-nano-seg | 91 | [COCO](../../06-api-reference/enumerations/CocoLabel.md) | 312 (fixed) | +| fastsam-s | 1 | [FastSAMLabel](../../06-api-reference/enumerations/FastSAMLabel.md) | 640 (fixed) | +| fastsam-x | 1 | [FastSAMLabel](../../06-api-reference/enumerations/FastSAMLabel.md) | 640 (fixed) | + +:::tip +FastSAM models are class-agnostic — they segment every instance without classifying. To pick a specific instance from the output, use the [point/box/text selectors](./segment-anything.md). 
+::: From 52a035a9dd9b3c8a8aa1d45121de1d5a7db95345 Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Thu, 7 May 2026 00:58:33 +0200 Subject: [PATCH 06/18] feat: rename FastSAM screen and update documentation links to Segment Anything --- apps/computer-vision/app/_layout.tsx | 2 +- apps/computer-vision/app/index.tsx | 2 +- .../app/{fast_sam => segment_anything}/index.tsx | 0 docs/docs/03-hooks/02-computer-vision/segment-anything.md | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename apps/computer-vision/app/{fast_sam => segment_anything}/index.tsx (100%) diff --git a/apps/computer-vision/app/_layout.tsx b/apps/computer-vision/app/_layout.tsx index bb7ab7165e..a4868f92ae 100644 --- a/apps/computer-vision/app/_layout.tsx +++ b/apps/computer-vision/app/_layout.tsx @@ -190,7 +190,7 @@ export default function _layout() { }} /> router.navigate('fast_sam/')} + onPress={() => router.navigate('segment_anything/')} > Segment Anything diff --git a/apps/computer-vision/app/fast_sam/index.tsx b/apps/computer-vision/app/segment_anything/index.tsx similarity index 100% rename from apps/computer-vision/app/fast_sam/index.tsx rename to apps/computer-vision/app/segment_anything/index.tsx diff --git a/docs/docs/03-hooks/02-computer-vision/segment-anything.md b/docs/docs/03-hooks/02-computer-vision/segment-anything.md index 8cf974d034..50a7bd9536 100644 --- a/docs/docs/03-hooks/02-computer-vision/segment-anything.md +++ b/docs/docs/03-hooks/02-computer-vision/segment-anything.md @@ -140,4 +140,4 @@ Embedding all instances is the slow part of text prompts (one CLIP forward per i ## Example app -The [`computer-vision`](https://github.com/software-mansion/react-native-executorch/tree/main/apps/computer-vision/app/fast_sam) example contains a working "Segment Anything" screen with all three prompt modes wired up. +The [`computer-vision`](https://github.com/software-mansion/react-native-executorch/tree/main/apps/computer-vision/app/segment_anything) example contains a working "Segment Anything" screen with all three prompt modes wired up. 
From 376aca68a01359297995a9caed259c6383c7a5bc Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Thu, 7 May 2026 12:24:51 +0200 Subject: [PATCH 07/18] Update packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts Co-authored-by: Mateusz Sluszniak <56299341+msluszniak@users.noreply.github.com> --- .../src/utils/segmentAnythingPrompts.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts b/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts index b512bc2f66..b4263c1f36 100644 --- a/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts +++ b/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts @@ -117,8 +117,7 @@ export function selectByText( } let textNormSq = 0; - for (let i = 0; i < textEmbedding.length; i++) { - const v = textEmbedding[i]!; + for (const v of textEmbedding) { textNormSq += v * v; } const textNorm = Math.sqrt(textNormSq); From 20bd4ba5c24f4d1e3e22c2c2a6acfe67f5adf1ec Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Thu, 7 May 2026 16:53:10 +0200 Subject: [PATCH 08/18] feat: add CoreML models --- .../react-native-executorch/src/constants/modelUrls.ts | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts index 6c24934176..387dfdc3d8 100644 --- a/packages/react-native-executorch/src/constants/modelUrls.ts +++ b/packages/react-native-executorch/src/constants/modelUrls.ts @@ -1011,8 +1011,14 @@ export const SELFIE_SEGMENTATION = { } as const; // FastSAM Instance Segmentation -const FASTSAM_S_SEG_MODEL = `${URL_PREFIX}-fast-sam/${NEXT_VERSION_TAG}/fastsam-s/xnnpack/fastsam_s_xnnpack_fp32.pte`; -const FASTSAM_X_SEG_MODEL = `${URL_PREFIX}-fast-sam/${NEXT_VERSION_TAG}/fastsam-x/xnnpack/fastsam_x_xnnpack_fp32.pte`; +const FASTSAM_S_SEG_MODEL = + Platform.OS === 'ios' + ? `${URL_PREFIX}-fast-sam/${NEXT_VERSION_TAG}/fastsam-s/coreml/fastsam_s_coreml_fp16.pte` + : `${URL_PREFIX}-fast-sam/${NEXT_VERSION_TAG}/fastsam-s/xnnpack/fastsam_s_xnnpack_fp32.pte`; +const FASTSAM_X_SEG_MODEL = + Platform.OS === 'ios' + ? `${URL_PREFIX}-fast-sam/${NEXT_VERSION_TAG}/fastsam-x/coreml/fastsam_x_coreml_fp16.pte` + : `${URL_PREFIX}-fast-sam/${NEXT_VERSION_TAG}/fastsam-x/xnnpack/fastsam_x_xnnpack_fp32.pte`; /** * @category Models - Instance Segmentation From 16d6a0a1cf4f329da2718cd71738d6fe37e90fa6 Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Thu, 7 May 2026 16:53:39 +0200 Subject: [PATCH 09/18] fix: small fixes in segment anything example app --- .../app/segment_anything/index.tsx | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/apps/computer-vision/app/segment_anything/index.tsx b/apps/computer-vision/app/segment_anything/index.tsx index 4118bf3419..05cc14b558 100644 --- a/apps/computer-vision/app/segment_anything/index.tsx +++ b/apps/computer-vision/app/segment_anything/index.tsx @@ -363,21 +363,23 @@ export default function SegmentAnythingScreen() { onChangeText={setTextPrompt} onSubmitEditing={runTextPrompt} returnKeyType="search" - editable={!textBusy} /> - { + const findInactive = !textPrompt.trim() || - textBusy || rawInstancesRef.current.length === 0 || !clipImage.isReady || - !clipText.isReady - } - > - {textBusy ? 
'…' : 'Find'} - + !clipText.isReady; + return ( + + Find + + ); + })()} )} {mode === 'text' && embeddingProgress && ( @@ -392,6 +394,7 @@ export default function SegmentAnythingScreen() { selectedModel={selectedModel} disabled={isGenerating} onSelect={(m) => { + if (m.modelName === selectedModel.modelName) return; setSelectedModel(m); rawInstancesRef.current = []; instanceEmbeddingsRef.current = null; @@ -495,8 +498,9 @@ const styles = StyleSheet.create({ textBtn: { backgroundColor: ColorPalette.primary, borderRadius: 12, - paddingHorizontal: 20, paddingVertical: 14, + width: 80, + alignItems: 'center', }, textBtnDisabled: { backgroundColor: '#cbd5e1' }, textBtnLabel: { color: '#fff', fontWeight: '700', fontSize: 16 }, From f1202a5fa8d428a92d499a8d896feac8a370b0cd Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Thu, 7 May 2026 16:54:16 +0200 Subject: [PATCH 10/18] feat: add FastSAM to vision camera --- .../app/vision_camera/index.tsx | 6 +++++ .../tasks/InstanceSegmentationTask.tsx | 26 ++++++++++++++++--- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/apps/computer-vision/app/vision_camera/index.tsx b/apps/computer-vision/app/vision_camera/index.tsx index 4020d20023..7a399f443f 100644 --- a/apps/computer-vision/app/vision_camera/index.tsx +++ b/apps/computer-vision/app/vision_camera/index.tsx @@ -54,6 +54,8 @@ type ModelId = | 'segmentationSelfie' | 'instanceSegmentationYolo26n' | 'instanceSegmentationRfdetr' + | 'instanceSegmentationFastsamS' + | 'instanceSegmentationFastsamX' | 'poseEstimationYolo26n' | 'ocr' | 'styleTransferCandy' @@ -87,6 +89,8 @@ const TASKS: Task[] = [ variants: [ { id: 'instanceSegmentationYolo26n', label: 'YOLO26N Seg' }, { id: 'instanceSegmentationRfdetr', label: 'RF-DETR Nano Seg' }, + { id: 'instanceSegmentationFastsamS', label: 'FastSAM-S' }, + { id: 'instanceSegmentationFastsamX', label: 'FastSAM-X' }, ], }, { @@ -284,6 +288,8 @@ export default function VisionCameraScreen() { activeModel as | 'instanceSegmentationYolo26n' | 'instanceSegmentationRfdetr' + | 'instanceSegmentationFastsamS' + | 'instanceSegmentationFastsamX' } /> )} diff --git a/apps/computer-vision/components/vision_camera/tasks/InstanceSegmentationTask.tsx b/apps/computer-vision/components/vision_camera/tasks/InstanceSegmentationTask.tsx index 8bcdfb3844..51f892a0c7 100644 --- a/apps/computer-vision/components/vision_camera/tasks/InstanceSegmentationTask.tsx +++ b/apps/computer-vision/components/vision_camera/tasks/InstanceSegmentationTask.tsx @@ -6,9 +6,12 @@ import { SegmentedInstance, YOLO26N_SEG, RF_DETR_NANO_SEG, + FASTSAM_S, + FASTSAM_X, useInstanceSegmentation, CocoLabel, CocoLabelYolo, + FastSAMLabel, } from 'react-native-executorch'; import { Canvas, Image as SkiaImage } from '@shopify/react-native-skia'; import { labelColor, labelColorBg } from '../../utils/colors'; @@ -20,7 +23,9 @@ import { type InstSegModelId = | 'instanceSegmentationYolo26n' - | 'instanceSegmentationRfdetr'; + | 'instanceSegmentationRfdetr' + | 'instanceSegmentationFastsamS' + | 'instanceSegmentationFastsamX'; type Props = TaskProps & { activeModel: InstSegModelId }; @@ -44,9 +49,23 @@ export default function InstanceSegmentationTask({ model: RF_DETR_NANO_SEG, preventLoad: activeModel !== 'instanceSegmentationRfdetr', }); + const fastsamS = useInstanceSegmentation({ + model: FASTSAM_S, + preventLoad: activeModel !== 'instanceSegmentationFastsamS', + }); + const fastsamX = useInstanceSegmentation({ + model: FASTSAM_X, + preventLoad: activeModel !== 'instanceSegmentationFastsamX', + }); const 
active = - activeModel === 'instanceSegmentationYolo26n' ? yolo26n : rfdetr; + activeModel === 'instanceSegmentationYolo26n' + ? yolo26n + : activeModel === 'instanceSegmentationRfdetr' + ? rfdetr + : activeModel === 'instanceSegmentationFastsamS' + ? fastsamS + : fastsamX; const [instances, setInstances] = useState([]); const [imageSize, setImageSize] = useState({ width: 1, height: 1 }); @@ -74,7 +93,8 @@ export default function InstanceSegmentationTask({ (p: { results: | SegmentedInstance[] - | SegmentedInstance[]; + | SegmentedInstance[] + | SegmentedInstance[]; imageWidth: number; imageHeight: number; }) => { From decff98f4d1ef3441f9d7798e150a172ce4c79c2 Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Fri, 8 May 2026 12:09:43 +0200 Subject: [PATCH 11/18] refactor: simplify active model selection in InstanceSegmentationTask --- .../tasks/InstanceSegmentationTask.tsx | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/apps/computer-vision/components/vision_camera/tasks/InstanceSegmentationTask.tsx b/apps/computer-vision/components/vision_camera/tasks/InstanceSegmentationTask.tsx index 51f892a0c7..52251f6e3e 100644 --- a/apps/computer-vision/components/vision_camera/tasks/InstanceSegmentationTask.tsx +++ b/apps/computer-vision/components/vision_camera/tasks/InstanceSegmentationTask.tsx @@ -58,14 +58,12 @@ export default function InstanceSegmentationTask({ preventLoad: activeModel !== 'instanceSegmentationFastsamX', }); - const active = - activeModel === 'instanceSegmentationYolo26n' - ? yolo26n - : activeModel === 'instanceSegmentationRfdetr' - ? rfdetr - : activeModel === 'instanceSegmentationFastsamS' - ? fastsamS - : fastsamX; + const active = { + instanceSegmentationYolo26n: yolo26n, + instanceSegmentationRfdetr: rfdetr, + instanceSegmentationFastsamS: fastsamS, + instanceSegmentationFastsamX: fastsamX, + }[activeModel]; const [instances, setInstances] = useState([]); const [imageSize, setImageSize] = useState({ width: 1, height: 1 }); From 324e8d3c9961776dfbaa09ca1cd34dcd0f6206e6 Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Fri, 8 May 2026 12:39:18 +0200 Subject: [PATCH 12/18] fix: fix keyboard handling and layout for SegmentAnythingScreen --- .../app/segment_anything/index.tsx | 299 ++++++++++-------- 1 file changed, 162 insertions(+), 137 deletions(-) diff --git a/apps/computer-vision/app/segment_anything/index.tsx b/apps/computer-vision/app/segment_anything/index.tsx index 05cc14b558..d2b85b5463 100644 --- a/apps/computer-vision/app/segment_anything/index.tsx +++ b/apps/computer-vision/app/segment_anything/index.tsx @@ -5,7 +5,11 @@ import { Text, TextInput, TouchableOpacity, + TouchableWithoutFeedback, GestureResponderEvent, + Keyboard, + KeyboardAvoidingView, + Platform, } from 'react-native'; import { Canvas, @@ -166,6 +170,7 @@ export default function SegmentAnythingScreen() { } async function runTextPrompt() { + Keyboard.dismiss(); const instances = rawInstancesRef.current; if ( !textPrompt.trim() || @@ -207,6 +212,7 @@ export default function SegmentAnythingScreen() { } const handleCameraPress = async (isCamera: boolean) => { + Keyboard.dismiss(); const image = await getImage(isCamera); if (!image?.uri) return; setImageUri(image.uri); @@ -218,6 +224,7 @@ export default function SegmentAnythingScreen() { }; const runForward = async () => { + Keyboard.dismiss(); if (!imageUri) return; try { const start = Date.now(); @@ -274,150 +281,167 @@ export default function SegmentAnythingScreen() { return ( - - - { - layoutRef.current = { - width: 
e.nativeEvent.layout.width, - height: e.nativeEvent.layout.height, - }; - }} - onTouchStart={(e) => { - if (mode === 'point') handleTap(e); - else if (mode === 'box') handleBoxStart(e); - }} - onTouchMove={handleBoxMove} - onTouchEnd={handleBoxEnd} - > - - {draftBox && iw > 0 && ( - - + + + + { + layoutRef.current = { + width: e.nativeEvent.layout.width, + height: e.nativeEvent.layout.height, + }; + }} + onTouchStart={(e) => { + Keyboard.dismiss(); + if (mode === 'point') handleTap(e); + else if (mode === 'box') handleBoxStart(e); + }} + onTouchMove={handleBoxMove} + onTouchEnd={handleBoxEnd} + > + - - )} + {draftBox && iw > 0 && ( + + + + )} + + {!imageUri && ( + + Segment Anything + + Segment any object in an image. (1) Pick an image, (2) tap + Run to detect instances, (3) tap a point, draw a box, or + describe an object to segment it. + + + )} + + + + {stepHint && {stepHint}} + + + {(['point', 'box', 'text'] as PromptMode[]).map((m) => { + const promptDisabled = rawInstancesRef.current.length === 0; + return ( + { + if (m !== 'text') Keyboard.dismiss(); + setMode(m); + }} + disabled={promptDisabled} + > + + {m[0]!.toUpperCase() + m.slice(1)} + + + ); + })} - {!imageUri && ( - - Segment Anything - - Segment any object in an image. (1) Pick an image, (2) tap Run - to detect instances, (3) tap a point, draw a box, or describe an - object to segment it. - + + {mode === 'text' && ( + + + {(() => { + const findInactive = + !textPrompt.trim() || + rawInstancesRef.current.length === 0 || + !clipImage.isReady || + !clipText.isReady; + return ( + + Find + + ); + })()} )} - - - - {stepHint && {stepHint}} - - - {(['point', 'box', 'text'] as PromptMode[]).map((m) => { - const promptDisabled = rawInstancesRef.current.length === 0; - return ( - setMode(m)} - disabled={promptDisabled} - > - - {m[0]!.toUpperCase() + m.slice(1)} - - - ); - })} - - - {mode === 'text' && ( - - + Embedding instances {embeddingProgress.done}/ + {embeddingProgress.total} (subsequent text queries are instant) + + )} + + { + if (m.modelName === selectedModel.modelName) return; + setSelectedModel(m); + rawInstancesRef.current = []; + instanceEmbeddingsRef.current = null; + setSelection([]); + setInferenceTime(null); + }} /> - {(() => { - const findInactive = - !textPrompt.trim() || - rawInstancesRef.current.length === 0 || - !clipImage.isReady || - !clipText.isReady; - return ( - - Find - - ); - })()} - - )} - {mode === 'text' && embeddingProgress && ( - - Embedding instances {embeddingProgress.done}/{embeddingProgress.total}{' '} - (subsequent text queries are instant) - - )} - - { - if (m.modelName === selectedModel.modelName) return; - setSelectedModel(m); - rawInstancesRef.current = []; - instanceEmbeddingsRef.current = null; - setSelection([]); - setInferenceTime(null); - }} - /> - 0 - ? rawInstancesRef.current.length - : null - } - /> + 0 + ? 
rawInstancesRef.current.length + : null + } + /> - + + + ); } @@ -447,6 +471,7 @@ async function cropAndEmbed( } const styles = StyleSheet.create({ + flex: { flex: 1 }, container: { flex: 6, width: '100%' }, imageContainer: { flex: 1, width: '100%', padding: 16 }, imageTouchArea: { flex: 1, position: 'relative' }, From 69ce94d34ce1376acca60c1f7376a2852ab3c3aa Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Fri, 8 May 2026 17:05:34 +0200 Subject: [PATCH 13/18] fix: fix cropping logic in SegmentAnythingScreen for text prompts --- .../app/segment_anything/index.tsx | 89 ++++++++++++++++--- .../src/utils/segmentAnythingPrompts.ts | 38 ++++---- 2 files changed, 93 insertions(+), 34 deletions(-) diff --git a/apps/computer-vision/app/segment_anything/index.tsx b/apps/computer-vision/app/segment_anything/index.tsx index d2b85b5463..ae8087bd78 100644 --- a/apps/computer-vision/app/segment_anything/index.tsx +++ b/apps/computer-vision/app/segment_anything/index.tsx @@ -17,6 +17,8 @@ import { Skia, useImage, type SkImage, + ColorType, + AlphaType, } from '@shopify/react-native-skia'; import { useInstanceSegmentation, @@ -188,10 +190,14 @@ export default function SegmentAnythingScreen() { setEmbeddingProgress({ done: 0, total: instances.length }); const embeddings: Float32Array[] = []; for (let i = 0; i < instances.length; i++) { + const inst = instances[i]!; embeddings.push( await cropAndEmbed( skiaSource, - instances[i]!.bbox, + inst.bbox, + inst.mask, + inst.maskWidth, + inst.maskHeight, clipImage.forward ) ); @@ -201,9 +207,12 @@ export default function SegmentAnythingScreen() { setEmbeddingProgress(null); } const textEmb = await clipText.forward(textPrompt); - applyMatch( - selectByText(instances, instanceEmbeddingsRef.current, textEmb) + const match = selectByText( + instances, + instanceEmbeddingsRef.current, + textEmb ); + applyMatch(match); } catch (e) { console.error(e); } finally { @@ -449,24 +458,76 @@ export default function SegmentAnythingScreen() { async function cropAndEmbed( image: SkImage, bbox: Bbox, + mask: Uint8Array, + maskWidth: number, + maskHeight: number, forward: (input: string) => Promise ): Promise { - const w = Math.max(1, Math.round(bbox.x2 - bbox.x1)); - const h = Math.max(1, Math.round(bbox.y2 - bbox.y1)); - const surface = Skia.Surface.MakeOffscreen(w, h); + // FastSAM-style full-image white canvas, but with the mask applied: + // inside the bbox we keep image pixels where mask=1 and overwrite the + // rest with white. CLIP then sees a uniform white scene with only the + // segmented object visible at its original position/size. + const imgW = image.width(); + const imgH = image.height(); + const surface = Skia.Surface.MakeOffscreen(imgW, imgH); if (!surface) throw new Error('Failed to create offscreen Skia surface'); - surface.getCanvas().drawImageRect( - image, + const canvas = surface.getCanvas(); + canvas.clear(Skia.Color('white')); + + const x1 = Math.max(0, Math.round(bbox.x1)); + const y1 = Math.max(0, Math.round(bbox.y1)); + const x2 = Math.min(imgW, Math.round(bbox.x2)); + const y2 = Math.min(imgH, Math.round(bbox.y2)); + const w = x2 - x1; + const h = y2 - y1; + if (w > 0 && h > 0) { + canvas.drawImageRect( + image, + { x: x1, y: y1, width: w, height: h }, + { x: x1, y: y1, width: w, height: h }, + Skia.Paint() + ); + } + + // Inverse mask: opaque white where mask=0, transparent where mask=1. + // Drawn on top within the bbox, it overpaints non-mask pixels with white + // and leaves the segmented object intact. 
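+  // Concretely, with premultiplied alpha: mask=0 becomes RGBA
+  // (255, 255, 255, 255), opaque white; mask=1 becomes RGBA (0, 0, 0, 0),
+  // fully transparent, so the object underneath stays visible.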
+ const inversePixels = new Uint8Array(mask.length * 4); + for (let i = 0; i < mask.length; i++) { + const outside = mask[i]! === 0; + const idx = i * 4; + inversePixels[idx] = outside ? 255 : 0; + inversePixels[idx + 1] = outside ? 255 : 0; + inversePixels[idx + 2] = outside ? 255 : 0; + inversePixels[idx + 3] = outside ? 255 : 0; + } + const inverseData = Skia.Data.fromBytes(inversePixels); + const inverseMaskImg = Skia.Image.MakeImage( { - x: bbox.x1, - y: bbox.y1, - width: bbox.x2 - bbox.x1, - height: bbox.y2 - bbox.y1, + width: maskWidth, + height: maskHeight, + colorType: ColorType.RGBA_8888, + alphaType: AlphaType.Premul, }, - { x: 0, y: 0, width: w, height: h }, - Skia.Paint() + inverseData, + maskWidth * 4 ); + if (inverseMaskImg) { + canvas.drawImageRect( + inverseMaskImg, + { x: 0, y: 0, width: maskWidth, height: maskHeight }, + { + x: bbox.x1, + y: bbox.y1, + width: bbox.x2 - bbox.x1, + height: bbox.y2 - bbox.y1, + }, + Skia.Paint() + ); + } + const base64 = surface.makeImageSnapshot().encodeToBase64(); + inverseData.dispose(); return forward(`data:image/png;base64,${base64}`); } diff --git a/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts b/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts index b4263c1f36..9c8b69ba1f 100644 --- a/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts +++ b/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts @@ -98,7 +98,7 @@ function bboxArea(bbox: Bbox): number { * Returns the instance whose image embedding has the highest cosine similarity * with the text embedding. The caller is responsible for producing the * embeddings (e.g. with CLIP) and passing them in the same order as - * `instances`; embeddings do not need to be pre-normalized. + * `instances`. * @param instances - Array of segmented instances returned by `forward()`. * @param instanceEmbeddings - Image embedding for each instance, in the same order as `instances`. * @param textEmbedding - Embedding of the text prompt. @@ -116,29 +116,27 @@ export function selectByText( ); } - let textNormSq = 0; - for (const v of textEmbedding) { - textNormSq += v * v; - } - const textNorm = Math.sqrt(textNormSq); - + const scores = calculateDotProducts(instanceEmbeddings, textEmbedding); let bestIdx = 0; let bestScore = -Infinity; - for (let i = 0; i < instances.length; i++) { - const emb = instanceEmbeddings[i]!; - const n = Math.min(emb.length, textEmbedding.length); - let dot = 0; - let embNormSq = 0; - for (let j = 0; j < n; j++) { - const a = emb[j]!; - dot += a * textEmbedding[j]!; - embNormSq += a * a; - } - const score = dot / (Math.sqrt(embNormSq) * textNorm + 1e-7); - if (score > bestScore) { - bestScore = score; + for (let i = 0; i < scores.length; i++) { + if (scores[i]! > bestScore) { + bestScore = scores[i]!; bestIdx = i; } } return instances[bestIdx]!; } + +function calculateDotProducts( + instanceEmbeddings: Float32Array[], + textEmbedding: Float32Array +): number[] { + return instanceEmbeddings.map((emb) => { + let dot = 0; + for (let j = 0; j < emb.length; j++) { + dot += emb[j]! 
* textEmbedding[j]!; + } + return dot; + }); +} From 8f84b1761cbeafd9cb572104e830de578205a62c Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Fri, 8 May 2026 17:37:58 +0200 Subject: [PATCH 14/18] feat: add common vision utilities --- .../src/utils/commonVision.ts | 10 ++++++ .../src/utils/segmentAnythingPrompts.ts | 33 ++++++++----------- 2 files changed, 23 insertions(+), 20 deletions(-) create mode 100644 packages/react-native-executorch/src/utils/commonVision.ts diff --git a/packages/react-native-executorch/src/utils/commonVision.ts b/packages/react-native-executorch/src/utils/commonVision.ts new file mode 100644 index 0000000000..7cd9b2a44b --- /dev/null +++ b/packages/react-native-executorch/src/utils/commonVision.ts @@ -0,0 +1,10 @@ +import { Bbox } from '../types/objectDetection'; + +/** + * Calculates the area of a bounding box. + * @param bbox - Bounding box to calculate area for. + * @returns Area of the bounding box. + */ +export function bboxArea(bbox: Bbox): number { + return Math.max(bbox.x2 - bbox.x1, 0) * Math.max(bbox.y2 - bbox.y1, 0); +} diff --git a/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts b/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts index 9c8b69ba1f..f162de5e83 100644 --- a/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts +++ b/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts @@ -1,6 +1,7 @@ import { LabelEnum } from '../types/common'; import { Bbox } from '../types/objectDetection'; import { SegmentedInstance } from '../types/instanceSegmentation'; +import { bboxArea } from './commonVision'; /** * Selects the best matching instance for a given point prompt. @@ -54,7 +55,7 @@ export function selectByBox( box: Bbox ): SegmentedInstance | null { const { x1: px1, y1: py1, x2: px2, y2: py2 } = box; - const promptArea = Math.max(px2 - px1, 0) * Math.max(py2 - py1, 0); + const promptArea = bboxArea(box); type Match = { iou: number; @@ -88,10 +89,6 @@ export function selectByBox( return best?.inst ?? null; } -function bboxArea(bbox: Bbox): number { - return Math.max(bbox.x2 - bbox.x1, 0) * Math.max(bbox.y2 - bbox.y1, 0); -} - /** * Selects the best matching instance for a text prompt. * @@ -112,11 +109,20 @@ export function selectByText( if (instances.length === 0) return null; if (instances.length !== instanceEmbeddings.length) { throw new Error( - `selectByText: instances (${instances.length}) and instanceEmbeddings (${instanceEmbeddings.length}) must have the same length` + `selectByText: instances (${instances.length})` + + `and instanceEmbeddings (${instanceEmbeddings.length})` + + `must have the same length` ); } - const scores = calculateDotProducts(instanceEmbeddings, textEmbedding); + const scores = instanceEmbeddings.map((emb) => { + let dot = 0; + for (let j = 0; j < emb.length; j++) { + dot += emb[j]! * textEmbedding[j]!; + } + return dot; + }); + let bestIdx = 0; let bestScore = -Infinity; for (let i = 0; i < scores.length; i++) { @@ -127,16 +133,3 @@ export function selectByText( } return instances[bestIdx]!; } - -function calculateDotProducts( - instanceEmbeddings: Float32Array[], - textEmbedding: Float32Array -): number[] { - return instanceEmbeddings.map((emb) => { - let dot = 0; - for (let j = 0; j < emb.length; j++) { - dot += emb[j]! 
* textEmbedding[j]!; - } - return dot; - }); -} From 7dc462f02a8ae73eaf5cc67ee3e5c5abcc873c5a Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Mon, 11 May 2026 11:41:53 +0200 Subject: [PATCH 15/18] docs: update docs --- .../02-computer-vision/segment-anything.md | 143 ------------------ .../useInstanceSegmentation.md | 42 ++++- 2 files changed, 41 insertions(+), 144 deletions(-) delete mode 100644 docs/docs/03-hooks/02-computer-vision/segment-anything.md diff --git a/docs/docs/03-hooks/02-computer-vision/segment-anything.md b/docs/docs/03-hooks/02-computer-vision/segment-anything.md deleted file mode 100644 index 50a7bd9536..0000000000 --- a/docs/docs/03-hooks/02-computer-vision/segment-anything.md +++ /dev/null @@ -1,143 +0,0 @@ ---- -title: Segment Anything with FastSAM ---- - -[FastSAM](https://github.com/CASIA-IVA-Lab/FastSAM) is a class-agnostic, promptable segmentation model. Unlike YOLO or RF-DETR (which return labelled detections), FastSAM segments **every** instance in an image without classifying them — you then pick the one you want with a point, box, or text prompt. - -`FASTSAM_S` and `FASTSAM_X` are loaded with the regular [`useInstanceSegmentation`](./useInstanceSegmentation.md) hook. `react-native-executorch` ships three small selector utilities to pick an instance from the hook's output: `selectByPoint`, `selectByBox`, and `selectByText`. - -## API Reference - -- [`selectByPoint` API Reference](../../06-api-reference/functions/selectByPoint.md) -- [`selectByBox` API Reference](../../06-api-reference/functions/selectByBox.md) -- [`selectByText` API Reference](../../06-api-reference/functions/selectByText.md) - -## High Level Overview - -The workflow has three steps: - -1. Load `FASTSAM_S` (or `FASTSAM_X`) with `useInstanceSegmentation`. -2. Run `forward(image)` once — the result is every detected instance. -3. Use a selector to pick the one matching the user's prompt. Re-run a selector when the prompt changes; you don't need to call `forward` again. - -```typescript -import { - useInstanceSegmentation, - selectByPoint, - selectByBox, - selectByText, - FASTSAM_S, -} from 'react-native-executorch'; - -const model = useInstanceSegmentation({ model: FASTSAM_S }); - -try { - const instances = await model.forward(imageUri); - - // Point: the smallest instance whose mask covers (x, y). - const a = selectByPoint(instances, x, y); - console.log('point match:', a?.bbox); - - // Box: the instance with highest IoU with the prompt box. - const b = selectByBox(instances, { x1, y1, x2, y2 }); - console.log('box match:', b?.bbox); - - // Text: highest cosine similarity between text and per-instance image - // embeddings (you must provide the embeddings, e.g. with CLIP). - const c = selectByText(instances, instanceEmbeddings, textEmbedding); - console.log('text match:', c?.bbox); -} catch (error) { - console.error(error); -} -``` - -The hook output is typed as [`SegmentedInstance`](../../06-api-reference/interfaces/SegmentedInstance.md). FastSAM emits a single label, [`FastSAMLabel.OBJECT`](../../06-api-reference/enumerations/FastSAMLabel.md) (`'OBJECT' = 0`). - -## Selecting by point - -`selectByPoint` returns the instance whose mask covers the point `(x, y)`. When several instances overlap (e.g. a small object inside a larger one), the one with the smallest bounding box wins; ties are broken by confidence. Returns `null` if no mask covers the point. 
- -It accepts three arguments: - -- `instances` (required) - The array of [`SegmentedInstance`](../../06-api-reference/interfaces/SegmentedInstance.md) returned by `forward()`. -- `x` (required) - X coordinate of the prompt point, in the **original image's** pixel space. -- `y` (required) - Y coordinate of the prompt point, in the **original image's** pixel space. - -:::info -`returnMaskAtOriginalResolution: true` (the default) is required for `selectByPoint` — masks must be in original image coordinates so they align with the touch coordinates passed in. -::: - -## Selecting by box - -`selectByBox` returns the instance with the highest IoU with the prompt box. Useful for "draw a box around what you want" UX. Returns `null` if no instance overlaps. - -It accepts two arguments: - -- `instances` (required) - The array of [`SegmentedInstance`](../../06-api-reference/interfaces/SegmentedInstance.md) returned by `forward()`. -- `box` (required) - A [`Bbox`](../../06-api-reference/interfaces/Bbox.md) (`{ x1, y1, x2, y2 }`) in the original image's pixel space. - -## Selecting by text - -`selectByText` returns the instance whose image embedding has the highest cosine similarity with the text embedding. The caller produces the embeddings — typically by cropping each instance's bbox and running [CLIP](./useImageEmbeddings.md) image encoder, plus running the [CLIP text encoder](../01-natural-language-processing/useTextEmbeddings.md) on the prompt. - -It accepts three arguments: - -- `instances` (required) - The array of [`SegmentedInstance`](../../06-api-reference/interfaces/SegmentedInstance.md) returned by `forward()`. -- `instanceEmbeddings` (required) - One `Float32Array` per instance, in the same order as `instances`. Throws if lengths differ. -- `textEmbedding` (required) - A `Float32Array` for the text prompt. - -Embeddings do not need to be pre-normalized. Returns `null` only when `instances` is empty. - -### Example with CLIP - -```typescript -import { - useInstanceSegmentation, - useImageEmbeddings, - useTextEmbeddings, - selectByText, - FASTSAM_S, - CLIP_VIT_BASE_PATCH32_IMAGE_QUANTIZED, - CLIP_VIT_BASE_PATCH32_TEXT, -} from 'react-native-executorch'; - -function App() { - const sam = useInstanceSegmentation({ model: FASTSAM_S }); - const clipImage = useImageEmbeddings({ - model: CLIP_VIT_BASE_PATCH32_IMAGE_QUANTIZED, - }); - const clipText = useTextEmbeddings({ model: CLIP_VIT_BASE_PATCH32_TEXT }); - - const handlePrompt = async (imageUri: string, prompt: string) => { - if (!sam.isReady || !clipImage.isReady || !clipText.isReady) return; - - try { - const instances = await sam.forward(imageUri); - - // Embed each instance's bbox crop. Cropping is your responsibility — - // any image manipulator (e.g. expo-image-manipulator) works. - const instanceEmbeddings = await Promise.all( - instances.map((inst) => - clipImage.forward(cropToBbox(imageUri, inst.bbox)) - ) - ); - - const textEmb = await clipText.forward(prompt); - const match = selectByText(instances, instanceEmbeddings, textEmb); - console.log('match:', match?.bbox, match?.score); - } catch (error) { - console.error(error); - } - }; - - // ... -} -``` - -:::tip -Embedding all instances is the slow part of text prompts (one CLIP forward per instance). Cache `instanceEmbeddings` and reuse them across multiple text queries on the same image; only invalidate when you call `sam.forward` again. 
-:::
-
-## Example app
-
-The [`computer-vision`](https://github.com/software-mansion/react-native-executorch/tree/main/apps/computer-vision/app/segment_anything) example contains a working "Segment Anything" screen with all three prompt modes wired up.
diff --git a/docs/docs/03-hooks/02-computer-vision/useInstanceSegmentation.md b/docs/docs/03-hooks/02-computer-vision/useInstanceSegmentation.md
index 6b502348d9..b9bdaa2774 100644
--- a/docs/docs/03-hooks/02-computer-vision/useInstanceSegmentation.md
+++ b/docs/docs/03-hooks/02-computer-vision/useInstanceSegmentation.md
@@ -136,5 +136,45 @@ YOLO models use the [`CocoLabelYolo`](../../06-api-reference/enumerations/CocoLa
 | fastsam-x | 1 | [FastSAMLabel](../../06-api-reference/enumerations/FastSAMLabel.md) | 640 (fixed) |

 :::tip
-FastSAM models are class-agnostic — they segment every instance without classifying. To pick a specific instance from the output, use the [point/box/text selectors](./segment-anything.md).
+FastSAM models are class-agnostic, so they segment every instance without classifying it. That makes them a good fit for promptable selection workflows.
 :::
+
+## Promptable selection
+
+Instance segmentation models return a list of segmented instances. After `forward()`, you can use prompt-based selectors to pick the instance you want. Use point selection for tap-to-select or cutout tools, box selection for drag-to-outline workflows, and text selection for search or describe-it-in-words flows. For example, a photo-editing app can use point selection to isolate a person, a custom sticker or background-removal flow can use box selection, and a shopping app can use text selection to find a product by name or description:
+
+1. Load an instance segmentation model with `useInstanceSegmentation`.
+2. Run `forward(image)` once to get the detected instances.
+3. Use a selector to pick the instance or instances matching the user's prompt.
+4. Re-run the selector when the prompt changes; you do not need to call `forward` again unless the image changes.
+
+```typescript
+import {
+  useInstanceSegmentation,
+  selectByPoint,
+  selectByBox,
+  selectByText,
+  FASTSAM_X,
+} from 'react-native-executorch';
+
+const model = useInstanceSegmentation({ model: FASTSAM_X });
+
+try {
+  const instances = await model.forward(imageUri);
+
+  // Point: the smallest instance whose mask covers (x, y).
+  const pointMatch = selectByPoint(instances, x, y);
+  console.log('point match:', pointMatch?.bbox);
+
+  // Box: the instance with highest IoU with the prompt box.
+  const boxMatch = selectByBox(instances, { x1, y1, x2, y2 });
+  console.log('box match:', boxMatch?.bbox);
+
+  // Text: highest cosine similarity between text and per-instance image
+  // embeddings (you must provide the embeddings, e.g. with CLIP).
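+  // A typical recipe (illustrative, not part of the API): crop each
+  // instance's bbox, embed every crop with a CLIP image encoder, and embed
+  // the prompt with the matching CLIP text encoder, keeping the same order
+  // as `instances`.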
+ const textMatch = selectByText(instances, instanceEmbeddings, textEmbedding); + console.log('text match:', textMatch?.bbox); +} catch (error) { + console.error(error); +} +``` From ee0403cf9e447ff6b9a16b0367fa77e3a414551d Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Mon, 11 May 2026 12:11:11 +0200 Subject: [PATCH 16/18] feat: enhance selectByText function to support multiple top matches --- .cspell-wordlist.txt | 1 + .../app/segment_anything/index.tsx | 7 --- .../src/utils/segmentAnythingPrompts.ts | 52 +++++++++++++------ 3 files changed, 38 insertions(+), 22 deletions(-) diff --git a/.cspell-wordlist.txt b/.cspell-wordlist.txt index b9809a8734..de81c01b40 100644 --- a/.cspell-wordlist.txt +++ b/.cspell-wordlist.txt @@ -205,3 +205,4 @@ deinitialize Deinitialize fastsam promptable +topk diff --git a/apps/computer-vision/app/segment_anything/index.tsx b/apps/computer-vision/app/segment_anything/index.tsx index ae8087bd78..037a988327 100644 --- a/apps/computer-vision/app/segment_anything/index.tsx +++ b/apps/computer-vision/app/segment_anything/index.tsx @@ -463,10 +463,6 @@ async function cropAndEmbed( maskHeight: number, forward: (input: string) => Promise ): Promise { - // FastSAM-style full-image white canvas, but with the mask applied: - // inside the bbox we keep image pixels where mask=1 and overwrite the - // rest with white. CLIP then sees a uniform white scene with only the - // segmented object visible at its original position/size. const imgW = image.width(); const imgH = image.height(); const surface = Skia.Surface.MakeOffscreen(imgW, imgH); @@ -489,9 +485,6 @@ async function cropAndEmbed( ); } - // Inverse mask: opaque white where mask=0, transparent where mask=1. - // Drawn on top within the bbox, it overpaints non-mask pixels with white - // and leaves the segmented object intact. const inversePixels = new Uint8Array(mask.length * 4); for (let i = 0; i < mask.length; i++) { const outside = mask[i]! === 0; diff --git a/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts b/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts index f162de5e83..db854705c7 100644 --- a/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts +++ b/packages/react-native-executorch/src/utils/segmentAnythingPrompts.ts @@ -90,27 +90,41 @@ export function selectByBox( } /** - * Selects the best matching instance for a text prompt. + * Selects the best matching instance(s) for a text prompt. * - * Returns the instance whose image embedding has the highest cosine similarity + * Returns the instance(s) whose image embedding has the highest cosine similarity * with the text embedding. The caller is responsible for producing the * embeddings (e.g. with CLIP) and passing them in the same order as * `instances`. * @param instances - Array of segmented instances returned by `forward()`. * @param instanceEmbeddings - Image embedding for each instance, in the same order as `instances`. * @param textEmbedding - Embedding of the text prompt. - * @returns The best matching instance, or `null` if `instances` is empty. + * @param topk - Number of top matches to return (defaults to 1). + * @returns The best matching instance (or null) if topk is 1, otherwise an array of the topk matching instances. 
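+ * @example
+ * // Sketch: get the top-3 candidates for a text prompt (embeddings are
+ * // produced by the caller, e.g. with CLIP).
+ * const top3 = selectByText(instances, instanceEmbeddings, textEmbedding, 3);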
*/ export function selectByText( instances: SegmentedInstance[], instanceEmbeddings: Float32Array[], - textEmbedding: Float32Array -): SegmentedInstance | null { - if (instances.length === 0) return null; + textEmbedding: Float32Array, + topk?: 1 +): SegmentedInstance | null; +export function selectByText( + instances: SegmentedInstance[], + instanceEmbeddings: Float32Array[], + textEmbedding: Float32Array, + topk: number +): SegmentedInstance[]; +export function selectByText( + instances: SegmentedInstance[], + instanceEmbeddings: Float32Array[], + textEmbedding: Float32Array, + topk = 1 +): SegmentedInstance | null | SegmentedInstance[] { + if (instances.length === 0) return topk === 1 ? null : []; if (instances.length !== instanceEmbeddings.length) { throw new Error( - `selectByText: instances (${instances.length})` + - `and instanceEmbeddings (${instanceEmbeddings.length})` + + `selectByText: instances (${instances.length}) ` + + `and instanceEmbeddings (${instanceEmbeddings.length}) ` + `must have the same length` ); } @@ -123,13 +137,21 @@ export function selectByText( return dot; }); - let bestIdx = 0; - let bestScore = -Infinity; - for (let i = 0; i < scores.length; i++) { - if (scores[i]! > bestScore) { - bestScore = scores[i]!; - bestIdx = i; + if (topk === 1) { + let bestIdx = 0; + let bestScore = -Infinity; + for (let i = 0; i < scores.length; i++) { + if (scores[i]! > bestScore) { + bestScore = scores[i]!; + bestIdx = i; + } } + return instances[bestIdx]!; } - return instances[bestIdx]!; + + return instances + .map((instance, index) => ({ instance, score: scores[index]! })) + .sort((a, b) => b.score - a.score) + .slice(0, topk) + .map((item) => item.instance); } From 49563f08943135fe13c53969d4bdcb1396094239 Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Mon, 11 May 2026 13:46:26 +0200 Subject: [PATCH 17/18] fix: add pointerEvents="none" to overlay view in ImageWithMasks component --- apps/computer-vision/components/ImageWithMasks.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/computer-vision/components/ImageWithMasks.tsx b/apps/computer-vision/components/ImageWithMasks.tsx index bd768909b2..8bb435f47a 100644 --- a/apps/computer-vision/components/ImageWithMasks.tsx +++ b/apps/computer-vision/components/ImageWithMasks.tsx @@ -156,7 +156,7 @@ export default function ImageWithMasks({ /> {instances.length > 0 && ( - + {instances.map((inst, idx) => { const mx = inst.bbox.x1 * scale + offsetX; From 80bf79d92c991d0973fe8ca87951d5e330c961c2 Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Mon, 11 May 2026 15:12:19 +0200 Subject: [PATCH 18/18] docs: update inference time and model size documentation; add FastSAM usage tips --- docs/docs/02-benchmarks/inference-time.md | 24 ++++++++++++------- docs/docs/02-benchmarks/model-size.md | 18 +++++++------- .../useInstanceSegmentation.md | 6 +++++ 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/docs/docs/02-benchmarks/inference-time.md b/docs/docs/02-benchmarks/inference-time.md index faef5c603d..8ef213238f 100644 --- a/docs/docs/02-benchmarks/inference-time.md +++ b/docs/docs/02-benchmarks/inference-time.md @@ -230,17 +230,23 @@ slower for very large images, which can increase total time. ## Instance Segmentation :::note -Times presented in the tables are measured for YOLO models with input size equal to 512. Other input sizes may yield slower or faster inference times. RF-DETR Nano Seg uses a fixed resolution of 312×312. 
+Times presented in the tables are measured for YOLO models with input size equal
+to 512. Other input sizes may yield slower or faster inference times. RF-DETR
+Nano Seg uses a fixed resolution of 312×312.
 :::

-| Model            | Samsung Galaxy S24 (XNNPACK) [ms] | Iphone 17 pro (XNNPACK) [ms] |
-| ---------------- | --------------------------------- | ---------------------------- |
-| YOLO26N_SEG      | 92                                | 90                           |
-| YOLO26S_SEG      | 220                               | 188                          |
-| YOLO26M_SEG      | 570                               | 550                          |
-| YOLO26L_SEG      | 680                               | 608                          |
-| YOLO26X_SEG      | 1410                              | 1338                         |
-| RF_DETR_NANO_SEG | 549                               | 330                          |
+| Model                      | Samsung Galaxy S24 [ms] | iPhone 17 Pro [ms] | Pixel 10 [ms] |
+| :------------------------- | :---------------------: | :----------------: | :-----------: |
+| YOLO26N_SEG (XNNPACK)      | 92                      | 90                 | 93            |
+| YOLO26S_SEG (XNNPACK)      | 220                     | 188                | 193           |
+| YOLO26M_SEG (XNNPACK)      | 570                     | 550                | 481           |
+| YOLO26L_SEG (XNNPACK)      | 680                     | 608                | 582           |
+| YOLO26X_SEG (XNNPACK)      | 1410                    | 1338               | 1191          |
+| RF_DETR_NANO_SEG (XNNPACK) | 549                     | 330                | 428           |
+| FASTSAM_S (XNNPACK)        | -                       | 30                 | 286           |
+| FASTSAM_X (XNNPACK)        | -                       | 2520               | 1993          |
+| FASTSAM_S (Core ML)        | -                       | 51                 | -             |
+| FASTSAM_X (Core ML)        | -                       | 72                 | -             |

 ## Text to image

diff --git a/docs/docs/02-benchmarks/model-size.md b/docs/docs/02-benchmarks/model-size.md
index 8dea094839..6d7f7cb753 100644
--- a/docs/docs/02-benchmarks/model-size.md
+++ b/docs/docs/02-benchmarks/model-size.md
@@ -22,14 +22,16 @@ title: Model Size

 ## Instance Segmentation

-| Model            | XNNPACK [MB] |
-| ---------------- | :----------: |
-| YOLO26N_SEG      | 11.6         |
-| YOLO26S_SEG      | 42.3         |
-| YOLO26M_SEG      | 95.4         |
-| YOLO26L_SEG      | 113          |
-| YOLO26X_SEG      | 252          |
-| RF_DETR_NANO_SEG | 124          |
+| Model            | XNNPACK [MB] | Core ML FP32 [MB] | Core ML FP16 [MB] |
+| ---------------- | :----------: | :---------------: | :---------------: |
+| YOLO26N_SEG      | 11.6         | -                 | -                 |
+| YOLO26S_SEG      | 42.3         | -                 | -                 |
+| YOLO26M_SEG      | 95.4         | -                 | -                 |
+| YOLO26L_SEG      | 113          | -                 | -                 |
+| YOLO26X_SEG      | 252          | -                 | -                 |
+| RF_DETR_NANO_SEG | 124          | -                 | -                 |
+| FASTSAM_S        | 47.3         | 47.8              | 24.2              |
+| FASTSAM_X        | 289          | 290               | 145               |

 ## Style Transfer

diff --git a/docs/docs/03-hooks/02-computer-vision/useInstanceSegmentation.md b/docs/docs/03-hooks/02-computer-vision/useInstanceSegmentation.md
index b9bdaa2774..6835262a6a 100644
--- a/docs/docs/03-hooks/02-computer-vision/useInstanceSegmentation.md
+++ b/docs/docs/03-hooks/02-computer-vision/useInstanceSegmentation.md
@@ -178,3 +178,9 @@ try {
   console.error(error);
 }
 ```
+
+:::tip
+Use FastSAM-S for faster performance on simple images with non-overlapping
+instances and FastSAM-X for better accuracy on complex scenes with many
+overlapping objects.
+:::
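+
+Point and box prompts are interpreted in the original image's pixel space, while touch events arrive in the preview view's coordinate space. A minimal sketch of the conversion, assuming a "contain"-fitted preview (the helper and its names are illustrative, not part of the API):
+
+```typescript
+function viewToImage(
+  viewX: number,
+  viewY: number,
+  view: { width: number; height: number },
+  image: { width: number; height: number }
+) {
+  // "contain" fit: uniform scale with the image centered inside the view.
+  const scale = Math.min(view.width / image.width, view.height / image.height);
+  const offsetX = (view.width - image.width * scale) / 2;
+  const offsetY = (view.height - image.height * scale) / 2;
+  return {
+    x: (viewX - offsetX) / scale,
+    y: (viewY - offsetY) / scale,
+  };
+}
+```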