12 changes: 12 additions & 0 deletions examples/models/llama/BUCK
@@ -283,6 +283,18 @@ fbcode_target(_kind = runtime.python_test,
],
)

fbcode_target(_kind = runtime.python_test,
name = "attention_sink_ring_buffer_test",
srcs = [
"source_transformation/test_attention_sink_ring_buffer.py",
],
supports_static_listing = False,
deps = [
"//caffe2:torch",
":export_library",
],
)

fbcode_target(_kind = runtime.python_test,
name = "quantized_sdpa_source_transform_test",
srcs = [
31 changes: 31 additions & 0 deletions examples/models/llama/config/llama_attention_sink.yaml
@@ -0,0 +1,31 @@
base:
metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'

model:
use_sdpa_with_kv_cache: True # Now supported! We set use_attention_mask=True on SDPACustom
use_kv_cache: True
dtype_override: fp32
enable_dynamic_shape: True
# Attention Sink: "sink_size,window_size,eviction_batch_size"
# sink_size=4: Keep first 4 tokens (e.g., BOS + system prompt)
# window_size=124: 滑动窗口大小 (sliding window size)
# eviction_batch_size=1: 每次驱逐 1 个 token (evict 1 token at a time)
Comment on lines +11 to +12
Copilot AI Feb 5, 2026

The comment contains Chinese characters ("滑动窗口大小" meaning "sliding window size" and "每次驱逐 1 个 token" meaning "evict 1 token each time"). Comments should be in English for consistency with the rest of the codebase and to ensure all team members can understand the documentation.

Suggested change:
- # window_size=124: 滑动窗口大小
- # eviction_batch_size=1: 每次驱逐 1 个 token
+ # window_size=124: sliding window size
+ # eviction_batch_size=1: evict 1 token at a time

Comment on lines +11 to +12
Copilot AI Feb 5, 2026

Chinese characters are used in comments. Comments should be in English for consistency with the rest of the codebase. Please translate:

  • Line 11: "# window_size=124: 滑动窗口大小" should be "# window_size=124: Sliding window size"
  • Line 12: "# eviction_batch_size=1: 每次驱逐 1 个 token" should be "# eviction_batch_size=1: Evict 1 token at a time"
Suggested change:
- # window_size=124: 滑动窗口大小
- # eviction_batch_size=1: 每次驱逐 1 个 token
+ # window_size=124: Sliding window size
+ # eviction_batch_size=1: Evict 1 token at a time

# KV cache size = sink_size + window_size * 2 = 4 + 124*2 = 252
use_attention_sink: "4,124,1"

export:
# max_context_length controls the RoPE frequency table size.
# It must be >= sink_size + window_size (128), but larger values are
# recommended to support generation beyond the sliding window.
# The model default (e.g., 8192 or 131072) is typically used if not specified.
# For testing, we use the model's default by not setting this explicitly.

quantization:
qmode: 8da4w
group_size: 128
embedding_quantize: 4,32

backend:
xnnpack:
enabled: True
extended_ops: True
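
The config comments above spell out the cache-size arithmetic for the "4,124,1" setting. As a quick sanity check, here is a small self-contained Python sketch (editor's illustration, not part of the ExecuTorch codebase; the helper names are made up) that parses the comma-separated use_attention_sink string and reproduces the numbers quoted in the config:

# Illustrative only: parse "sink_size,window_size,eviction_batch_size" (e.g. "4,124,1")
# and derive the quantities described in the config comments above.
from typing import NamedTuple

class AttentionSinkParams(NamedTuple):
    sink_size: int
    window_size: int
    eviction_batch_size: int

    @property
    def kv_cache_size(self) -> int:
        # Ring-buffer KV cache size = sink_size + window_size * 2 (4 + 124*2 = 252 here).
        return self.sink_size + self.window_size * 2

    @property
    def min_context_length(self) -> int:
        # max_context_length must be at least sink_size + window_size (128 here).
        return self.sink_size + self.window_size

def parse_attention_sink(spec: str) -> AttentionSinkParams:
    sink_size, window_size, eviction_batch_size = (int(v) for v in spec.split(","))
    return AttentionSinkParams(sink_size, window_size, eviction_batch_size)

params = parse_attention_sink("4,124,1")
assert params.kv_cache_size == 252
assert params.min_context_length == 128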
10 changes: 9 additions & 1 deletion examples/models/llama/eval_llama_lib.py
@@ -338,6 +338,8 @@ def eval_llama_with_attention_sink(model_name: str, args: argparse.ArgumentParse
Evaluate the model's perplexity when AttentionSink is enabled.

This is mostly copied from https://github.com/mit-han-lab/streaming-llm/blob/main/examples/eval_long_ppl.py

Updated for the ring-buffer based attention sink implementation.
"""
# Convert args to LlmConfig
from executorch.extension.llm.export.config.llm_config import LlmConfig
@@ -351,7 +353,13 @@ def eval_llama_with_attention_sink(model_name: str, args: argparse.ArgumentParse
sink_size = int(attention_sink_params[0])
window_size = int(attention_sink_params[1])

assert llm_config.export.max_seq_length == sink_size + window_size
# For the ring buffer implementation, the cache size is sink_size + window_size * 2
# max_context_length should be >= sink_size + window_size (for RoPE frequencies)
# but can be larger to support extended generation
assert llm_config.export.max_context_length >= sink_size + window_size, (
f"max_context_length ({llm_config.export.max_context_length}) must be >= "
f"sink_size + window_size ({sink_size + window_size})"
)

device = "cuda" if torch.cuda.is_available() else "cpu"
manager: LLMEdgeManager = _prepare_for_llama_export(llm_config)
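
To make the eviction behavior referenced in this hunk concrete, the following toy sketch (editor's illustration only; the actual implementation uses a ring buffer of size sink_size + window_size * 2 rather than a Python list) shows which token positions remain in the cache under a keep-the-sinks-plus-recent-window policy:

# Toy model of attention-sink eviction (illustrative; not the ring-buffer code in this PR).
def cached_positions(num_generated: int, sink_size: int = 4,
                     window_size: int = 124, eviction_batch_size: int = 1) -> list[int]:
    """Return the token positions still held in the cache after generating
    num_generated tokens, keeping the sink tokens plus the most recent window."""
    cache: list[int] = []
    for pos in range(num_generated):
        cache.append(pos)
        # Evict the oldest non-sink tokens in batches once the window overflows.
        while len(cache) > sink_size + window_size:
            del cache[sink_size:sink_size + eviction_batch_size]
    return cache

positions = cached_positions(1000)
assert positions[:4] == [0, 1, 2, 3]            # sink tokens are never evicted
assert positions[4:] == list(range(876, 1000))  # plus the most recent 124 tokens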
17 changes: 16 additions & 1 deletion examples/models/llama/model.py
@@ -218,7 +218,22 @@ def __init__(self, llm_config: Optional[LlmConfig] = None):
window_size = int(attention_sink_params[1])
eviction_batch_size = int(attention_sink_params[2])

assert self.llm_config.export.max_context_length == sink_size + window_size
# max_context_length must be >= sink_size + window_size to have enough RoPE frequencies
# A larger max_context_length is allowed (and recommended) to support generation beyond
# the sliding window size.
assert self.llm_config.export.max_context_length >= sink_size + window_size, (
f"max_context_length ({self.llm_config.export.max_context_length}) must be >= "
f"sink_size + window_size ({sink_size + window_size})"
)

# IMPORTANT: For attention sink, we need RoPE frequencies for all possible generation
# positions, not just the cache size. Override the model's max_context_len to use
# a larger value that supports extended generation.
# We use model_args.max_context_len which was set from export.max_context_length
# but for RoPE we need the full generation length capability.
# Use 131072 (128k) as default for Llama 3.2 models or the original model max if larger.
default_rope_length = max(131072, model_args.max_context_len)
Comment on lines +234 to +235
Copilot AI Feb 5, 2026

The hardcoded value of 131072 (128k) seems arbitrary and may not be appropriate for all model configurations. This could cause excessive memory usage for RoPE frequency tables in models that don't need such large context. Consider making this configurable through the llm_config or deriving it from model-specific parameters rather than using a hardcoded default.

Suggested change:
- # Use 131072 (128k) as default for Llama 3.2 models or the original model max if larger.
- default_rope_length = max(131072, model_args.max_context_len)
+ # Derive the RoPE max context length from configuration rather than using a fixed
+ # default. Use the larger of export.max_context_length and the original
+ # model_args.max_context_len.
+ default_rope_length = max(
+     self.llm_config.export.max_context_length,
+     model_args.max_context_len,
+ )

model_args.max_context_len = default_rope_length
Comment on lines +230 to +236
Copilot AI Feb 5, 2026

Hardcoding a default RoPE length of 131072 (128k) at line 235 could be problematic for models that don't support such a large context or when memory is constrained. This value appears to be arbitrary and could cause out-of-memory issues during RoPE frequency table initialization. Consider either: (1) using a more reasonable default that aligns with common model capabilities, (2) making this configurable, or (3) deriving it from the model's actual capabilities. The comment mentions "Llama 3.2 models" specifically, but this code affects all models using attention sink.

Suggested change:
- # positions, not just the cache size. Override the model's max_context_len to use
- # a larger value that supports extended generation.
- # We use model_args.max_context_len which was set from export.max_context_length
- # but for RoPE we need the full generation length capability.
- # Use 131072 (128k) as default for Llama 3.2 models or the original model max if larger.
- default_rope_length = max(131072, model_args.max_context_len)
- model_args.max_context_len = default_rope_length
+ # positions, not just the cache size. We rely on model_args.max_context_len, which is
+ # set from export.max_context_length and already validated above to be large enough.
+ # Ensure it is at least sink_size + window_size without forcing an arbitrary larger default
+ # that could cause excessive memory usage on models with smaller contexts.
+ min_required_context = sink_size + window_size
+ if model_args.max_context_len < min_required_context:
+     model_args.max_context_len = min_required_context


self.model_ = enable_attention_sink(
module=self.model_,
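
The two review comments above debate how to size the RoPE frequency table. The hedged sketch below (the function name and parameters are hypothetical, not part of model.py) simply restates the two strategies side by side: the PR's 131072 floor versus the review suggestion of deriving the length purely from configuration, with sink_size + window_size as the hard minimum in both cases:

# Hypothetical helper contrasting the two RoPE-length strategies discussed above.
# Neither name exists in model.py; this only restates logic quoted in the diff and review.
def choose_rope_context_len(
    export_max_context_length: int,
    model_max_context_len: int,
    sink_size: int,
    window_size: int,
    use_128k_floor: bool = True,
) -> int:
    min_required = sink_size + window_size  # must cover the sink plus the sliding window
    if use_128k_floor:
        # PR behavior: at least 131072, or the model's original max if that is larger.
        chosen = max(131072, model_max_context_len)
    else:
        # Review suggestion: derive from configuration instead of a hardcoded default.
        chosen = max(export_max_context_length, model_max_context_len)
    return max(chosen, min_required)

# With the config above (sink_size=4, window_size=124):
assert choose_rope_context_len(8192, 8192, 4, 124) == 131072
assert choose_rope_context_len(8192, 8192, 4, 124, use_128k_floor=False) == 8192

Whether a 128k RoPE table is an acceptable memory cost is the open question in the review; the sketch only makes the trade-off explicit.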