[wip] Support multimethod in export_llama_lib #17231
base: gh/lucylq/134/base
@@ -0,0 +1,124 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

set -exu
# shellcheck source=/dev/null
source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"

cmake_install_executorch_libraries() {
  echo "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a"
  rm -rf cmake-out
  cmake --workflow llm-release
}

cmake_build_llama_runner() {
  echo "Building llama runner"
  pushd extension/llm/tokenizers
  echo "Updating tokenizers submodule"
  git submodule update --init
  popd
  make llama-cpu
}

cleanup_files() {
  echo "Deleting downloaded and generated files"
  rm -rf "${HF_QWEN_PATH}/"
  rm -rf "${HF_ADAPTER_PATH}/"
  rm -rf *.pte
  rm -f result*.txt
}

# Download LoRA adapter.
python -m pip install -q huggingface_hub
HF_ADAPTER_REPO="lucylq/qwen3_06B_lora_math"
HF_ADAPTER_PATH=$(
  bash "$(dirname "${BASH_SOURCE[0]}")/download_hf_hub.sh" \
    --model_id "${HF_ADAPTER_REPO}" \
    --files "adapter_config.json" "adapter_model.safetensors"
)

# Download base model (for tokenizer path).
HF_QWEN_PATH=$(python -c "from huggingface_hub import snapshot_download; print(snapshot_download('unsloth/Qwen3-0.6B'))")
echo "Model downloaded to: $HF_QWEN_PATH"

### EXPORT MULTIMETHOD PTE ###
# Set environment variables for OmegaConf interpolation in yaml.
export LORA_ADAPTER_CHECKPOINT="${HF_ADAPTER_PATH}/adapter_model.safetensors"
export LORA_ADAPTER_CONFIG="${HF_ADAPTER_PATH}/adapter_config.json"

$PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
  --config examples/models/qwen3/config/qwen3_multimethod.yaml

### BUILD LLAMA RUNNER ###
cmake_install_executorch_libraries
cmake_build_llama_runner

# Runner constants.
RUNTIME_ARGS="--tokenizer_path=${HF_QWEN_PATH}/ --temperature=0 --seq_len=100 --warmup=1"
PROMPT="<|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant"

# Expected outputs.
EXPECTED_LORA_PREFIX="
<|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant
To calculate 15% of 80"

EXPECTED_BASE_PREFIX="<|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant:
<think>
Okay, so I need to calculate 15% of 80."

### TEST 1: Run lora_forward method ###
NOW=$(date +"%H:%M:%S")
echo "Test 1: Multimethod lora_forward. Starting at ${NOW}"
# shellcheck source=/dev/null
cmake-out/examples/models/llama/llama_main \
  --model_path=multimethod_qwen.pte \
  --method_name=lora_forward \
  --prompt="${PROMPT}" \
  ${RUNTIME_ARGS} > result_lora.txt
NOW=$(date +"%H:%M:%S")
echo "Finished at ${NOW}"

RESULT=$(cat result_lora.txt)
if [[ "${RESULT}" == "${EXPECTED_LORA_PREFIX}"* ]]; then
  echo "Expected result prefix: ${EXPECTED_LORA_PREFIX}"
  echo "Actual result: ${RESULT}"
  echo "Test 1 (lora_forward): Success"
else
  echo "Expected result prefix: ${EXPECTED_LORA_PREFIX}"
  echo "Actual result: ${RESULT}"
  echo "Test 1 (lora_forward): Failure"
  cleanup_files
  exit 1
fi

### TEST 2: Run base_forward method ###
NOW=$(date +"%H:%M:%S")
echo "Test 2: Multimethod base_forward. Starting at ${NOW}"
# shellcheck source=/dev/null
cmake-out/examples/models/llama/llama_main \
  --model_path=multimethod_qwen.pte \
  --method_name=base_forward \
  --prompt="${PROMPT}" \
  ${RUNTIME_ARGS} > result_base.txt
NOW=$(date +"%H:%M:%S")
echo "Finished at ${NOW}"

RESULT=$(cat result_base.txt)
if [[ "${RESULT}" == "${EXPECTED_BASE_PREFIX}"* ]]; then
  echo "Expected result prefix: ${EXPECTED_BASE_PREFIX}"
  echo "Actual result: ${RESULT}"
  echo "Test 2 (base_forward): Success"
else
  echo "Expected result prefix: ${EXPECTED_BASE_PREFIX}"
  echo "Actual result: ${RESULT}"
  echo "Test 2 (base_forward): Failure"
  cleanup_files
  exit 1
fi

echo "Multimethod tests passed!"
cleanup_files

@@ -20,15 +20,17 @@
from importlib import resources as _resources
from json import JSONDecodeError
from pathlib import Path
-from typing import Callable, List, Optional, Union
+from typing import Callable, Dict, List, Optional, Union

import torch
from torch.export import ExportedProgram

from executorch.devtools.backend_debug import print_delegation_info
from executorch.devtools.etrecord import generate_etrecord as generate_etrecord_func
from executorch.examples.models.llama.hf_download import (
    download_and_convert_hf_checkpoint,
)
from executorch.exir import to_edge_transform_and_lower
from executorch.exir.passes.init_mutable_pass import InitializedMutableBufferPass
from executorch.extension.llm.export.builder import DType, LLMEdgeManager
from executorch.extension.llm.export.config.llm_config import LlmConfig

@@ -844,6 +846,28 @@ def _validate_args(llm_config):
            "Shared embedding is only supported with torchao quantization."
        )

    if llm_config.multimethod.enabled:
        if llm_config.base.lora is not None:
            raise ValueError(
                "Cannot use both base.lora and multimethod.methods. "
                "Use multimethod.methods for all LoRA variants."
            )
        if llm_config.quantization.pt2e_quantize is not None:
            raise ValueError(
                "PT2E quantization is not supported with multimethod export."
            )
        if (
            llm_config.backend.coreml.enabled
            or llm_config.backend.vulkan.enabled
            or llm_config.backend.qnn.enabled
            or llm_config.backend.mps.enabled
            or llm_config.backend.openvino.enabled
        ):
            raise ValueError(
                "Multimethod export only supports XNNPACK backend or portable ops"
                "Please disable other backends (coreml, vulkan, qnn, mps, openvino)."
            )
A review suggestion on lines +867 to +868 (the two concatenated message strings above) adds the missing period and separating space:

-                "Multimethod export only supports XNNPACK backend or portable ops"
-                "Please disable other backends (coreml, vulkan, qnn, mps, openvino)."
+                "Multimethod export only supports XNNPACK backend or portable ops."
+                " Please disable other backends (coreml, vulkan, qnn, mps, openvino)."
Copilot AI (Feb 5, 2026):

Return type annotation is too generic. The function returns either a list of partitioners or None, so the type should be `Optional[List[Partitioner]]` instead of `Optional[List]`. This would provide better type safety and match the usage pattern where partitioners are passed to `to_edge_transform_and_lower`.
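A minimal sketch of the suggested annotation; the helper name and body below are hypothetical, and only the `Optional[List[Partitioner]]` return type reflects the review suggestion:

from typing import List, Optional

from executorch.exir.backend.partitioner import Partitioner


def get_partitioners(llm_config) -> Optional[List[Partitioner]]:
    """Hypothetical helper illustrating the suggested return annotation."""
    partitioners: List[Partitioner] = []
    if llm_config.backend.xnnpack.enabled:
        # Append the XNNPACK partitioner here; construction details are omitted
        # so the sketch stays independent of exact partitioner arguments.
        ...
    return partitioners or None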
Copilot AI (Feb 5, 2026):

The `generate_etrecord` configuration option is not being passed to the `to_edge_transform_and_lower` call. This means that ETRecord generation will not work with multimethod export even if `llm_config.debug.generate_etrecord` is set to `True`. Add `generate_etrecord=llm_config.debug.generate_etrecord,` to the function call:

     constant_methods=first_builder.metadata,
+    generate_etrecord=llm_config.debug.generate_etrecord,

@@ -0,0 +1,28 @@
base:
  model_class: "qwen3_0_6b"
  params: "examples/models/qwen3/config/0_6b_config.json"
  metadata: '{"get_bos_id": 151644, "get_eos_ids":[151645]}'

model:
  use_kv_cache: true
  use_sdpa_with_kv_cache: true

export:
  output_name: multimethod_qwen

backend:
  xnnpack:
    enabled: true

quantization:
  qmode: "8da4w"
  group_size: 32

multimethod:
  methods:
    # LoRA method - adapter paths from environment variables
    lora_forward:
      adapter_checkpoint: ${oc.env:LORA_ADAPTER_CHECKPOINT}
      adapter_config: ${oc.env:LORA_ADAPTER_CONFIG}
    # Base method - no LoRA
    base_forward: null
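For context, `${oc.env:VAR}` is OmegaConf's built-in environment-variable resolver. A minimal sketch of how the adapter path resolves, assuming the variable exported by the test script above (the path value here is illustrative only):

import os

from omegaconf import OmegaConf

# Illustrative value; the test script exports the real adapter checkpoint path.
os.environ["LORA_ADAPTER_CHECKPOINT"] = "/tmp/adapter_model.safetensors"

cfg = OmegaConf.create({"adapter_checkpoint": "${oc.env:LORA_ADAPTER_CHECKPOINT}"})
print(cfg.adapter_checkpoint)  # -> /tmp/adapter_model.safetensors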
A security review comment on the `pip install` step in the test script:

This script installs the `huggingface_hub` package directly from PyPI at runtime without any version pinning or integrity verification, which introduces a supply-chain risk: if the package (or the index it is fetched from) is compromised, arbitrary code could execute in the CI environment with access to repository data and any configured secrets. To reduce this risk, pin `huggingface_hub` to a known-good version (e.g., via a requirements/constraints file) and, where possible, enable hash-based verification or use a vetted internal package mirror instead of an unpinned direct install.
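A hedged sketch of what a pinned install could look like in the script; the exact version below is an assumption, not a vetted pin, and a hash-checked requirements file would be stronger still:

-python -m pip install -q huggingface_hub
+# Pin to a specific, vetted release (ideally installed from a hash-checked
+# requirements file) rather than whatever PyPI serves at run time.
+python -m pip install -q "huggingface_hub==0.34.4"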