From 0702fc08a37677d7fd2af2d8508af7b88546affa Mon Sep 17 00:00:00 2001 From: vx120 <893600387@qq.com> Date: Wed, 18 Mar 2026 18:36:50 +0800 Subject: [PATCH 1/3] fix: patch moe weight loader before base weight sync --- .../sampler/vllm_sampler/vllm_worker_extension.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/twinkle/sampler/vllm_sampler/vllm_worker_extension.py b/src/twinkle/sampler/vllm_sampler/vllm_worker_extension.py index 42be5095..493d75e7 100644 --- a/src/twinkle/sampler/vllm_sampler/vllm_worker_extension.py +++ b/src/twinkle/sampler/vllm_sampler/vllm_worker_extension.py @@ -131,6 +131,11 @@ def update_weights_from_ipc( if peft_config and base_sync_done: self.remove_lora(VLLM_LORA_INT_ID) + else: + try: + self.monkey_patch_model() + except Exception as e: + logger.warning(f"Failed to apply MoE weight_loader patch before load_weights: {e}") # Detect TP rank — vLLM sets self.rank on each worker. tp_rank = getattr(self, 'rank', 0) @@ -353,6 +358,12 @@ def load_synced_weights( # fix: Keep device resolution consistent with update_weights_from_ipc to avoid path divergence. self.device = torch.device(Torch.get_device(getattr(self, 'local_rank', None))) + if not (peft_config and base_sync_done): + try: + self.monkey_patch_model() + except Exception as e: + logger.warning(f"Failed to apply MoE weight_loader patch before load_weights: {e}") + weight_list = list(weights.items()) self._load_weights(weight_list, peft_config=peft_config, base_sync_done=base_sync_done) From 6f4e9ed05dca53bc970e1885fef89e3f5af09085 Mon Sep 17 00:00:00 2001 From: vx120 <893600387@qq.com> Date: Wed, 18 Mar 2026 19:11:21 +0800 Subject: [PATCH 2/3] fix: apply pre-commit changes --- src/twinkle/sampler/vllm_sampler/vllm_worker_extension.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/twinkle/sampler/vllm_sampler/vllm_worker_extension.py b/src/twinkle/sampler/vllm_sampler/vllm_worker_extension.py index 493d75e7..c0f04ae4 100644 --- a/src/twinkle/sampler/vllm_sampler/vllm_worker_extension.py +++ b/src/twinkle/sampler/vllm_sampler/vllm_worker_extension.py @@ -135,7 +135,7 @@ def update_weights_from_ipc( try: self.monkey_patch_model() except Exception as e: - logger.warning(f"Failed to apply MoE weight_loader patch before load_weights: {e}") + logger.warning(f'Failed to apply MoE weight_loader patch before load_weights: {e}') # Detect TP rank — vLLM sets self.rank on each worker. tp_rank = getattr(self, 'rank', 0) @@ -362,7 +362,7 @@ def load_synced_weights( try: self.monkey_patch_model() except Exception as e: - logger.warning(f"Failed to apply MoE weight_loader patch before load_weights: {e}") + logger.warning(f'Failed to apply MoE weight_loader patch before load_weights: {e}') weight_list = list(weights.items()) self._load_weights(weight_list, peft_config=peft_config, base_sync_done=base_sync_done) From 2860e2e23b7418567bb32c2dc7d383247de2d15f Mon Sep 17 00:00:00 2001 From: vx120 <893600387@qq.com> Date: Mon, 30 Mar 2026 17:27:27 +0800 Subject: [PATCH 3/3] patch qwen3moe by existed code --- src/twinkle/sampler/vllm_sampler/vllm_sampler.py | 3 +-- .../sampler/vllm_sampler/vllm_worker_extension.py | 11 ----------- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/src/twinkle/sampler/vllm_sampler/vllm_sampler.py b/src/twinkle/sampler/vllm_sampler/vllm_sampler.py index 915e012f..62f15630 100644 --- a/src/twinkle/sampler/vllm_sampler/vllm_sampler.py +++ b/src/twinkle/sampler/vllm_sampler/vllm_sampler.py @@ -122,8 +122,7 @@ def __init__(self, model_id: str, engine_args: Dict[str, Any] = None, device_mes # fix: On NPU, monkey_patch_model can trigger Triton compatibility errors and abort sampler init. # fix: Explicitly skip this patch on NPU and keep it for non-NPU paths only. # NPU platform may trigger triton errors with monkey_patch_model - if Platform.get_platform().device_prefix() != 'npu': - self._run_in_loop(self.engine.engine.collective_rpc('monkey_patch_model')) + self._run_in_loop(self.engine.engine.collective_rpc('monkey_patch_model')) VLLMLoraWeights()(self) diff --git a/src/twinkle/sampler/vllm_sampler/vllm_worker_extension.py b/src/twinkle/sampler/vllm_sampler/vllm_worker_extension.py index dc017995..61920cd9 100644 --- a/src/twinkle/sampler/vllm_sampler/vllm_worker_extension.py +++ b/src/twinkle/sampler/vllm_sampler/vllm_worker_extension.py @@ -131,11 +131,6 @@ def update_weights_from_ipc( if peft_config and base_sync_done: self.remove_lora(VLLM_LORA_INT_ID) - else: - try: - self.monkey_patch_model() - except Exception as e: - logger.warning(f'Failed to apply MoE weight_loader patch before load_weights: {e}') # Detect TP rank — vLLM sets self.rank on each worker. tp_rank = getattr(self, 'rank', 0) @@ -358,12 +353,6 @@ def load_synced_weights( # fix: Keep device resolution consistent with update_weights_from_ipc to avoid path divergence. self.device = torch.device(Torch.get_device(getattr(self, 'local_rank', None))) - if not (peft_config and base_sync_done): - try: - self.monkey_patch_model() - except Exception as e: - logger.warning(f'Failed to apply MoE weight_loader patch before load_weights: {e}') - weight_list = list(weights.items()) self._load_weights(weight_list, peft_config=peft_config, base_sync_done=base_sync_done)