Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions fastdeploy/envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,8 @@ def _validate_split_kv_size(value: int) -> int:
"FD_FP8_QUANT_WITH_POW2SCALE": lambda: bool(int(os.getenv("FD_FP8_QUANT_WITH_POW2SCALE", "0"))),
# enable kv cache manager v1
"ENABLE_V1_KVCACHE_MANAGER": lambda: int(os.getenv("ENABLE_V1_KVCACHE_MANAGER", "0")),
# Run a dummy forward pass for profiling purposes
"FD_RUN_DUMMY_FOR_PROFILE": lambda: int(os.getenv("FD_RUN_DUMMY_FOR_PROFILE", "0")),
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from paddleformers.utils.log import logger

import fastdeploy
from fastdeploy import envs
from fastdeploy.model_executor.layers.moe.ep import deep_ep
from fastdeploy.model_executor.layers.quantization.fp8_utils import (
deep_gemm,
Expand Down Expand Up @@ -623,6 +624,8 @@ def apply_ep_prefill(
"""
gate_out = gate(x)
gate_out = gate_out.cast("float32")
if envs.FD_RUN_DUMMY_FOR_PROFILE:
gate_out = paddle.randn_like(gate_out, dtype="float32")

hidden_size = x.shape[1]

Expand Down Expand Up @@ -963,6 +966,8 @@ def apply_ep_decode(
"""
gate_out = gate(x)
gate_out = gate_out.cast("float32")
if envs.FD_RUN_DUMMY_FOR_PROFILE:
gate_out = paddle.randn_like(gate_out, dtype="float32")
# 1. Select topk experts and weights
topk_idx, topk_weights = self.ep_decoder_runner.moe_select(layer, gate_out)

Expand Down Expand Up @@ -1050,6 +1055,8 @@ def apply_tp(
"""
gate_out = gate(x)
gate_out = gate_out.cast("float32")
if envs.FD_RUN_DUMMY_FOR_PROFILE:
gate_out = paddle.randn_like(gate_out, dtype="float32")

if layer.topk_method == "noaux_tc":

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from paddleformers.utils.log import logger

import fastdeploy
from fastdeploy import envs
from fastdeploy.platforms import current_platform

from ..utils import get_tensor, group_wise_int4_weight_quantize, pack, rotate_model
Expand Down Expand Up @@ -137,6 +138,8 @@ def apply_ep_prefill(
"""
gate_out = gate(x)
gate_out = gate_out.cast("float32")
if envs.FD_RUN_DUMMY_FOR_PROFILE:
gate_out = paddle.randn_like(gate_out, dtype="float32")
# 1. Select topk experts and weights
topk_idx, topk_weights = self.ep_prefill_runner.moe_select(layer, gate_out)

Expand Down Expand Up @@ -292,6 +295,8 @@ def apply_ep_decode(
"""
gate_out = gate(x)
gate_out = gate_out.cast("float32")
if envs.FD_RUN_DUMMY_FOR_PROFILE:
gate_out = paddle.randn_like(gate_out, dtype="float32")
estimate_total_token_nums = gate_out.shape[0] * layer.top_k
# 1. Select topk experts and weights
topk_idx, topk_weights = self.ep_decoder_runner.moe_select(layer, gate_out)
Expand Down Expand Up @@ -439,6 +444,8 @@ def apply_tp(
use_fused = not fastdeploy.envs.FD_ENABLE_RL and current_platform.is_cuda() and not fc1_latent_proj
if not use_fused:
gate_out = gate_out.cast("float32")
if envs.FD_RUN_DUMMY_FOR_PROFILE:
gate_out = paddle.randn_like(gate_out, dtype="float32")
if fc1_latent_proj is not None:
x = fc1_latent_proj(x)
gate_out, topk_weights, topk_idx = get_moe_scores(
Expand Down Expand Up @@ -481,6 +488,8 @@ def apply_tp(
)
else:
gate_out = gate_out.cast("float32")
if envs.FD_RUN_DUMMY_FOR_PROFILE:
gate_out = paddle.randn_like(gate_out, dtype="float32")
if fc1_latent_proj is not None:
x = fc1_latent_proj(x)
(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from paddleformers.utils.log import logger

import fastdeploy
from fastdeploy import envs
from fastdeploy.model_executor.layers.moe.ep import deep_ep
from fastdeploy.model_executor.layers.quantization.fp8_utils import (
deep_gemm,
Expand Down Expand Up @@ -341,6 +342,8 @@ def apply_ep_prefill(
"""
gate_out = gate(x)
gate_out = gate_out.cast("float32")
if envs.FD_RUN_DUMMY_FOR_PROFILE:
gate_out = paddle.randn_like(gate_out, dtype="float32")

hidden_size = layer.hidden_size

Expand Down Expand Up @@ -674,6 +677,8 @@ def apply_ep_decode(
"""
gate_out = gate(x)
gate_out = gate_out.cast("float32")
if envs.FD_RUN_DUMMY_FOR_PROFILE:
gate_out = paddle.randn_like(gate_out, dtype="float32")
# 1. Select topk experts and weights
topk_idx, topk_weights = self.ep_decoder_runner.moe_select(layer, gate_out)

Expand Down Expand Up @@ -790,6 +795,8 @@ def apply_tp(
)
else:
gate_out = gate_out.cast("float32")
if envs.FD_RUN_DUMMY_FOR_PROFILE:
gate_out = paddle.randn_like(gate_out, dtype="float32")
topk_ids, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(
gate_out,
layer.gate_correction_bias,
Expand Down
18 changes: 16 additions & 2 deletions fastdeploy/worker/gpu_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -1173,8 +1173,9 @@ def get_input_length_list(

# NOTE(wanglongzhi): When the full length is too large, DeepEP's buffer size will not be enough to cause the result to appear nan.
# TODO(wanglongzhi): Figure out the accurate buffer size of DeepEP.
if self.fd_config.parallel_config.enable_expert_parallel:
input_length = min(input_length, 32)
if not envs.FD_RUN_DUMMY_FOR_PROFILE:
if self.fd_config.parallel_config.enable_expert_parallel:
input_length = min(input_length, 32)

block_num = (
input_length + self.cache_config.block_size - 1
Expand Down Expand Up @@ -2030,6 +2031,12 @@ def _dummy_run(
if self.enable_mm:
model_inputs["image_features"] = self.share_inputs["image_features"]

if envs.FD_RUN_DUMMY_FOR_PROFILE:
import datetime

paddle.distributed.barrier()
starttime = datetime.datetime.now()

# 3. Run model
model_output = self.model(
model_inputs,
Expand Down Expand Up @@ -2059,6 +2066,13 @@ def _dummy_run(
)
self._dummy_sampler_run(hidden_states, model_output, batch_size, accept_all_drafts, reject_all_drafts)

if envs.FD_RUN_DUMMY_FOR_PROFILE:
paddle.distributed.barrier()
endtime = datetime.datetime.now()
duringtime = endtime - starttime
time_ms = duringtime.seconds * 1000 + duringtime.microseconds / 1000.0
print("The whole end to end time : ", time_ms, "ms")

# 7. Update 'infer_seed' and step_cuda()
if not self.speculative_decoding:
self.share_inputs["infer_seed"].add_(self.infer_seed_increment)
Expand Down
2 changes: 1 addition & 1 deletion fastdeploy/worker/worker_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -1333,7 +1333,7 @@ def run_worker_proc() -> None:
# Instead of doing end to end tests which is very unstable, we can profile the following line of code to pick the best model.
# so we add an environment variable FD_RUN_DUMMY_FOR_PROFILE to control whether to run a dummy run for profiling.
# Any Question refer to ChangWenBin.
if int(os.getenv("RUN_DUMMY_FOR_PROFILE", "0")) == 1:
if envs.FD_RUN_DUMMY_FOR_PROFILE:
Comment thread
chang-wenbin marked this conversation as resolved.
worker_proc.worker.model_runner._dummy_run(
num_tokens=100, batch_size=1, expected_decode_len=10, step_use_cudagraph=True
)
Expand Down
Loading