diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py index 6be28f1f3be..7a9d6e412fc 100644 --- a/fastdeploy/envs.py +++ b/fastdeploy/envs.py @@ -279,6 +279,8 @@ def _validate_split_kv_size(value: int) -> int: "FD_FP8_QUANT_WITH_POW2SCALE": lambda: bool(int(os.getenv("FD_FP8_QUANT_WITH_POW2SCALE", "0"))), # enable kv cache manager v1 "ENABLE_V1_KVCACHE_MANAGER": lambda: int(os.getenv("ENABLE_V1_KVCACHE_MANAGER", "0")), + # run dummy run for profile + "FD_RUN_DUMMY_FOR_PROFILE": lambda: int(os.getenv("FD_RUN_DUMMY_FOR_PROFILE", "0")), } diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_blackwell_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_blackwell_backend.py index 274deda8b69..2fa9eac7257 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_blackwell_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_blackwell_backend.py @@ -23,6 +23,7 @@ from paddleformers.utils.log import logger import fastdeploy +from fastdeploy import envs from fastdeploy.model_executor.layers.moe.ep import deep_ep from fastdeploy.model_executor.layers.quantization.fp8_utils import ( deep_gemm, @@ -623,6 +624,8 @@ def apply_ep_prefill( """ gate_out = gate(x) gate_out = gate_out.cast("float32") + if envs.FD_RUN_DUMMY_FOR_PROFILE: + gate_out = paddle.randn_like(gate_out, dtype="float32") hidden_size = x.shape[1] @@ -963,6 +966,8 @@ def apply_ep_decode( """ gate_out = gate(x) gate_out = gate_out.cast("float32") + if envs.FD_RUN_DUMMY_FOR_PROFILE: + gate_out = paddle.randn_like(gate_out, dtype="float32") # 1. Select topk experts and weights topk_idx, topk_weights = self.ep_decoder_runner.moe_select(layer, gate_out) @@ -1050,6 +1055,8 @@ def apply_tp( """ gate_out = gate(x) gate_out = gate_out.cast("float32") + if envs.FD_RUN_DUMMY_FOR_PROFILE: + gate_out = paddle.randn_like(gate_out, dtype="float32") if layer.topk_method == "noaux_tc": diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py index faf5f774d6c..483fd8759db 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py @@ -23,6 +23,7 @@ from paddleformers.utils.log import logger import fastdeploy +from fastdeploy import envs from fastdeploy.platforms import current_platform from ..utils import get_tensor, group_wise_int4_weight_quantize, pack, rotate_model @@ -137,6 +138,8 @@ def apply_ep_prefill( """ gate_out = gate(x) gate_out = gate_out.cast("float32") + if envs.FD_RUN_DUMMY_FOR_PROFILE: + gate_out = paddle.randn_like(gate_out, dtype="float32") # 1. Select topk experts and weights topk_idx, topk_weights = self.ep_prefill_runner.moe_select(layer, gate_out) @@ -292,6 +295,8 @@ def apply_ep_decode( """ gate_out = gate(x) gate_out = gate_out.cast("float32") + if envs.FD_RUN_DUMMY_FOR_PROFILE: + gate_out = paddle.randn_like(gate_out, dtype="float32") estimate_total_token_nums = gate_out.shape[0] * layer.top_k # 1. Select topk experts and weights topk_idx, topk_weights = self.ep_decoder_runner.moe_select(layer, gate_out) @@ -439,6 +444,8 @@ def apply_tp( use_fused = not fastdeploy.envs.FD_ENABLE_RL and current_platform.is_cuda() and not fc1_latent_proj if not use_fused: gate_out = gate_out.cast("float32") + if envs.FD_RUN_DUMMY_FOR_PROFILE: + gate_out = paddle.randn_like(gate_out, dtype="float32") if fc1_latent_proj is not None: x = fc1_latent_proj(x) gate_out, topk_weights, topk_idx = get_moe_scores( @@ -481,6 +488,8 @@ def apply_tp( ) else: gate_out = gate_out.cast("float32") + if envs.FD_RUN_DUMMY_FOR_PROFILE: + gate_out = paddle.randn_like(gate_out, dtype="float32") if fc1_latent_proj is not None: x = fc1_latent_proj(x) ( diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py index 0264b11110c..18ffb4ee1e9 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py @@ -24,6 +24,7 @@ from paddleformers.utils.log import logger import fastdeploy +from fastdeploy import envs from fastdeploy.model_executor.layers.moe.ep import deep_ep from fastdeploy.model_executor.layers.quantization.fp8_utils import ( deep_gemm, @@ -341,6 +342,8 @@ def apply_ep_prefill( """ gate_out = gate(x) gate_out = gate_out.cast("float32") + if envs.FD_RUN_DUMMY_FOR_PROFILE: + gate_out = paddle.randn_like(gate_out, dtype="float32") hidden_size = layer.hidden_size @@ -674,6 +677,8 @@ def apply_ep_decode( """ gate_out = gate(x) gate_out = gate_out.cast("float32") + if envs.FD_RUN_DUMMY_FOR_PROFILE: + gate_out = paddle.randn_like(gate_out, dtype="float32") # 1. Select topk experts and weights topk_idx, topk_weights = self.ep_decoder_runner.moe_select(layer, gate_out) @@ -790,6 +795,8 @@ def apply_tp( ) else: gate_out = gate_out.cast("float32") + if envs.FD_RUN_DUMMY_FOR_PROFILE: + gate_out = paddle.randn_like(gate_out, dtype="float32") topk_ids, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select( gate_out, layer.gate_correction_bias, diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 1f9b1902517..050129841a5 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1173,8 +1173,9 @@ def get_input_length_list( # NOTE(wanglongzhi): When the full length is too large, DeepEP's buffer size will not be enough to cause the result to appear nan. # TODO(wanglongzhi): Figure out the accurate buffer size of DeepEP. - if self.fd_config.parallel_config.enable_expert_parallel: - input_length = min(input_length, 32) + if not envs.FD_RUN_DUMMY_FOR_PROFILE: + if self.fd_config.parallel_config.enable_expert_parallel: + input_length = min(input_length, 32) block_num = ( input_length + self.cache_config.block_size - 1 @@ -2030,6 +2031,12 @@ def _dummy_run( if self.enable_mm: model_inputs["image_features"] = self.share_inputs["image_features"] + if envs.FD_RUN_DUMMY_FOR_PROFILE: + import datetime + + paddle.distributed.barrier() + starttime = datetime.datetime.now() + # 3. Run model model_output = self.model( model_inputs, @@ -2059,6 +2066,13 @@ def _dummy_run( ) self._dummy_sampler_run(hidden_states, model_output, batch_size, accept_all_drafts, reject_all_drafts) + if envs.FD_RUN_DUMMY_FOR_PROFILE: + paddle.distributed.barrier() + endtime = datetime.datetime.now() + duringtime = endtime - starttime + time_ms = duringtime.seconds * 1000 + duringtime.microseconds / 1000.0 + print("The whole end to end time : ", time_ms, "ms") + # 7. Updata 'infer_seed' and step_cuda() if not self.speculative_decoding: self.share_inputs["infer_seed"].add_(self.infer_seed_increment) diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index 28a943cf9d4..fbaed132c49 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -1333,7 +1333,7 @@ def run_worker_proc() -> None: # Instead of doing end to end tests which is very unstable, we can profile the following line of code to pick the best model. # so we add an environment variable RUN_DUMMY_FOR_PROFILE to control whether to run dummy run for profile. # Any Question refer to ChangWenBin. - if int(os.getenv("RUN_DUMMY_FOR_PROFILE", "0")) == 1: + if envs.FD_RUN_DUMMY_FOR_PROFILE: worker_proc.worker.model_runner._dummy_run( num_tokens=100, batch_size=1, expected_decode_len=10, step_use_cudagraph=True )