Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions fastdeploy/envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,8 @@ def _validate_split_kv_size(value: int) -> int:
"FD_FP8_QUANT_WITH_POW2SCALE": lambda: bool(int(os.getenv("FD_FP8_QUANT_WITH_POW2SCALE", "0"))),
# enable kv cache manager v1
"ENABLE_V1_KVCACHE_MANAGER": lambda: int(os.getenv("ENABLE_V1_KVCACHE_MANAGER", "0")),
# Run a dummy forward pass for profiling purposes
"FD_RUN_DUMMY_FOR_PROFILE": lambda: int(os.getenv("FD_RUN_DUMMY_FOR_PROFILE", "0")),
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from paddleformers.utils.log import logger

import fastdeploy
from fastdeploy import envs
from fastdeploy.model_executor.layers.moe.ep import deep_ep
from fastdeploy.model_executor.layers.quantization.fp8_utils import (
deep_gemm,
Expand Down Expand Up @@ -623,6 +624,8 @@ def apply_ep_prefill(
"""
gate_out = gate(x)
gate_out = gate_out.cast("float32")
if envs.FD_RUN_DUMMY_FOR_PROFILE:
gate_out = paddle.randn_like(gate_out, dtype="float32")

hidden_size = x.shape[1]

Expand Down Expand Up @@ -963,6 +966,8 @@ def apply_ep_decode(
"""
gate_out = gate(x)
gate_out = gate_out.cast("float32")
if envs.FD_RUN_DUMMY_FOR_PROFILE:
gate_out = paddle.randn_like(gate_out, dtype="float32")
# 1. Select topk experts and weights
topk_idx, topk_weights = self.ep_decoder_runner.moe_select(layer, gate_out)

Expand Down Expand Up @@ -1050,6 +1055,8 @@ def apply_tp(
"""
gate_out = gate(x)
gate_out = gate_out.cast("float32")
if envs.FD_RUN_DUMMY_FOR_PROFILE:
gate_out = paddle.randn_like(gate_out, dtype="float32")

if layer.topk_method == "noaux_tc":

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from paddleformers.utils.log import logger

import fastdeploy
from fastdeploy import envs
from fastdeploy.platforms import current_platform

from ..utils import get_tensor, group_wise_int4_weight_quantize, pack, rotate_model
Expand Down Expand Up @@ -137,6 +138,8 @@ def apply_ep_prefill(
"""
gate_out = gate(x)
gate_out = gate_out.cast("float32")
if envs.FD_RUN_DUMMY_FOR_PROFILE:
gate_out = paddle.randn_like(gate_out, dtype="float32")
# 1. Select topk experts and weights
topk_idx, topk_weights = self.ep_prefill_runner.moe_select(layer, gate_out)

Expand Down Expand Up @@ -292,6 +295,8 @@ def apply_ep_decode(
"""
gate_out = gate(x)
gate_out = gate_out.cast("float32")
if envs.FD_RUN_DUMMY_FOR_PROFILE:
gate_out = paddle.randn_like(gate_out, dtype="float32")
estimate_total_token_nums = gate_out.shape[0] * layer.top_k
# 1. Select topk experts and weights
topk_idx, topk_weights = self.ep_decoder_runner.moe_select(layer, gate_out)
Expand Down Expand Up @@ -439,6 +444,8 @@ def apply_tp(
use_fused = not fastdeploy.envs.FD_ENABLE_RL and current_platform.is_cuda() and not fc1_latent_proj
if not use_fused:
gate_out = gate_out.cast("float32")
if envs.FD_RUN_DUMMY_FOR_PROFILE:
gate_out = paddle.randn_like(gate_out, dtype="float32")
if fc1_latent_proj is not None:
x = fc1_latent_proj(x)
gate_out, topk_weights, topk_idx = get_moe_scores(
Expand Down Expand Up @@ -481,6 +488,8 @@ def apply_tp(
)
else:
gate_out = gate_out.cast("float32")
if envs.FD_RUN_DUMMY_FOR_PROFILE:
gate_out = paddle.randn_like(gate_out, dtype="float32")
if fc1_latent_proj is not None:
x = fc1_latent_proj(x)
(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from paddleformers.utils.log import logger

import fastdeploy
from fastdeploy import envs
from fastdeploy.model_executor.layers.moe.ep import deep_ep
from fastdeploy.model_executor.layers.quantization.fp8_utils import (
deep_gemm,
Expand Down Expand Up @@ -341,6 +342,8 @@ def apply_ep_prefill(
"""
gate_out = gate(x)
gate_out = gate_out.cast("float32")
if envs.FD_RUN_DUMMY_FOR_PROFILE:
gate_out = paddle.randn_like(gate_out, dtype="float32")

hidden_size = layer.hidden_size

Expand Down Expand Up @@ -674,6 +677,8 @@ def apply_ep_decode(
"""
gate_out = gate(x)
gate_out = gate_out.cast("float32")
if envs.FD_RUN_DUMMY_FOR_PROFILE:
gate_out = paddle.randn_like(gate_out, dtype="float32")
# 1. Select topk experts and weights
topk_idx, topk_weights = self.ep_decoder_runner.moe_select(layer, gate_out)

Expand Down Expand Up @@ -790,6 +795,8 @@ def apply_tp(
)
else:
gate_out = gate_out.cast("float32")
if envs.FD_RUN_DUMMY_FOR_PROFILE:
gate_out = paddle.randn_like(gate_out, dtype="float32")
topk_ids, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(
gate_out,
layer.gate_correction_bias,
Expand Down
18 changes: 16 additions & 2 deletions fastdeploy/worker/gpu_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -1173,8 +1173,9 @@ def get_input_length_list(

# NOTE(wanglongzhi): When the full length is too large, DeepEP's buffer size will not be enough to cause the result to appear nan.
# TODO(wanglongzhi): Figure out the accurate buffer size of DeepEP.
if self.fd_config.parallel_config.enable_expert_parallel:
input_length = min(input_length, 32)
if not envs.FD_RUN_DUMMY_FOR_PROFILE:
if self.fd_config.parallel_config.enable_expert_parallel:
input_length = min(input_length, 32)

block_num = (
input_length + self.cache_config.block_size - 1
Expand Down Expand Up @@ -2030,6 +2031,12 @@ def _dummy_run(
if self.enable_mm:
model_inputs["image_features"] = self.share_inputs["image_features"]

if envs.FD_RUN_DUMMY_FOR_PROFILE:
import datetime

paddle.distributed.barrier()
starttime = datetime.datetime.now()

# 3. Run model
model_output = self.model(
model_inputs,
Expand Down Expand Up @@ -2059,6 +2066,13 @@ def _dummy_run(
)
self._dummy_sampler_run(hidden_states, model_output, batch_size, accept_all_drafts, reject_all_drafts)

if envs.FD_RUN_DUMMY_FOR_PROFILE:
paddle.distributed.barrier()
endtime = datetime.datetime.now()
duringtime = endtime - starttime
time_ms = duringtime.seconds * 1000 + duringtime.microseconds / 1000.0
print("The whole end to end time : ", time_ms, "ms")

# 7. Update 'infer_seed' and step_cuda()
if not self.speculative_decoding:
self.share_inputs["infer_seed"].add_(self.infer_seed_increment)
Expand Down
2 changes: 1 addition & 1 deletion fastdeploy/worker/worker_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -1333,7 +1333,7 @@ def run_worker_proc() -> None:
# Instead of doing end to end tests which is very unstable, we can profile the following line of code to pick the best model.
# so we add an environment variable FD_RUN_DUMMY_FOR_PROFILE to control whether to run a dummy run for profiling.
# Any Question refer to ChangWenBin.
if int(os.getenv("RUN_DUMMY_FOR_PROFILE", "0")) == 1:
if envs.FD_RUN_DUMMY_FOR_PROFILE:
Comment thread
chang-wenbin marked this conversation as resolved.
worker_proc.worker.model_runner._dummy_run(
num_tokens=100, batch_size=1, expected_decode_len=10, step_use_cudagraph=True
)
Expand Down
Loading