From eb481e92af50e90e904be1e9f0426b134cff3289 Mon Sep 17 00:00:00 2001 From: Carl Persson Date: Tue, 16 Dec 2025 15:22:57 +0000 Subject: [PATCH 01/12] add flash attn te support for wan --- src/maxdiffusion/models/attention_flax.py | 75 +++++++++++-------- .../wan/transformers/transformer_wan.py | 4 +- src/maxdiffusion/train_wan.py | 5 +- src/maxdiffusion/trainers/wan_trainer.py | 19 ++++- 4 files changed, 64 insertions(+), 39 deletions(-) diff --git a/src/maxdiffusion/models/attention_flax.py b/src/maxdiffusion/models/attention_flax.py index 2982e19e..438d0bcb 100644 --- a/src/maxdiffusion/models/attention_flax.py +++ b/src/maxdiffusion/models/attention_flax.py @@ -78,8 +78,11 @@ def _reshape_data_from_cudnn_flash(tensor): def _reshape_data_for_cudnn_flash(tensor, heads): # reshapes from [b, s, h * d] to [b, s, h, d] (input format to flash format) - batch, seq, heads_and_dim_head = tensor.shape - tensor = tensor.reshape(batch, seq, heads, heads_and_dim_head // heads) + if len(tensor.shape) == 3: + batch, seq, dim_head = tensor.shape + tensor = tensor.reshape(batch, seq, heads, dim_head // heads) + else: + tensor = jnp.transpose(tensor, (0, 2, 1, 3)) return tensor @@ -89,7 +92,8 @@ def _reshape_batch_dim_to_heads(tensor, heads): tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim) tensor = jnp.transpose(tensor, (0, 2, 1, 3)) reshaped_tensor = tensor.reshape(batch_size // head_size, seq_len, dim * head_size) - return jax.lax.with_sharding_constraint(reshaped_tensor, PartitionSpec("data", "fsdp", "tensor")) + axis_names = nn.logical_to_mesh_axes((BATCH, LENGTH, HEAD, D_KV)) + return jax.lax.with_sharding_constraint(reshaped_tensor, axis_names) def _reshape_heads_to_batch_dim(tensor, heads): @@ -102,8 +106,8 @@ def _reshape_heads_to_batch_dim(tensor, heads): else: batch_size, head_size, seq_len, head_dim = tensor.shape reshaped_tensor = tensor.reshape(batch_size * head_size, seq_len, head_dim) - - return 
jax.lax.with_sharding_constraint(reshaped_tensor, PartitionSpec("data", "fsdp", "tensor")) + axis_names = nn.logical_to_mesh_axes((BATCH, LENGTH, HEAD, D_KV)) + return jax.lax.with_sharding_constraint(reshaped_tensor, axis_names) def _reshape_heads_to_head_dim(tensor): @@ -112,7 +116,8 @@ def _reshape_heads_to_head_dim(tensor): b, h, s, d = tensor.shape tensor = jnp.transpose(tensor, axes=[0, 2, 1, 3]) reshaped_tensor = jnp.reshape(tensor, (b, -1, h * d)) - return jax.lax.with_sharding_constraint(reshaped_tensor, PartitionSpec("data", "fsdp", "tensor")) + axis_names = nn.logical_to_mesh_axes((BATCH, LENGTH, HEAD, D_KV)) + return jax.lax.with_sharding_constraint(reshaped_tensor, axis_names) def _unflatten_heads(tensor, heads): @@ -481,24 +486,12 @@ def _cudnn_flash_attention(query: Array, key: Array, value: Array, heads: int, m key = _reshape_data_for_cudnn_flash(key, heads) value = _reshape_data_for_cudnn_flash(value, heads) - cudnn_flash_axis_names = (BATCH, LENGTH, HEAD, D_KV) - axis_names = nn.logical_to_mesh_axes(cudnn_flash_axis_names) - - query = nn.with_logical_constraint(query, axis_names) - key = nn.with_logical_constraint(key, axis_names) - value = nn.with_logical_constraint(value, axis_names) - - @functools.partial( - shard_map.shard_map, - mesh=mesh, - in_specs=(axis_names, axis_names, axis_names), - out_specs=axis_names, - check_rep=False, - ) - def wrap_flash_attention(query, key, value): - return jax.vmap(dpa_layer)(query, key, value, mask=None) - - out = wrap_flash_attention(query, key, value) + axis_names = nn.logical_to_mesh_axes((BATCH, LENGTH, HEAD, D_KV)) + query = jax.lax.with_sharding_constraint(query, axis_names) + key = jax.lax.with_sharding_constraint(key, axis_names) + value = jax.lax.with_sharding_constraint(value, axis_names) + + out = dpa_layer(query, key, value, mask=None) return _reshape_data_from_cudnn_flash(out) @@ -728,9 +721,25 @@ def __init__( mask_padding_tokens: bool = True, residual_checkpoint_name: str | None = None, ): - 
self.dpa_layer = None if attention_kernel == "cudnn_flash_te": - raise NotImplementedError(f"{self} has not been tested with {attention_kernel}") + from transformer_engine.jax.flax.transformer import DotProductAttention # pytype: disable=import-error + jax.config.update("jax_use_shardy_partitioner", False) + + dpa_layer = DotProductAttention( + head_dim=dim_head, + num_attention_heads=heads, + num_gqa_groups=heads, + attn_mask_type="no_mask", # 'no_mask', 'padding', 'causal', or 'padding_causal' + attn_bias_type="NO_BIAS", # 'no_bias', 'pre_scale_bias' or 'post_scale_bias' + # attention_dropout=self.dropout_rate, + dropout_rng_name="aqt", + dtype=dtype, + qkv_layout="BSHD_BSHD_BSHD", # 'BS3HD', 'BSHD_BS2HD' or 'BSHD_BSHD_BSHD' + scale_factor=scale, + transpose_batch_sequence=False, + ) + variables = {} + self.dpa_layer = functools.partial(dpa_layer.apply, variables) self.mesh = mesh self.scale = scale @@ -794,8 +803,9 @@ def setup(self): self.dpa_layer = None if self.attention_kernel == "cudnn_flash_te": from transformer_engine.jax.flax.transformer import DotProductAttention # pytype: disable=import-error + jax.config.update("jax_use_shardy_partitioner", False) - self.dpa_layer = DotProductAttention( + dpa_layer = DotProductAttention( head_dim=self.dim_head, num_attention_heads=self.heads, num_gqa_groups=self.heads, @@ -809,6 +819,9 @@ def setup(self): scale_factor=self.scale, transpose_batch_sequence=False, ) + variables = {} + self.dpa_layer = functools.partial(dpa_layer.apply, variables) + def apply_attention(self, query: Array, key: Array, value: Array, attention_mask: Array = None): return _apply_attention( @@ -867,9 +880,6 @@ def __init__( added_kv_proj_dim: Optional[int] = None, # New for I2V image_seq_len: Optional[int] = None, # New for I2V ): - if attention_kernel == "cudnn_flash_te": - raise NotImplementedError(f"Wan 2.1 has not been tested with {attention_kernel}") - if attention_kernel in {"flash", "cudnn_flash_te"} and mesh is None: raise 
ValueError(f"The flash attention kernel requires a value for mesh, but mesh is {self.mesh}") self.dim_head = dim_head @@ -1058,8 +1068,9 @@ def __call__( deterministic: bool = True, rngs: nnx.Rngs = None, ) -> jax.Array: - hidden_states = jax.lax.with_sharding_constraint(hidden_states, PartitionSpec("data", "fsdp", "tensor")) - encoder_hidden_states = jax.lax.with_sharding_constraint(encoder_hidden_states, PartitionSpec("data", "fsdp", "tensor")) + axis_names = nn.logical_to_mesh_axes((BATCH, LENGTH, HEAD)) + hidden_states = jax.lax.with_sharding_constraint(hidden_states, axis_names) + encoder_hidden_states = jax.lax.with_sharding_constraint(encoder_hidden_states, axis_names) dtype = hidden_states.dtype is_self_attention = encoder_hidden_states is None if encoder_hidden_states is None: diff --git a/src/maxdiffusion/models/wan/transformers/transformer_wan.py b/src/maxdiffusion/models/wan/transformers/transformer_wan.py index a18b127c..63db7496 100644 --- a/src/maxdiffusion/models/wan/transformers/transformer_wan.py +++ b/src/maxdiffusion/models/wan/transformers/transformer_wan.py @@ -539,7 +539,7 @@ def init_block(rngs): if scan_layers: self.blocks = init_block(rngs) else: - blocks = nnx.List([]) + blocks = [] for _ in range(num_layers): block = WanTransformerBlock( rngs=rngs, @@ -561,7 +561,7 @@ def init_block(rngs): enable_jax_named_scopes=enable_jax_named_scopes, ) blocks.append(block) - self.blocks = blocks + self.blocks = nnx.data(blocks) self.norm_out = FP32LayerNorm(rngs=rngs, dim=inner_dim, eps=eps, elementwise_affine=False) self.proj_out = nnx.Linear( diff --git a/src/maxdiffusion/train_wan.py b/src/maxdiffusion/train_wan.py index fea15720..cc246797 100644 --- a/src/maxdiffusion/train_wan.py +++ b/src/maxdiffusion/train_wan.py @@ -35,7 +35,10 @@ def main(argv: Sequence[str]) -> None: config = pyconfig.config validate_train_config(config) max_logging.log(f"Found {jax.device_count()} devices.") - flax.config.update("flax_always_shard_variable", False) + try: 
+ flax.config.update("flax_always_shard_variable", False) + except: + pass train(config) diff --git a/src/maxdiffusion/trainers/wan_trainer.py b/src/maxdiffusion/trainers/wan_trainer.py index f23836a5..8a1c6930 100644 --- a/src/maxdiffusion/trainers/wan_trainer.py +++ b/src/maxdiffusion/trainers/wan_trainer.py @@ -40,6 +40,7 @@ from flax.training import train_state from maxdiffusion.pipelines.wan.wan_pipeline import WanPipeline from jax.experimental import multihost_utils +from transformer_engine.jax.sharding import global_shard_guard, MeshResource class TrainState(train_state.TrainState): @@ -309,7 +310,8 @@ def training_loop(self, pipeline, optimizer, learning_rate_scheduler, train_data pretty_string = pprint.pformat(state_spec.opt_state, indent=4, width=60) max_logging.log(pretty_string) max_logging.log("------------------------------------------------") - max_utils.delete_pytree(params) + if self.config.hardware != 'gpu': + max_utils.delete_pytree(params) data_shardings = self.get_data_shardings(mesh) eval_data_shardings = self.get_eval_data_shardings(mesh) @@ -359,15 +361,24 @@ def training_loop(self, pipeline, optimizer, learning_rate_scheduler, train_data scheduler_state = pipeline.scheduler_state example_batch = load_next_batch(train_data_iterator, None, self.config) + # Designate the context parallel axis for sharding + cp_resource = '' + for rules in self.config.logical_axis_rules: + if rules[0] == "activation_length": + if isinstance(rules[1], list): + cp_resource = rules[1][0] + else: + cp_resource = rules[1] + mesh_resource = MeshResource(cp_resource=cp_resource) + with ThreadPoolExecutor(max_workers=1) as executor: for step in np.arange(start_step, self.config.max_train_steps): if self.config.enable_profiler and step == first_profiling_step: max_utils.activate_profiler(self.config) start_step_time = datetime.datetime.now() next_batch_future = executor.submit(load_next_batch, train_data_iterator, example_batch, self.config) - with 
jax.profiler.StepTraceAnnotation("train", step_num=step), pipeline.mesh, nn_partitioning.axis_rules( - self.config.logical_axis_rules - ): + with jax.profiler.StepTraceAnnotation("train", step_num=step), pipeline.mesh, \ + global_shard_guard(mesh_resource), nn_partitioning.axis_rules(self.config.logical_axis_rules): state, scheduler_state, train_metric, rng = p_train_step(state, example_batch, rng, scheduler_state) train_metric["scalar"]["learning/loss"].block_until_ready() last_step_completion = datetime.datetime.now() From 6d7a714723804699b49313a67acc19d711d77bb2 Mon Sep 17 00:00:00 2001 From: Carl Persson Date: Tue, 16 Dec 2025 17:04:45 +0000 Subject: [PATCH 02/12] add gpu optimized sharding parallelism --- src/maxdiffusion/configs/base_wan_14b.yml | 26 +++++++------- src/maxdiffusion/max_utils.py | 35 +++++++++++++------ .../models/wan/autoencoder_kl_wan.py | 2 +- .../wan/transformers/transformer_wan.py | 13 ++++--- 4 files changed, 47 insertions(+), 29 deletions(-) diff --git a/src/maxdiffusion/configs/base_wan_14b.yml b/src/maxdiffusion/configs/base_wan_14b.yml index b2a11dba..45db65d7 100644 --- a/src/maxdiffusion/configs/base_wan_14b.yml +++ b/src/maxdiffusion/configs/base_wan_14b.yml @@ -151,7 +151,7 @@ hardware: 'tpu' # Supported hardware types are 'tpu', 'gpu' skip_jax_distributed_system: False # Parallelism -mesh_axes: ['data', 'fsdp', 'tensor'] +mesh_axes: ['data', 'tensor', 'fsdp_tpu', 'fsdp_gpu'] # batch : batch dimension of data and activations # hidden : @@ -166,32 +166,34 @@ mesh_axes: ['data', 'fsdp', 'tensor'] # conv_in : conv.shape[2] weight # conv_out : conv.shape[-1] weight logical_axis_rules: [ - ['batch', 'data'], - ['activation_batch', 'data'], - ['activation_self_attn_heads', ['fsdp', 'tensor']], - ['activation_cross_attn_q_length', ['fsdp', 'tensor']], - ['activation_length', 'fsdp'], + ['batch', ['data', 'fsdp_gpu']], + ['activation_batch', ['data', 'fsdp_gpu']], + ['activation_self_attn_heads', ['fsdp_tpu', 'tensor']], + 
['activation_cross_attn_q_length', ['fsdp_tpu', 'tensor']], + ['activation_length', 'fsdp_tpu'], ['activation_heads', 'tensor'], ['mlp','tensor'], - ['embed','fsdp'], + ['embed', ['fsdp_tpu', 'fsdp_gpu']], ['heads', 'tensor'], ['norm', 'tensor'], - ['conv_batch', ['data','fsdp']], + ['conv_batch', ['data', 'fsdp_tpu', 'fsdp_gpu']], ['out_channels', 'tensor'], - ['conv_out', 'fsdp'], + ['conv_out', 'fsdp_tpu'], ] -data_sharding: [['data', 'fsdp', 'tensor']] +data_sharding: [['data', 'tensor', 'fsdp_tpu', 'fsdp_gpu']] # One axis for each parallelism type may hold a placeholder (-1) # value to auto-shard based on available slices and devices. # By default, product of the DCN axes should equal number of slices # and product of the ICI axes should equal number of devices per slice. dcn_data_parallelism: 1 # recommended DCN axis to be auto-sharded -dcn_fsdp_parallelism: -1 dcn_tensor_parallelism: 1 +dcn_fsdp_tpu_parallelism: -1 +dcn_fsdp_gpu_parallelism: 1 # recommended DCN axis to be auto-sharded ici_data_parallelism: 1 -ici_fsdp_parallelism: -1 # recommended ICI axis to be auto-sharded ici_tensor_parallelism: 1 +ici_fsdp_tpu_parallelism: -1 +ici_fsdp_gpu_parallelism: 1 # recommended ICI axis to be auto-sharded allow_split_physical_axes: False diff --git a/src/maxdiffusion/max_utils.py b/src/maxdiffusion/max_utils.py index fb7266a1..b289a8ad 100644 --- a/src/maxdiffusion/max_utils.py +++ b/src/maxdiffusion/max_utils.py @@ -268,17 +268,30 @@ def create_device_mesh(config, devices=None, logging=True): max_logging.log(f"Devices: {devices} (num_devices: {num_devices})") multi_slice_env = num_slices > 1 - - dcn_parallelism = [ - config.dcn_data_parallelism, - config.dcn_fsdp_parallelism, - config.dcn_tensor_parallelism, - ] - ici_parallelism = [ - config.ici_data_parallelism, - config.ici_fsdp_parallelism, - config.ici_tensor_parallelism, - ] + if "dcn_fsdp_tpu_parallelism" in config.get_keys(): + dcn_parallelism = [ + config.dcn_data_parallelism, + 
config.dcn_tensor_parallelism, + config.dcn_fsdp_tpu_parallelism, + config.dcn_fsdp_gpu_parallelism, + ] + ici_parallelism = [ + config.ici_data_parallelism, + config.ici_tensor_parallelism, + config.ici_fsdp_tpu_parallelism, + config.ici_fsdp_gpu_parallelism, + ] + else: + dcn_parallelism = [ + config.dcn_data_parallelism, + config.dcn_fsdp_parallelism, + config.dcn_tensor_parallelism, + ] + ici_parallelism = [ + config.ici_data_parallelism, + config.ici_fsdp_parallelism, + config.ici_tensor_parallelism, + ] # Find possible unspecified parallelisms ici_parallelism = fill_unspecified_mesh_axes(ici_parallelism, num_devices_per_slice, "ICI") diff --git a/src/maxdiffusion/models/wan/autoencoder_kl_wan.py b/src/maxdiffusion/models/wan/autoencoder_kl_wan.py index 1da2d18f..a91d96f6 100644 --- a/src/maxdiffusion/models/wan/autoencoder_kl_wan.py +++ b/src/maxdiffusion/models/wan/autoencoder_kl_wan.py @@ -73,7 +73,7 @@ def __init__( self._depth_padding_before = self._causal_padding[1][0] # 2 * padding_tuple[0] # Set sharding dynamically based on out_channels. 
- num_fsdp_axis_devices = mesh.device_ids.shape[1] + num_fsdp_axis_devices = mesh.device_ids.shape[2] kernel_sharding = (None, None, None, None, None) if out_channels % num_fsdp_axis_devices == 0: kernel_sharding = (None, None, None, None, "conv_out") diff --git a/src/maxdiffusion/models/wan/transformers/transformer_wan.py b/src/maxdiffusion/models/wan/transformers/transformer_wan.py index 63db7496..2693c7c4 100644 --- a/src/maxdiffusion/models/wan/transformers/transformer_wan.py +++ b/src/maxdiffusion/models/wan/transformers/transformer_wan.py @@ -378,12 +378,15 @@ def __call__( encoder_attention_mask: Optional[jax.Array] = None, ): with self.conditional_named_scope("transformer_block"): - shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = jnp.split( - (self.adaln_scale_shift_table + temb.astype(jnp.float32)), 6, axis=1 - ) - hidden_states = jax.lax.with_sharding_constraint(hidden_states, PartitionSpec("data", "fsdp", "tensor")) + with self.conditional_named_scope("adaln"): + shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = jnp.split( + (self.adaln_scale_shift_table + temb.astype(jnp.float32)), 6, axis=1 + ) + axis_names = nn.logical_to_mesh_axes(("activation_batch", "activation_length", "activation_heads")) + hidden_states = jax.lax.with_sharding_constraint(hidden_states, axis_names) hidden_states = checkpoint_name(hidden_states, "hidden_states") - encoder_hidden_states = jax.lax.with_sharding_constraint(encoder_hidden_states, PartitionSpec("data", "fsdp", None)) + axis_names = nn.logical_to_mesh_axes(("activation_batch", "activation_length", "activation_kv")) + encoder_hidden_states = jax.lax.with_sharding_constraint(encoder_hidden_states, axis_names) # 1. 
Self-attention with self.conditional_named_scope("self_attn"): From 70bc2bdc3a7ec81b1b2f453c7f0c7333c524897e Mon Sep 17 00:00:00 2001 From: Carl Persson Date: Tue, 16 Dec 2025 18:53:49 +0000 Subject: [PATCH 03/12] sharding bugfixes --- src/maxdiffusion/models/attention_flax.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/maxdiffusion/models/attention_flax.py b/src/maxdiffusion/models/attention_flax.py index 438d0bcb..3048e57d 100644 --- a/src/maxdiffusion/models/attention_flax.py +++ b/src/maxdiffusion/models/attention_flax.py @@ -92,7 +92,7 @@ def _reshape_batch_dim_to_heads(tensor, heads): tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim) tensor = jnp.transpose(tensor, (0, 2, 1, 3)) reshaped_tensor = tensor.reshape(batch_size // head_size, seq_len, dim * head_size) - axis_names = nn.logical_to_mesh_axes((BATCH, LENGTH, HEAD, D_KV)) + axis_names = nn.logical_to_mesh_axes((BATCH, LENGTH, HEAD)) return jax.lax.with_sharding_constraint(reshaped_tensor, axis_names) @@ -106,7 +106,7 @@ def _reshape_heads_to_batch_dim(tensor, heads): else: batch_size, head_size, seq_len, head_dim = tensor.shape reshaped_tensor = tensor.reshape(batch_size * head_size, seq_len, head_dim) - axis_names = nn.logical_to_mesh_axes((BATCH, LENGTH, HEAD, D_KV)) + axis_names = nn.logical_to_mesh_axes((BATCH, LENGTH, HEAD)) return jax.lax.with_sharding_constraint(reshaped_tensor, axis_names) @@ -116,7 +116,7 @@ def _reshape_heads_to_head_dim(tensor): b, h, s, d = tensor.shape tensor = jnp.transpose(tensor, axes=[0, 2, 1, 3]) reshaped_tensor = jnp.reshape(tensor, (b, -1, h * d)) - axis_names = nn.logical_to_mesh_axes((BATCH, LENGTH, HEAD, D_KV)) + axis_names = nn.logical_to_mesh_axes((BATCH, LENGTH, HEAD)) return jax.lax.with_sharding_constraint(reshaped_tensor, axis_names) @@ -253,7 +253,11 @@ def _tpu_flash_attention( block_kv_dq=None if attention_kernel == "tokamax_flash" else min(kv_max_block_size, query.shape[2]), 
use_fused_bwd_kernel=True if attention_kernel == "tokamax_flash" else False, ) - num_fsdp_shards = mesh.shape["fsdp"] + fsdp_key = "fsdp" + if "fsdp_tpu" in mesh.shape.keys(): + fsdp_key = "fsdp_tpu" + + num_fsdp_shards = mesh.shape[fsdp_key] query = _reshape_data_for_flash(query, heads) key = _reshape_data_for_flash(key, heads) value = _reshape_data_for_flash(value, heads) @@ -355,13 +359,13 @@ def wrap_flash_attention(query, key, value): perm = [(j, (j + 1) % num_fsdp_shards) for j in range(num_fsdp_shards)] - k1 = jax.lax.ppermute(key, axis_name="fsdp", perm=perm) - v1 = jax.lax.ppermute(value, axis_name="fsdp", perm=perm) + k1 = jax.lax.ppermute(key, axis_name=fsdp_key, perm=perm) + v1 = jax.lax.ppermute(value, axis_name=fsdp_key, perm=perm) def ring_scan_body(carry, _): m, l, o, k_current, v_current = carry - k_next = jax.lax.ppermute(k_current, axis_name="fsdp", perm=perm) - v_next = jax.lax.ppermute(v_current, axis_name="fsdp", perm=perm) + k_next = jax.lax.ppermute(k_current, axis_name=fsdp_key, perm=perm) + v_next = jax.lax.ppermute(v_current, axis_name=fsdp_key, perm=perm) out_chunk, (lse_chunk,) = vmapped_splash(query, k_current, v_current, segment_ids) @@ -388,7 +392,7 @@ def ring_scan_body(carry, _): return attention_output[:, :, :query_seq_len, :kv_size].astype(query.dtype) - devices_in_data_fsdp = mesh.shape["data"] * mesh.shape["fsdp"] + devices_in_data_fsdp = mesh.shape["data"] * mesh.shape[fsdp_key] # This warning might show up when doing model eval for example, when calculating model flops # and that is expected. 
if not (query.shape[0] / devices_in_data_fsdp).is_integer(): @@ -721,6 +725,7 @@ def __init__( mask_padding_tokens: bool = True, residual_checkpoint_name: str | None = None, ): + self.dpa_layer = None if attention_kernel == "cudnn_flash_te": from transformer_engine.jax.flax.transformer import DotProductAttention # pytype: disable=import-error jax.config.update("jax_use_shardy_partitioner", False) From cf8220be5b50c5bbff25d57a9bce8fb96aa1de80 Mon Sep 17 00:00:00 2001 From: Carl Persson Date: Wed, 17 Dec 2025 09:11:49 +0000 Subject: [PATCH 04/12] generalize across sharding parallelisms --- src/maxdiffusion/max_utils.py | 12 ++++++++++++ src/maxdiffusion/models/attention_flax.py | 6 ++---- src/maxdiffusion/models/wan/autoencoder_kl_wan.py | 6 +++++- src/maxdiffusion/trainers/wan_trainer.py | 12 +++--------- 4 files changed, 22 insertions(+), 14 deletions(-) diff --git a/src/maxdiffusion/max_utils.py b/src/maxdiffusion/max_utils.py index b289a8ad..00edeffe 100644 --- a/src/maxdiffusion/max_utils.py +++ b/src/maxdiffusion/max_utils.py @@ -664,3 +664,15 @@ def maybe_initialize_jax_distributed_system(raw_keys): max_logging.log("Jax distributed system initialized on GPU!") else: jax.distributed.initialize() + +def get_axis_names(axis_key: str, config=None) -> str: + """Returns the mesh axis names given the logical axis key from config.logical_axis_rules.""" + axis_name = '' + if config: + axis_rules = config.logical_axis_rules + else: + axis_rules = nn.get_logical_axis_rules() + for rules in axis_rules: + if rules[0] == axis_key: + axis_name = rules[1] + return axis_name diff --git a/src/maxdiffusion/models/attention_flax.py b/src/maxdiffusion/models/attention_flax.py index 3048e57d..5c1b8c7a 100644 --- a/src/maxdiffusion/models/attention_flax.py +++ b/src/maxdiffusion/models/attention_flax.py @@ -29,6 +29,7 @@ from tokamax._src.ops.experimental.tpu.splash_attention import splash_attention_kernel as tokamax_splash_attention_kernel from einops import rearrange from .. 
import common_types, max_logging +from .. import max_utils from . import quantizations from .modeling_flax_utils import get_activation @@ -253,10 +254,7 @@ def _tpu_flash_attention( block_kv_dq=None if attention_kernel == "tokamax_flash" else min(kv_max_block_size, query.shape[2]), use_fused_bwd_kernel=True if attention_kernel == "tokamax_flash" else False, ) - fsdp_key = "fsdp" - if "fsdp_tpu" in mesh.shape.keys(): - fsdp_key = "fsdp_tpu" - + fsdp_key = max_utils.get_axis_names("activation_length") num_fsdp_shards = mesh.shape[fsdp_key] query = _reshape_data_for_flash(query, heads) key = _reshape_data_for_flash(key, heads) diff --git a/src/maxdiffusion/models/wan/autoencoder_kl_wan.py b/src/maxdiffusion/models/wan/autoencoder_kl_wan.py index a91d96f6..91ceb6e2 100644 --- a/src/maxdiffusion/models/wan/autoencoder_kl_wan.py +++ b/src/maxdiffusion/models/wan/autoencoder_kl_wan.py @@ -21,6 +21,7 @@ import jax.numpy as jnp from flax import nnx from ...configuration_utils import ConfigMixin +from ... import max_utils from ..modeling_flax_utils import FlaxModelMixin, get_activation from ... import common_types from ..vae_flax import (FlaxAutoencoderKLOutput, FlaxDiagonalGaussianDistribution, FlaxDecoderOutput) @@ -73,7 +74,10 @@ def __init__( self._depth_padding_before = self._causal_padding[1][0] # 2 * padding_tuple[0] # Set sharding dynamically based on out_channels. 
- num_fsdp_axis_devices = mesh.device_ids.shape[2] + fspd_key = max_utils.get_axis_names("activation_length") + if not fspd_key: + fspd_key = "fsdp" + num_fsdp_axis_devices = mesh.shape[fspd_key] kernel_sharding = (None, None, None, None, None) if out_channels % num_fsdp_axis_devices == 0: kernel_sharding = (None, None, None, None, "conv_out") diff --git a/src/maxdiffusion/trainers/wan_trainer.py b/src/maxdiffusion/trainers/wan_trainer.py index 8a1c6930..f27279d9 100644 --- a/src/maxdiffusion/trainers/wan_trainer.py +++ b/src/maxdiffusion/trainers/wan_trainer.py @@ -211,8 +211,8 @@ def prepare_sample_eval(features): return data_iterator def start_training(self): - - pipeline, opt_state, step = self.checkpointer.load_checkpoint() + with nn_partitioning.axis_rules(self.config.logical_axis_rules): + pipeline, opt_state, step = self.checkpointer.load_checkpoint() restore_args = {} if opt_state and step: restore_args = {"opt_state": opt_state, "step": step} @@ -362,13 +362,7 @@ def training_loop(self, pipeline, optimizer, learning_rate_scheduler, train_data example_batch = load_next_batch(train_data_iterator, None, self.config) # Designate the context parallel axis for sharding - cp_resource = '' - for rules in self.config.logical_axis_rules: - if rules[0] == "activation_length": - if isinstance(rules[1], list): - cp_resource = rules[1][0] - else: - cp_resource = rules[1] + cp_resource = max_utils.get_axis_names("activation_length", config=self.config) mesh_resource = MeshResource(cp_resource=cp_resource) with ThreadPoolExecutor(max_workers=1) as executor: From cf3eb63ccce44e265ad4b2286261a72591fc07e4 Mon Sep 17 00:00:00 2001 From: Carl Persson Date: Thu, 8 Jan 2026 11:06:54 +0000 Subject: [PATCH 05/12] fix issue with inference using fsdp + te flash attention --- src/maxdiffusion/generate_wan.py | 5 ++++- .../pipelines/wan/wan_pipeline_2_1.py | 12 +++++++++++- .../pipelines/wan/wan_pipeline_2_2.py | 12 ++++++++++-- src/maxdiffusion/trainers/wan_trainer.py | 17 
+++++++++++------ 4 files changed, 36 insertions(+), 10 deletions(-) diff --git a/src/maxdiffusion/generate_wan.py b/src/maxdiffusion/generate_wan.py index d3aad31d..7ed4d5a3 100644 --- a/src/maxdiffusion/generate_wan.py +++ b/src/maxdiffusion/generate_wan.py @@ -248,7 +248,10 @@ def run(config, pipeline=None, filename_prefix=""): def main(argv: Sequence[str]) -> None: pyconfig.initialize(argv) - flax.config.update("flax_always_shard_variable", False) + try: + flax.config.update("flax_always_shard_variable", False) + except: + pass run(pyconfig.config) diff --git a/src/maxdiffusion/pipelines/wan/wan_pipeline_2_1.py b/src/maxdiffusion/pipelines/wan/wan_pipeline_2_1.py index 5617e3b7..0ff54a70 100644 --- a/src/maxdiffusion/pipelines/wan/wan_pipeline_2_1.py +++ b/src/maxdiffusion/pipelines/wan/wan_pipeline_2_1.py @@ -17,17 +17,20 @@ from typing import List, Union, Optional from ...pyconfig import HyperParameters from functools import partial +from contextlib import nullcontext from flax import nnx from flax.linen import partitioning as nn_partitioning import jax import jax.numpy as jnp from ...schedulers.scheduling_unipc_multistep_flax import FlaxUniPCMultistepScheduler +from maxdiffusion import max_utils class WanPipeline2_1(WanPipeline): """Pipeline for WAN 2.1 with a single transformer.""" def __init__(self, config: HyperParameters, transformer: Optional[WanModel], **kwargs): super().__init__(config=config, **kwargs) self.transformer = transformer + self.config = config @classmethod def _load_and_init(cls, config, restored_checkpoint=None, vae_only=False, load_transformer=True): @@ -113,8 +116,15 @@ def __call__( scheduler=self.scheduler, scheduler_state=scheduler_state, ) + # Set the TE shard_guard context_manager if using TE cudnn_flash attention + if self.config.attention == "cudnn_flash_te": + from transformer_engine.jax.sharding import global_shard_guard, MeshResource # pytype: disable=import-error + cp_resource = max_utils.get_axis_names("activation_length", 
config=self.config) + shard_guard = global_shard_guard(MeshResource(cp_resource=cp_resource)) + else: + shard_guard = nullcontext() - with self.mesh, nn_partitioning.axis_rules(self.config.logical_axis_rules): + with self.mesh, nn_partitioning.axis_rules(self.config.logical_axis_rules), shard_guard: latents = p_run_inference( graphdef=graphdef, sharded_state=state, diff --git a/src/maxdiffusion/pipelines/wan/wan_pipeline_2_2.py b/src/maxdiffusion/pipelines/wan/wan_pipeline_2_2.py index c0400f60..2f972266 100644 --- a/src/maxdiffusion/pipelines/wan/wan_pipeline_2_2.py +++ b/src/maxdiffusion/pipelines/wan/wan_pipeline_2_2.py @@ -17,6 +17,7 @@ from typing import List, Union, Optional from ...pyconfig import HyperParameters from functools import partial +from contextlib import nullcontext from flax import nnx from flax.linen import partitioning as nn_partitioning import jax @@ -127,8 +128,15 @@ def __call__( scheduler=self.scheduler, scheduler_state=scheduler_state, ) - - with self.mesh, nn_partitioning.axis_rules(self.config.logical_axis_rules): + # Set the TE shard_guard context_manager if using TE cudnn_flash attention + if self.config.attention == "cudnn_flash_te": + from transformer_engine.jax.sharding import global_shard_guard, MeshResource # pytype: disable=import-error + cp_resource = max_utils.get_axis_names("activation_length", config=self.config) + shard_guard = global_shard_guard(MeshResource(cp_resource=cp_resource)) + else: + shard_guard = nullcontext() + + with self.mesh, nn_partitioning.axis_rules(self.config.logical_axis_rules), shard_guard: latents = p_run_inference( low_noise_graphdef=low_noise_graphdef, low_noise_state=low_noise_state, diff --git a/src/maxdiffusion/trainers/wan_trainer.py b/src/maxdiffusion/trainers/wan_trainer.py index f27279d9..bdca7090 100644 --- a/src/maxdiffusion/trainers/wan_trainer.py +++ b/src/maxdiffusion/trainers/wan_trainer.py @@ -20,6 +20,7 @@ import pprint import numpy as np import threading +from contextlib import 
nullcontext from concurrent.futures import ThreadPoolExecutor import tensorflow as tf import jax.numpy as jnp @@ -40,7 +41,6 @@ from flax.training import train_state from maxdiffusion.pipelines.wan.wan_pipeline import WanPipeline from jax.experimental import multihost_utils -from transformer_engine.jax.sharding import global_shard_guard, MeshResource class TrainState(train_state.TrainState): @@ -361,18 +361,23 @@ def training_loop(self, pipeline, optimizer, learning_rate_scheduler, train_data scheduler_state = pipeline.scheduler_state example_batch = load_next_batch(train_data_iterator, None, self.config) - # Designate the context parallel axis for sharding - cp_resource = max_utils.get_axis_names("activation_length", config=self.config) - mesh_resource = MeshResource(cp_resource=cp_resource) - with ThreadPoolExecutor(max_workers=1) as executor: for step in np.arange(start_step, self.config.max_train_steps): if self.config.enable_profiler and step == first_profiling_step: max_utils.activate_profiler(self.config) start_step_time = datetime.datetime.now() + + # Designate the context parallel axis for sharding + if self.config.attention == "cudnn_flash_te": + from transformer_engine.jax.sharding import global_shard_guard, MeshResource # pytype: disable=import-error + cp_resource = max_utils.get_axis_names("activation_length", config=self.config) + shard_guard = global_shard_guard(MeshResource(cp_resource=cp_resource)) + else: + shard_guard = nullcontext() + next_batch_future = executor.submit(load_next_batch, train_data_iterator, example_batch, self.config) with jax.profiler.StepTraceAnnotation("train", step_num=step), pipeline.mesh, \ - global_shard_guard(mesh_resource), nn_partitioning.axis_rules(self.config.logical_axis_rules): + shard_guard, nn_partitioning.axis_rules(self.config.logical_axis_rules): state, scheduler_state, train_metric, rng = p_train_step(state, example_batch, rng, scheduler_state) train_metric["scalar"]["learning/loss"].block_until_ready() 
last_step_completion = datetime.datetime.now() From 48b63ce5442984f4e8c0007e2b31cd44de802eb0 Mon Sep 17 00:00:00 2001 From: Carl Persson Date: Thu, 8 Jan 2026 12:14:28 +0000 Subject: [PATCH 06/12] revert fsdp_tpu name change --- src/maxdiffusion/configs/base_wan_14b.yml | 28 +++++++++++------------ src/maxdiffusion/max_utils.py | 10 ++++---- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/maxdiffusion/configs/base_wan_14b.yml b/src/maxdiffusion/configs/base_wan_14b.yml index 45db65d7..466efd0f 100644 --- a/src/maxdiffusion/configs/base_wan_14b.yml +++ b/src/maxdiffusion/configs/base_wan_14b.yml @@ -151,7 +151,7 @@ hardware: 'tpu' # Supported hardware types are 'tpu', 'gpu' skip_jax_distributed_system: False # Parallelism -mesh_axes: ['data', 'tensor', 'fsdp_tpu', 'fsdp_gpu'] +mesh_axes: ['data', 'tensor', 'fsdp', 'fsdp_batch'] # batch : batch dimension of data and activations # hidden : @@ -166,21 +166,21 @@ mesh_axes: ['data', 'tensor', 'fsdp_tpu', 'fsdp_gpu'] # conv_in : conv.shape[2] weight # conv_out : conv.shape[-1] weight logical_axis_rules: [ - ['batch', ['data', 'fsdp_gpu']], - ['activation_batch', ['data', 'fsdp_gpu']], - ['activation_self_attn_heads', ['fsdp_tpu', 'tensor']], - ['activation_cross_attn_q_length', ['fsdp_tpu', 'tensor']], - ['activation_length', 'fsdp_tpu'], + ['batch', ['data', 'fsdp_batch']], + ['activation_batch', ['data', 'fsdp_batch']], + ['activation_self_attn_heads', ['fsdp', 'tensor']], + ['activation_cross_attn_q_length', ['fsdp', 'tensor']], + ['activation_length', 'fsdp'], ['activation_heads', 'tensor'], ['mlp','tensor'], - ['embed', ['fsdp_tpu', 'fsdp_gpu']], + ['embed', ['fsdp', 'fsdp_batch']], ['heads', 'tensor'], ['norm', 'tensor'], - ['conv_batch', ['data', 'fsdp_tpu', 'fsdp_gpu']], + ['conv_batch', ['data', 'fsdp', 'fsdp_batch']], ['out_channels', 'tensor'], - ['conv_out', 'fsdp_tpu'], + ['conv_out', 'fsdp'], ] -data_sharding: [['data', 'tensor', 'fsdp_tpu', 'fsdp_gpu']] +data_sharding: [['data', 
'tensor', 'fsdp', 'fsdp_batch']] # One axis for each parallelism type may hold a placeholder (-1) # value to auto-shard based on available slices and devices. @@ -188,12 +188,12 @@ data_sharding: [['data', 'tensor', 'fsdp_tpu', 'fsdp_gpu']] # and product of the ICI axes should equal number of devices per slice. dcn_data_parallelism: 1 # recommended DCN axis to be auto-sharded dcn_tensor_parallelism: 1 -dcn_fsdp_tpu_parallelism: -1 -dcn_fsdp_gpu_parallelism: 1 # recommended DCN axis to be auto-sharded +dcn_fsdp_parallelism: -1 +dcn_fsdp_batch_parallelism: 1 # recommended DCN axis to be auto-sharded ici_data_parallelism: 1 ici_tensor_parallelism: 1 -ici_fsdp_tpu_parallelism: -1 -ici_fsdp_gpu_parallelism: 1 # recommended ICI axis to be auto-sharded +ici_fsdp_parallelism: -1 +ici_fsdp_batch_parallelism: 1 # recommended ICI axis to be auto-sharded allow_split_physical_axes: False diff --git a/src/maxdiffusion/max_utils.py b/src/maxdiffusion/max_utils.py index 00edeffe..57b453e1 100644 --- a/src/maxdiffusion/max_utils.py +++ b/src/maxdiffusion/max_utils.py @@ -268,18 +268,18 @@ def create_device_mesh(config, devices=None, logging=True): max_logging.log(f"Devices: {devices} (num_devices: {num_devices})") multi_slice_env = num_slices > 1 - if "dcn_fsdp_tpu_parallelism" in config.get_keys(): + if "dcn_fsdp_batch_parallelism" in config.get_keys(): dcn_parallelism = [ config.dcn_data_parallelism, config.dcn_tensor_parallelism, - config.dcn_fsdp_tpu_parallelism, - config.dcn_fsdp_gpu_parallelism, + config.dcn_fsdp_parallelism, + config.dcn_fsdp_batch_parallelism, ] ici_parallelism = [ config.ici_data_parallelism, config.ici_tensor_parallelism, - config.ici_fsdp_tpu_parallelism, - config.ici_fsdp_gpu_parallelism, + config.ici_fsdp_parallelism, + config.ici_fsdp_batch_parallelism, ] else: dcn_parallelism = [ From 13f6408dca240c2730c14bba8990f9a74503611f Mon Sep 17 00:00:00 2001 From: Carl Persson Date: Thu, 8 Jan 2026 12:43:12 +0000 Subject: [PATCH 07/12] update readme with 
wan2.1 gpu notes --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index e6584d23..81b166ae 100644 --- a/README.md +++ b/README.md @@ -255,6 +255,9 @@ After installation completes, run the training script. - In Wan2.1, the ici_fsdp_parallelism axis is used for sequence parallelism, the ici_tensor_parallelism axis is used for head parallelism. - You can enable both, keeping in mind that Wan2.1 has 40 heads and 40 must be evenly divisible by ici_tensor_parallelism. - For Sequence parallelism, the code pads the sequence length to evenly divide the sequence. Try out different ici_fsdp_parallelism numbers, but we find 2 and 4 to be the best right now. + - For use on GPU it is recommended to enable the cudnn_flash_te attention kernel for optimal performance. + - Best performance is achieved with the use of batch parallelism, which can be enabled by using the ici_fsdp_batch_parallelism axis. Note that this parallelism strategy does not support fractional batch sizes. + - ici_fsdp_batch_parallelism and ici_fsdp_parallelism can be combined to allow for fractional batch sizes. However, padding is not currently supported for the cudnn_flash_te attention kernel and it is therefore required that the sequence length is divisible by the number of devices in the ici_fsdp_parallelism axis. 
You should eventually see a training run as: From f6e284e668997748c37def4a0f453992718d0b50 Mon Sep 17 00:00:00 2001 From: Carl Persson Date: Fri, 9 Jan 2026 11:20:10 +0000 Subject: [PATCH 08/12] re-order parallelism axes and revert dynamic context parallel axes selection --- src/maxdiffusion/configs/base_wan_14b.yml | 12 ++--- src/maxdiffusion/max_utils.py | 20 ++------- src/maxdiffusion/models/attention_flax.py | 44 +++++++++++++++++-- .../models/wan/autoencoder_kl_wan.py | 5 +-- .../pipelines/wan/wan_pipeline_2_1.py | 4 +- .../pipelines/wan/wan_pipeline_2_2.py | 3 +- src/maxdiffusion/pyconfig.py | 4 +- src/maxdiffusion/trainers/wan_trainer.py | 3 +- 8 files changed, 56 insertions(+), 39 deletions(-) diff --git a/src/maxdiffusion/configs/base_wan_14b.yml b/src/maxdiffusion/configs/base_wan_14b.yml index 466efd0f..26ced51e 100644 --- a/src/maxdiffusion/configs/base_wan_14b.yml +++ b/src/maxdiffusion/configs/base_wan_14b.yml @@ -151,7 +151,7 @@ hardware: 'tpu' # Supported hardware types are 'tpu', 'gpu' skip_jax_distributed_system: False # Parallelism -mesh_axes: ['data', 'tensor', 'fsdp', 'fsdp_batch'] +mesh_axes: ['data', 'fsdp_batch', 'fsdp', 'tensor'] # batch : batch dimension of data and activations # hidden : @@ -180,20 +180,20 @@ logical_axis_rules: [ ['out_channels', 'tensor'], ['conv_out', 'fsdp'], ] -data_sharding: [['data', 'tensor', 'fsdp', 'fsdp_batch']] +data_sharding: [['data', 'fsdp_batch', 'fsdp', 'tensor']] # One axis for each parallelism type may hold a placeholder (-1) # value to auto-shard based on available slices and devices. # By default, product of the DCN axes should equal number of slices # and product of the ICI axes should equal number of devices per slice. 
dcn_data_parallelism: 1 # recommended DCN axis to be auto-sharded -dcn_tensor_parallelism: 1 +dcn_fsdp_batch_parallelism: 1 dcn_fsdp_parallelism: -1 -dcn_fsdp_batch_parallelism: 1 # recommended DCN axis to be auto-sharded +dcn_tensor_parallelism: 1 ici_data_parallelism: 1 +ici_fsdp_batch_parallelism: 1 +ici_fsdp_parallelism: -1 # recommended ICI axis to be auto-sharded ici_tensor_parallelism: 1 -ici_fsdp_parallelism: -1 -ici_fsdp_batch_parallelism: 1 # recommended ICI axis to be auto-sharded allow_split_physical_axes: False diff --git a/src/maxdiffusion/max_utils.py b/src/maxdiffusion/max_utils.py index 57b453e1..9ca25056 100644 --- a/src/maxdiffusion/max_utils.py +++ b/src/maxdiffusion/max_utils.py @@ -271,15 +271,15 @@ def create_device_mesh(config, devices=None, logging=True): if "dcn_fsdp_batch_parallelism" in config.get_keys(): dcn_parallelism = [ config.dcn_data_parallelism, - config.dcn_tensor_parallelism, - config.dcn_fsdp_parallelism, config.dcn_fsdp_batch_parallelism, + config.dcn_fsdp_parallelism, + config.dcn_tensor_parallelism, ] ici_parallelism = [ config.ici_data_parallelism, - config.ici_tensor_parallelism, - config.ici_fsdp_parallelism, config.ici_fsdp_batch_parallelism, + config.ici_fsdp_parallelism, + config.ici_tensor_parallelism, ] else: dcn_parallelism = [ @@ -664,15 +664,3 @@ def maybe_initialize_jax_distributed_system(raw_keys): max_logging.log("Jax distributed system initialized on GPU!") else: jax.distributed.initialize() - -def get_axis_names(axis_key: str, config=None) -> str: - """Returns the mesh axis names given the logical axis key from config.logical_axis_rules.""" - axis_name = '' - if config: - axis_rules = config.logical_axis_rules - else: - axis_rules = nn.get_logical_axis_rules() - for rules in axis_rules: - if rules[0] == axis_key: - axis_name = rules[1] - return axis_name diff --git a/src/maxdiffusion/models/attention_flax.py b/src/maxdiffusion/models/attention_flax.py index 5c1b8c7a..4ef87f41 100644 --- 
a/src/maxdiffusion/models/attention_flax.py +++ b/src/maxdiffusion/models/attention_flax.py @@ -254,8 +254,7 @@ def _tpu_flash_attention( block_kv_dq=None if attention_kernel == "tokamax_flash" else min(kv_max_block_size, query.shape[2]), use_fused_bwd_kernel=True if attention_kernel == "tokamax_flash" else False, ) - fsdp_key = max_utils.get_axis_names("activation_length") - num_fsdp_shards = mesh.shape[fsdp_key] + num_fsdp_shards = mesh.shape["fsdp"] query = _reshape_data_for_flash(query, heads) key = _reshape_data_for_flash(key, heads) value = _reshape_data_for_flash(value, heads) @@ -386,11 +385,48 @@ def ring_scan_body(carry, _): attention_output = o_final / l_final[..., None] else: - raise ValueError("ring attention requires fsdp > 1") + if num_fsdp_shards > 1: + out, (lse,) = vmapped_splash(query, key, value, segment_ids) + m = lse.astype(jnp.float32) + l = jnp.exp(lse - m) + o = out.astype(jnp.float32) * l[..., None] + + perm = [(j, (j + 1) % num_fsdp_shards) for j in range(num_fsdp_shards)] + + k1 = jax.lax.ppermute(key, axis_name="fsdp", perm=perm) + v1 = jax.lax.ppermute(value, axis_name="fsdp", perm=perm) + + def ring_scan_body(carry, _): + m, l, o, k_current, v_current = carry + k_next = jax.lax.ppermute(k_current, axis_name="fsdp", perm=perm) + v_next = jax.lax.ppermute(v_current, axis_name="fsdp", perm=perm) + + out_chunk, (lse_chunk,) = vmapped_splash(query, k_current, v_current, segment_ids) + + m_chunk = lse_chunk.astype(jnp.float32) + m_old = m + m = jnp.maximum(m_old, m_chunk) + + exp_m_diff = jnp.exp(m_old - m) + exp_m_chunk_diff = jnp.exp(m_chunk - m) + + l = l * exp_m_diff + jnp.exp(lse_chunk - m) + o = o * exp_m_diff[..., None] + o += exp_m_chunk_diff[..., None] * out_chunk.astype(jnp.float32) + + # Return the updated state for the next iteration + return (m, l, o, k_next, v_next), None + + initial_carry = (m, l, o, k1, v1) + (m_final, l_final, o_final, _, _), _ = jax.lax.scan(ring_scan_body, initial_carry, None, length=num_fsdp_shards - 1) 
+ + attention_output = o_final / l_final[..., None] + else: + raise ValueError("ring attention requires fsdp > 1") return attention_output[:, :, :query_seq_len, :kv_size].astype(query.dtype) - devices_in_data_fsdp = mesh.shape["data"] * mesh.shape[fsdp_key] + devices_in_data_fsdp = mesh.shape["data"] * mesh.shape["fsdp"] # This warning might show up when doing model eval for example, when calculating model flops # and that is expected. if not (query.shape[0] / devices_in_data_fsdp).is_integer(): diff --git a/src/maxdiffusion/models/wan/autoencoder_kl_wan.py b/src/maxdiffusion/models/wan/autoencoder_kl_wan.py index 91ceb6e2..9e8dead1 100644 --- a/src/maxdiffusion/models/wan/autoencoder_kl_wan.py +++ b/src/maxdiffusion/models/wan/autoencoder_kl_wan.py @@ -74,10 +74,7 @@ def __init__( self._depth_padding_before = self._causal_padding[1][0] # 2 * padding_tuple[0] # Set sharding dynamically based on out_channels. - fspd_key = max_utils.get_axis_names("activation_length") - if not fspd_key: - fspd_key = "fsdp" - num_fsdp_axis_devices = mesh.shape[fspd_key] + num_fsdp_axis_devices = mesh.shape["fsdp"] kernel_sharding = (None, None, None, None, None) if out_channels % num_fsdp_axis_devices == 0: kernel_sharding = (None, None, None, None, "conv_out") diff --git a/src/maxdiffusion/pipelines/wan/wan_pipeline_2_1.py b/src/maxdiffusion/pipelines/wan/wan_pipeline_2_1.py index 0ff54a70..34e9d63a 100644 --- a/src/maxdiffusion/pipelines/wan/wan_pipeline_2_1.py +++ b/src/maxdiffusion/pipelines/wan/wan_pipeline_2_1.py @@ -30,7 +30,6 @@ class WanPipeline2_1(WanPipeline): def __init__(self, config: HyperParameters, transformer: Optional[WanModel], **kwargs): super().__init__(config=config, **kwargs) self.transformer = transformer - self.config = config @classmethod def _load_and_init(cls, config, restored_checkpoint=None, vae_only=False, load_transformer=True): @@ -119,8 +118,7 @@ def __call__( # Set the TE shard_guard context_manager if using TE cudnn_flash attention if 
self.config.attention == "cudnn_flash_te": from transformer_engine.jax.sharding import global_shard_guard, MeshResource # pytype: disable=import-error - cp_resource = max_utils.get_axis_names("activation_length", config=self.config) - shard_guard = global_shard_guard(MeshResource(cp_resource=cp_resource)) + shard_guard = global_shard_guard(MeshResource(cp_resource="fsdp")) else: shard_guard = nullcontext() diff --git a/src/maxdiffusion/pipelines/wan/wan_pipeline_2_2.py b/src/maxdiffusion/pipelines/wan/wan_pipeline_2_2.py index 2f972266..b9eba362 100644 --- a/src/maxdiffusion/pipelines/wan/wan_pipeline_2_2.py +++ b/src/maxdiffusion/pipelines/wan/wan_pipeline_2_2.py @@ -131,8 +131,7 @@ def __call__( # Set the TE shard_guard context_manager if using TE cudnn_flash attention if self.config.attention == "cudnn_flash_te": from transformer_engine.jax.sharding import global_shard_guard, MeshResource # pytype: disable=import-error - cp_resource = max_utils.get_axis_names("activation_length", config=self.config) - shard_guard = global_shard_guard(MeshResource(cp_resource=cp_resource)) + shard_guard = global_shard_guard(MeshResource(cp_resource="fsdp")) else: shard_guard = nullcontext() diff --git a/src/maxdiffusion/pyconfig.py b/src/maxdiffusion/pyconfig.py index 27c9f645..8df2e1e7 100644 --- a/src/maxdiffusion/pyconfig.py +++ b/src/maxdiffusion/pyconfig.py @@ -195,8 +195,8 @@ def user_init(raw_keys): raw_keys["logical_axis_rules"] = _lists_to_tuples(raw_keys["logical_axis_rules"]) # Verify qkv is sharded across sequence. 
- if raw_keys["attention"] == "ring" or raw_keys["attention_sharding_uniform"]: - max_logging.log(f"Adding sequence sharding to q and kv if not already present because {raw_keys['attention']}=='ring' or {raw_keys['attention_sharding_uniform']} is set.") + if "ring" in raw_keys["attention"] or raw_keys["attention_sharding_uniform"]: + max_logging.log(f"Adding sequence sharding to q and kv if not already present because '{raw_keys['attention']}' contains 'ring' or {raw_keys['attention_sharding_uniform']} is set.") logical_axis_rules = list(raw_keys["logical_axis_rules"]) max_logging.log(f"Initial logical axis rules: {logical_axis_rules}") new_rules = [] diff --git a/src/maxdiffusion/trainers/wan_trainer.py b/src/maxdiffusion/trainers/wan_trainer.py index bdca7090..7a3d1bee 100644 --- a/src/maxdiffusion/trainers/wan_trainer.py +++ b/src/maxdiffusion/trainers/wan_trainer.py @@ -370,8 +370,7 @@ def training_loop(self, pipeline, optimizer, learning_rate_scheduler, train_data # Designate the context parallel axis for sharding if self.config.attention == "cudnn_flash_te": from transformer_engine.jax.sharding import global_shard_guard, MeshResource # pytype: disable=import-error - cp_resource = max_utils.get_axis_names("activation_length", config=self.config) - shard_guard = global_shard_guard(MeshResource(cp_resource=cp_resource)) + shard_guard = global_shard_guard(MeshResource(cp_resource="fsdp")) else: shard_guard = nullcontext() From cd6abf8ebb19503725fc8e9a3fe1f8ad966ea07e Mon Sep 17 00:00:00 2001 From: Carl Persson Date: Fri, 9 Jan 2026 11:29:22 +0000 Subject: [PATCH 09/12] remove unused max_utils imports --- src/maxdiffusion/models/attention_flax.py | 1 - src/maxdiffusion/models/wan/autoencoder_kl_wan.py | 1 - src/maxdiffusion/pipelines/wan/wan_pipeline_2_1.py | 1 - 3 files changed, 3 deletions(-) diff --git a/src/maxdiffusion/models/attention_flax.py b/src/maxdiffusion/models/attention_flax.py index 4ef87f41..f6132c0d 100644 --- 
a/src/maxdiffusion/models/attention_flax.py +++ b/src/maxdiffusion/models/attention_flax.py @@ -29,7 +29,6 @@ from tokamax._src.ops.experimental.tpu.splash_attention import splash_attention_kernel as tokamax_splash_attention_kernel from einops import rearrange from .. import common_types, max_logging -from .. import max_utils from . import quantizations from .modeling_flax_utils import get_activation diff --git a/src/maxdiffusion/models/wan/autoencoder_kl_wan.py b/src/maxdiffusion/models/wan/autoencoder_kl_wan.py index 9e8dead1..b21fce19 100644 --- a/src/maxdiffusion/models/wan/autoencoder_kl_wan.py +++ b/src/maxdiffusion/models/wan/autoencoder_kl_wan.py @@ -21,7 +21,6 @@ import jax.numpy as jnp from flax import nnx from ...configuration_utils import ConfigMixin -from ... import max_utils from ..modeling_flax_utils import FlaxModelMixin, get_activation from ... import common_types from ..vae_flax import (FlaxAutoencoderKLOutput, FlaxDiagonalGaussianDistribution, FlaxDecoderOutput) diff --git a/src/maxdiffusion/pipelines/wan/wan_pipeline_2_1.py b/src/maxdiffusion/pipelines/wan/wan_pipeline_2_1.py index 34e9d63a..e5f878af 100644 --- a/src/maxdiffusion/pipelines/wan/wan_pipeline_2_1.py +++ b/src/maxdiffusion/pipelines/wan/wan_pipeline_2_1.py @@ -23,7 +23,6 @@ import jax import jax.numpy as jnp from ...schedulers.scheduling_unipc_multistep_flax import FlaxUniPCMultistepScheduler -from maxdiffusion import max_utils class WanPipeline2_1(WanPipeline): """Pipeline for WAN 2.1 with a single transformer.""" From f9214aa4fd19ef868e14eaa5e6ba33c14c1c70fd Mon Sep 17 00:00:00 2001 From: Carl Persson Date: Mon, 19 Jan 2026 10:39:12 +0000 Subject: [PATCH 10/12] change mesh names to more accurately reflect sharding --- src/maxdiffusion/common_types.py | 13 ++-- src/maxdiffusion/configs/base_wan_14b.yml | 28 ++++---- src/maxdiffusion/configs/base_wan_27b.yml | 25 ++++---- src/maxdiffusion/max_utils.py | 6 +- src/maxdiffusion/models/attention_flax.py | 64 ++++--------------- 
.../models/wan/autoencoder_kl_wan.py | 4 +- .../pipelines/wan/wan_pipeline_2_1.py | 2 +- .../pipelines/wan/wan_pipeline_2_2.py | 2 +- src/maxdiffusion/pyconfig.py | 4 +- src/maxdiffusion/trainers/wan_trainer.py | 2 +- 10 files changed, 58 insertions(+), 92 deletions(-) diff --git a/src/maxdiffusion/common_types.py b/src/maxdiffusion/common_types.py index 15553727..6e9d5445 100644 --- a/src/maxdiffusion/common_types.py +++ b/src/maxdiffusion/common_types.py @@ -36,6 +36,7 @@ # Physical axis names for device meshes. DATA = "data" FSDP = "fsdp" +CONTEXT = "context" TENSOR = "tensor" # Logical axis names for model parameters and activations. BATCH = "activation_batch" @@ -67,18 +68,18 @@ ### Common axis rules for ring attention ### RING_ATTENTION_AXIS_RULES = [ [SELF_ATTN_HEAD, None], - [SELF_ATTN_Q_LENGTH, FSDP], - [SELF_ATTN_KV_LENGTH, FSDP], + [SELF_ATTN_Q_LENGTH, CONTEXT], + [SELF_ATTN_KV_LENGTH, CONTEXT], [CROSS_ATTN_HEAD, None], - [CROSS_ATTN_Q_LENGTH, FSDP], - [CROSS_ATTN_KV_LENGTH, FSDP], + [CROSS_ATTN_Q_LENGTH, CONTEXT], + [CROSS_ATTN_KV_LENGTH, CONTEXT], ] SEQUENCE_PARALLEL_AXIS_RULES = [ [SELF_ATTN_HEAD, None], - [SELF_ATTN_Q_LENGTH, FSDP], + [SELF_ATTN_Q_LENGTH, CONTEXT], [SELF_ATTN_KV_LENGTH, None], [CROSS_ATTN_HEAD, None], - [CROSS_ATTN_Q_LENGTH, FSDP], + [CROSS_ATTN_Q_LENGTH, CONTEXT], [CROSS_ATTN_KV_LENGTH, None], ] diff --git a/src/maxdiffusion/configs/base_wan_14b.yml b/src/maxdiffusion/configs/base_wan_14b.yml index 26ced51e..390ea3c6 100644 --- a/src/maxdiffusion/configs/base_wan_14b.yml +++ b/src/maxdiffusion/configs/base_wan_14b.yml @@ -151,7 +151,7 @@ hardware: 'tpu' # Supported hardware types are 'tpu', 'gpu' skip_jax_distributed_system: False # Parallelism -mesh_axes: ['data', 'fsdp_batch', 'fsdp', 'tensor'] +mesh_axes: ['data', 'fsdp', 'context', 'tensor'] # batch : batch dimension of data and activations # hidden : @@ -166,33 +166,33 @@ mesh_axes: ['data', 'fsdp_batch', 'fsdp', 'tensor'] # conv_in : conv.shape[2] weight # conv_out : 
conv.shape[-1] weight logical_axis_rules: [ - ['batch', ['data', 'fsdp_batch']], - ['activation_batch', ['data', 'fsdp_batch']], - ['activation_self_attn_heads', ['fsdp', 'tensor']], - ['activation_cross_attn_q_length', ['fsdp', 'tensor']], - ['activation_length', 'fsdp'], + ['batch', ['data', 'fsdp']], + ['activation_batch', ['data', 'fsdp']], + ['activation_self_attn_heads', ['context', 'tensor']], + ['activation_cross_attn_q_length', ['context', 'tensor']], + ['activation_length', 'context'], ['activation_heads', 'tensor'], ['mlp','tensor'], - ['embed', ['fsdp', 'fsdp_batch']], + ['embed', ['context', 'fsdp']], ['heads', 'tensor'], ['norm', 'tensor'], - ['conv_batch', ['data', 'fsdp', 'fsdp_batch']], + ['conv_batch', ['data', 'context', 'fsdp']], ['out_channels', 'tensor'], - ['conv_out', 'fsdp'], + ['conv_out', 'context'], ] -data_sharding: [['data', 'fsdp_batch', 'fsdp', 'tensor']] +data_sharding: [['data', 'fsdp', 'context', 'tensor']] # One axis for each parallelism type may hold a placeholder (-1) # value to auto-shard based on available slices and devices. # By default, product of the DCN axes should equal number of slices # and product of the ICI axes should equal number of devices per slice. 
dcn_data_parallelism: 1 # recommended DCN axis to be auto-sharded -dcn_fsdp_batch_parallelism: 1 -dcn_fsdp_parallelism: -1 +dcn_fsdp_parallelism: 1 +dcn_context_parallelism: -1 dcn_tensor_parallelism: 1 ici_data_parallelism: 1 -ici_fsdp_batch_parallelism: 1 -ici_fsdp_parallelism: -1 # recommended ICI axis to be auto-sharded +ici_fsdp_parallelism: 1 +ici_context_parallelism: -1 # recommended ICI axis to be auto-sharded ici_tensor_parallelism: 1 allow_split_physical_axes: False diff --git a/src/maxdiffusion/configs/base_wan_27b.yml b/src/maxdiffusion/configs/base_wan_27b.yml index cff70a94..7da63c84 100644 --- a/src/maxdiffusion/configs/base_wan_27b.yml +++ b/src/maxdiffusion/configs/base_wan_27b.yml @@ -139,7 +139,7 @@ hardware: 'tpu' # Supported hardware types are 'tpu', 'gpu' skip_jax_distributed_system: False # Parallelism -mesh_axes: ['data', 'fsdp', 'tensor'] +mesh_axes: ['data', 'fsdp', 'context', 'tensor'] # batch : batch dimension of data and activations # hidden : @@ -154,30 +154,33 @@ mesh_axes: ['data', 'fsdp', 'tensor'] # conv_in : conv.shape[2] weight # conv_out : conv.shape[-1] weight logical_axis_rules: [ - ['batch', 'data'], - ['activation_batch', 'data'], - ['activation_length', 'fsdp'], - + ['batch', ['data', 'fsdp']], + ['activation_batch', ['data', 'fsdp']], + ['activation_length', 'context'], + ['activation_self_attn_heads', ['context', 'tensor']], + ['activation_cross_attn_q_length', ['context', 'tensor']], ['activation_heads', 'tensor'], ['mlp','tensor'], - ['embed','fsdp'], + ['embed', ['context', 'fsdp']], ['heads', 'tensor'], ['norm', 'tensor'], - ['conv_batch', ['data','fsdp']], + ['conv_batch', ['data', 'context', 'fsdp']], ['out_channels', 'tensor'], - ['conv_out', 'fsdp'], + ['conv_out', 'context'], ] -data_sharding: [['data', 'fsdp', 'tensor']] +data_sharding: [['data', 'fsdp', 'context', 'tensor']] # One axis for each parallelism type may hold a placeholder (-1) # value to auto-shard based on available slices and devices. 
# By default, product of the DCN axes should equal number of slices # and product of the ICI axes should equal number of devices per slice. dcn_data_parallelism: 1 # recommended DCN axis to be auto-sharded -dcn_fsdp_parallelism: -1 +dcn_fsdp_parallelism: 1 +dcn_context_parallelism: -1 dcn_tensor_parallelism: 1 ici_data_parallelism: 1 -ici_fsdp_parallelism: -1 # recommended ICI axis to be auto-sharded +ici_fsdp_parallelism: 1 +ici_context_parallelism: -1 # recommended ICI axis to be auto-sharded ici_tensor_parallelism: 1 allow_split_physical_axes: False diff --git a/src/maxdiffusion/max_utils.py b/src/maxdiffusion/max_utils.py index 9ca25056..9cf8bd98 100644 --- a/src/maxdiffusion/max_utils.py +++ b/src/maxdiffusion/max_utils.py @@ -268,17 +268,17 @@ def create_device_mesh(config, devices=None, logging=True): max_logging.log(f"Devices: {devices} (num_devices: {num_devices})") multi_slice_env = num_slices > 1 - if "dcn_fsdp_batch_parallelism" in config.get_keys(): + if "dcn_context_parallelism" in config.get_keys() and "ici_context_parallelism" in config.get_keys(): dcn_parallelism = [ config.dcn_data_parallelism, - config.dcn_fsdp_batch_parallelism, config.dcn_fsdp_parallelism, + config.dcn_context_parallelism, config.dcn_tensor_parallelism, ] ici_parallelism = [ config.ici_data_parallelism, - config.ici_fsdp_batch_parallelism, config.ici_fsdp_parallelism, + config.ici_context_parallelism, config.ici_tensor_parallelism, ] else: diff --git a/src/maxdiffusion/models/attention_flax.py b/src/maxdiffusion/models/attention_flax.py index f6132c0d..ab756133 100644 --- a/src/maxdiffusion/models/attention_flax.py +++ b/src/maxdiffusion/models/attention_flax.py @@ -253,7 +253,7 @@ def _tpu_flash_attention( block_kv_dq=None if attention_kernel == "tokamax_flash" else min(kv_max_block_size, query.shape[2]), use_fused_bwd_kernel=True if attention_kernel == "tokamax_flash" else False, ) - num_fsdp_shards = mesh.shape["fsdp"] + num_context_shards = mesh.shape["context"] query = 
_reshape_data_for_flash(query, heads) key = _reshape_data_for_flash(key, heads) value = _reshape_data_for_flash(value, heads) @@ -347,21 +347,21 @@ def wrap_flash_attention(query, key, value): if attention_kernel in ["flash", "tokamax_flash"]: attention_output = vmapped_splash(query, key, value, segment_ids) else: - if num_fsdp_shards > 1: + if num_context_shards > 1: out, (lse,) = vmapped_splash(query, key, value, segment_ids) m = lse.astype(jnp.float32) l = jnp.exp(lse - m) o = out.astype(jnp.float32) * l[..., None] - perm = [(j, (j + 1) % num_fsdp_shards) for j in range(num_fsdp_shards)] + perm = [(j, (j + 1) % num_context_shards) for j in range(num_context_shards)] - k1 = jax.lax.ppermute(key, axis_name=fsdp_key, perm=perm) - v1 = jax.lax.ppermute(value, axis_name=fsdp_key, perm=perm) + k1 = jax.lax.ppermute(key, axis_name="context", perm=perm) + v1 = jax.lax.ppermute(value, axis_name="context", perm=perm) def ring_scan_body(carry, _): m, l, o, k_current, v_current = carry - k_next = jax.lax.ppermute(k_current, axis_name=fsdp_key, perm=perm) - v_next = jax.lax.ppermute(v_current, axis_name=fsdp_key, perm=perm) + k_next = jax.lax.ppermute(k_current, axis_name="context", perm=perm) + v_next = jax.lax.ppermute(v_current, axis_name="context", perm=perm) out_chunk, (lse_chunk,) = vmapped_splash(query, k_current, v_current, segment_ids) @@ -380,58 +380,20 @@ def ring_scan_body(carry, _): return (m, l, o, k_next, v_next), None initial_carry = (m, l, o, k1, v1) - (m_final, l_final, o_final, _, _), _ = jax.lax.scan(ring_scan_body, initial_carry, None, length=num_fsdp_shards - 1) + (m_final, l_final, o_final, _, _), _ = jax.lax.scan(ring_scan_body, initial_carry, None, length=num_context_shards - 1) attention_output = o_final / l_final[..., None] else: - if num_fsdp_shards > 1: - out, (lse,) = vmapped_splash(query, key, value, segment_ids) - m = lse.astype(jnp.float32) - l = jnp.exp(lse - m) - o = out.astype(jnp.float32) * l[..., None] - - perm = [(j, (j + 1) % 
num_fsdp_shards) for j in range(num_fsdp_shards)] - - k1 = jax.lax.ppermute(key, axis_name="fsdp", perm=perm) - v1 = jax.lax.ppermute(value, axis_name="fsdp", perm=perm) - - def ring_scan_body(carry, _): - m, l, o, k_current, v_current = carry - k_next = jax.lax.ppermute(k_current, axis_name="fsdp", perm=perm) - v_next = jax.lax.ppermute(v_current, axis_name="fsdp", perm=perm) - - out_chunk, (lse_chunk,) = vmapped_splash(query, k_current, v_current, segment_ids) - - m_chunk = lse_chunk.astype(jnp.float32) - m_old = m - m = jnp.maximum(m_old, m_chunk) - - exp_m_diff = jnp.exp(m_old - m) - exp_m_chunk_diff = jnp.exp(m_chunk - m) - - l = l * exp_m_diff + jnp.exp(lse_chunk - m) - o = o * exp_m_diff[..., None] - o += exp_m_chunk_diff[..., None] * out_chunk.astype(jnp.float32) - - # Return the updated state for the next iteration - return (m, l, o, k_next, v_next), None - - initial_carry = (m, l, o, k1, v1) - (m_final, l_final, o_final, _, _), _ = jax.lax.scan(ring_scan_body, initial_carry, None, length=num_fsdp_shards - 1) - - attention_output = o_final / l_final[..., None] - else: - raise ValueError("ring attention requires fsdp > 1") - + raise ValueError("ring attention requires context > 1") return attention_output[:, :, :query_seq_len, :kv_size].astype(query.dtype) - devices_in_data_fsdp = mesh.shape["data"] * mesh.shape["fsdp"] + devices_in_data_context = mesh.shape["data"] * mesh.shape["context"] # This warning might show up when doing model eval for example, when calculating model flops # and that is expected. 
- if not (query.shape[0] / devices_in_data_fsdp).is_integer(): + if not (query.shape[0] / devices_in_data_context).is_integer(): max_logging.log( - "Warning, batch dimension should be shardable among the devices in data and fsdp" - f" axis, batch dimension: {query.shape[0]}, devices_in_data_fsdp: {devices_in_data_fsdp}" + "Warning, batch dimension should be shardable among the devices in data and context" + f" axis, batch dimension: {query.shape[0]}, devices_in_data_context: {devices_in_data_context}" ) x = wrap_flash_attention(query, key, value) x = _reshape_heads_to_head_dim(x) diff --git a/src/maxdiffusion/models/wan/autoencoder_kl_wan.py b/src/maxdiffusion/models/wan/autoencoder_kl_wan.py index b21fce19..7f28a455 100644 --- a/src/maxdiffusion/models/wan/autoencoder_kl_wan.py +++ b/src/maxdiffusion/models/wan/autoencoder_kl_wan.py @@ -73,9 +73,9 @@ def __init__( self._depth_padding_before = self._causal_padding[1][0] # 2 * padding_tuple[0] # Set sharding dynamically based on out_channels. 
- num_fsdp_axis_devices = mesh.shape["fsdp"] + num_context_axis_devices = mesh.shape["context"] kernel_sharding = (None, None, None, None, None) - if out_channels % num_fsdp_axis_devices == 0: + if out_channels % num_context_axis_devices == 0: kernel_sharding = (None, None, None, None, "conv_out") self.conv = nnx.Conv( diff --git a/src/maxdiffusion/pipelines/wan/wan_pipeline_2_1.py b/src/maxdiffusion/pipelines/wan/wan_pipeline_2_1.py index e5f878af..c1146587 100644 --- a/src/maxdiffusion/pipelines/wan/wan_pipeline_2_1.py +++ b/src/maxdiffusion/pipelines/wan/wan_pipeline_2_1.py @@ -117,7 +117,7 @@ def __call__( # Set the TE shard_guard context_manager if using TE cudnn_flash attention if self.config.attention == "cudnn_flash_te": from transformer_engine.jax.sharding import global_shard_guard, MeshResource # pytype: disable=import-error - shard_guard = global_shard_guard(MeshResource(cp_resource="fsdp")) + shard_guard = global_shard_guard(MeshResource(cp_resource="context")) else: shard_guard = nullcontext() diff --git a/src/maxdiffusion/pipelines/wan/wan_pipeline_2_2.py b/src/maxdiffusion/pipelines/wan/wan_pipeline_2_2.py index b9eba362..c7124869 100644 --- a/src/maxdiffusion/pipelines/wan/wan_pipeline_2_2.py +++ b/src/maxdiffusion/pipelines/wan/wan_pipeline_2_2.py @@ -131,7 +131,7 @@ def __call__( # Set the TE shard_guard context_manager if using TE cudnn_flash attention if self.config.attention == "cudnn_flash_te": from transformer_engine.jax.sharding import global_shard_guard, MeshResource # pytype: disable=import-error - shard_guard = global_shard_guard(MeshResource(cp_resource="fsdp")) + shard_guard = global_shard_guard(MeshResource(cp_resource="context")) else: shard_guard = nullcontext() diff --git a/src/maxdiffusion/pyconfig.py b/src/maxdiffusion/pyconfig.py index 8df2e1e7..ae7f68a0 100644 --- a/src/maxdiffusion/pyconfig.py +++ b/src/maxdiffusion/pyconfig.py @@ -200,8 +200,8 @@ def user_init(raw_keys): logical_axis_rules = 
list(raw_keys["logical_axis_rules"]) max_logging.log(f"Initial logical axis rules: {logical_axis_rules}") new_rules = [] - q_seq_sharding = (LENGTH, "fsdp") - kv_seq_sharding = (KV_LENGTH, "fsdp") + q_seq_sharding = (LENGTH, "context") + kv_seq_sharding = (KV_LENGTH, "context") if q_seq_sharding not in logical_axis_rules: logical_axis_rules.append(q_seq_sharding) if kv_seq_sharding not in logical_axis_rules: diff --git a/src/maxdiffusion/trainers/wan_trainer.py b/src/maxdiffusion/trainers/wan_trainer.py index 7a3d1bee..50704ea5 100644 --- a/src/maxdiffusion/trainers/wan_trainer.py +++ b/src/maxdiffusion/trainers/wan_trainer.py @@ -370,7 +370,7 @@ def training_loop(self, pipeline, optimizer, learning_rate_scheduler, train_data # Designate the context parallel axis for sharding if self.config.attention == "cudnn_flash_te": from transformer_engine.jax.sharding import global_shard_guard, MeshResource # pytype: disable=import-error - shard_guard = global_shard_guard(MeshResource(cp_resource="fsdp")) + shard_guard = global_shard_guard(MeshResource(cp_resource="context")) else: shard_guard = nullcontext() From 87f04f4bc7805193d0184f4902bca01ee38b4552 Mon Sep 17 00:00:00 2001 From: Carl Persson Date: Tue, 20 Jan 2026 16:09:52 +0000 Subject: [PATCH 11/12] cleanup --- src/maxdiffusion/configs/base_wan_27b.yml | 2 +- src/maxdiffusion/models/wan/autoencoder_kl_wan.py | 5 ++++- .../models/wan/transformers/transformer_wan.py | 7 +++---- src/maxdiffusion/pyconfig.py | 4 ++-- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/maxdiffusion/configs/base_wan_27b.yml b/src/maxdiffusion/configs/base_wan_27b.yml index 7da63c84..f2839cba 100644 --- a/src/maxdiffusion/configs/base_wan_27b.yml +++ b/src/maxdiffusion/configs/base_wan_27b.yml @@ -156,9 +156,9 @@ mesh_axes: ['data', 'fsdp', 'context', 'tensor'] logical_axis_rules: [ ['batch', ['data', 'fsdp']], ['activation_batch', ['data', 'fsdp']], - ['activation_length', 'context'], ['activation_self_attn_heads', 
['context', 'tensor']], ['activation_cross_attn_q_length', ['context', 'tensor']], + ['activation_length', 'context'], ['activation_heads', 'tensor'], ['mlp','tensor'], ['embed', ['context', 'fsdp']], diff --git a/src/maxdiffusion/models/wan/autoencoder_kl_wan.py b/src/maxdiffusion/models/wan/autoencoder_kl_wan.py index 7f28a455..c8c54e9a 100644 --- a/src/maxdiffusion/models/wan/autoencoder_kl_wan.py +++ b/src/maxdiffusion/models/wan/autoencoder_kl_wan.py @@ -28,7 +28,10 @@ BlockSizes = common_types.BlockSizes CACHE_T = 2 -flax.config.update('flax_always_shard_variable', False) +try: + flax.config.update('flax_always_shard_variable', False) +except: + pass # Helper to ensure kernel_size, stride, padding are tuples of 3 integers def _canonicalize_tuple(x: Union[int, Sequence[int]], rank: int, name: str) -> Tuple[int, ...]: diff --git a/src/maxdiffusion/models/wan/transformers/transformer_wan.py b/src/maxdiffusion/models/wan/transformers/transformer_wan.py index 2693c7c4..c52deb46 100644 --- a/src/maxdiffusion/models/wan/transformers/transformer_wan.py +++ b/src/maxdiffusion/models/wan/transformers/transformer_wan.py @@ -378,10 +378,9 @@ def __call__( encoder_attention_mask: Optional[jax.Array] = None, ): with self.conditional_named_scope("transformer_block"): - with self.conditional_named_scope("adaln"): - shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = jnp.split( - (self.adaln_scale_shift_table + temb.astype(jnp.float32)), 6, axis=1 - ) + shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = jnp.split( + (self.adaln_scale_shift_table + temb.astype(jnp.float32)), 6, axis=1 + ) axis_names = nn.logical_to_mesh_axes(("activation_batch", "activation_length", "activation_heads")) hidden_states = jax.lax.with_sharding_constraint(hidden_states, axis_names) hidden_states = checkpoint_name(hidden_states, "hidden_states") diff --git a/src/maxdiffusion/pyconfig.py b/src/maxdiffusion/pyconfig.py index ae7f68a0..c39443ce 100644 --- 
a/src/maxdiffusion/pyconfig.py +++ b/src/maxdiffusion/pyconfig.py @@ -195,8 +195,8 @@ def user_init(raw_keys): raw_keys["logical_axis_rules"] = _lists_to_tuples(raw_keys["logical_axis_rules"]) # Verify qkv is sharded across sequence. - if "ring" in raw_keys["attention"] or raw_keys["attention_sharding_uniform"]: - max_logging.log(f"Adding sequence sharding to q and kv if not already present because '{raw_keys['attention']}' contains 'ring' or {raw_keys['attention_sharding_uniform']} is set.") + if raw_keys["attention"] == "ring" or raw_keys["attention_sharding_uniform"]: + max_logging.log(f"Adding sequence sharding to q and kv if not already present because {raw_keys['attention']}=='ring' or {raw_keys['attention_sharding_uniform']} is set.") logical_axis_rules = list(raw_keys["logical_axis_rules"]) max_logging.log(f"Initial logical axis rules: {logical_axis_rules}") new_rules = [] From 42969af4d61fcee1aaeb96c1659dfed8cfb9f575 Mon Sep 17 00:00:00 2001 From: Carl Persson Date: Tue, 20 Jan 2026 17:05:18 +0000 Subject: [PATCH 12/12] fix lint errors --- src/maxdiffusion/generate_wan.py | 2 +- src/maxdiffusion/loaders/lora_conversion_utils.py | 4 ++-- src/maxdiffusion/models/attention_flax.py | 3 +-- src/maxdiffusion/models/wan/autoencoder_kl_wan.py | 2 +- src/maxdiffusion/models/wan/transformers/transformer_wan.py | 1 - src/maxdiffusion/pipelines/pipeline_flax_utils.py | 2 +- src/maxdiffusion/train_wan.py | 2 +- 7 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/maxdiffusion/generate_wan.py b/src/maxdiffusion/generate_wan.py index 7ed4d5a3..090bd820 100644 --- a/src/maxdiffusion/generate_wan.py +++ b/src/maxdiffusion/generate_wan.py @@ -250,7 +250,7 @@ def main(argv: Sequence[str]) -> None: pyconfig.initialize(argv) try: flax.config.update("flax_always_shard_variable", False) - except: + except LookupError: pass run(pyconfig.config) diff --git a/src/maxdiffusion/loaders/lora_conversion_utils.py b/src/maxdiffusion/loaders/lora_conversion_utils.py index 
5f9e72a6..534a440d 100644 --- a/src/maxdiffusion/loaders/lora_conversion_utils.py +++ b/src/maxdiffusion/loaders/lora_conversion_utils.py @@ -391,7 +391,7 @@ def _convert_to_ai_toolkit_cat(sds_sd, ait_sd, sds_key, ait_keys, dims=None): ait_up_keys = [k + ".lora_B.weight" for k in ait_keys] if not is_sparse: # down_weight is copied to each split - ait_sd.update({k: down_weight for k in ait_down_keys}) + ait_sd.update(dict.fromkeys(ait_down_keys, down_weight)) # up_weight is split to each split ait_sd.update({k: v for k, v in zip(ait_up_keys, torch.split(up_weight, dims, dim=0))}) # noqa: C416 @@ -534,7 +534,7 @@ def handle_qkv(sds_sd, ait_sd, sds_key, ait_keys, dims=None): ait_up_keys = [k + ".lora_B.weight" for k in ait_keys] # down_weight is copied to each split - ait_sd.update({k: down_weight for k in ait_down_keys}) + ait_sd.update(dict.fromkeys(ait_down_keys, down_weight)) # up_weight is split to each split ait_sd.update({k: v for k, v in zip(ait_up_keys, torch.split(up_weight, dims, dim=0))}) # noqa: C416 diff --git a/src/maxdiffusion/models/attention_flax.py b/src/maxdiffusion/models/attention_flax.py index ab756133..470f8a69 100644 --- a/src/maxdiffusion/models/attention_flax.py +++ b/src/maxdiffusion/models/attention_flax.py @@ -20,7 +20,6 @@ from flax import nnx import jax from jax.ad_checkpoint import checkpoint_name -from jax.sharding import PartitionSpec import jax.numpy as jnp from jax.experimental import shard_map from jax.experimental.pallas.ops.tpu.splash_attention import splash_attention_mask @@ -489,7 +488,7 @@ def _cudnn_flash_attention(query: Array, key: Array, value: Array, heads: int, m query = jax.lax.with_sharding_constraint(query, axis_names) key = jax.lax.with_sharding_constraint(key, axis_names) value = jax.lax.with_sharding_constraint(value, axis_names) - + out = dpa_layer(query, key, value, mask=None) return _reshape_data_from_cudnn_flash(out) diff --git a/src/maxdiffusion/models/wan/autoencoder_kl_wan.py 
b/src/maxdiffusion/models/wan/autoencoder_kl_wan.py index c8c54e9a..107899da 100644 --- a/src/maxdiffusion/models/wan/autoencoder_kl_wan.py +++ b/src/maxdiffusion/models/wan/autoencoder_kl_wan.py @@ -30,7 +30,7 @@ CACHE_T = 2 try: flax.config.update('flax_always_shard_variable', False) -except: +except LookupError: pass # Helper to ensure kernel_size, stride, padding are tuples of 3 integers diff --git a/src/maxdiffusion/models/wan/transformers/transformer_wan.py b/src/maxdiffusion/models/wan/transformers/transformer_wan.py index c52deb46..0390c6fd 100644 --- a/src/maxdiffusion/models/wan/transformers/transformer_wan.py +++ b/src/maxdiffusion/models/wan/transformers/transformer_wan.py @@ -19,7 +19,6 @@ import math import jax import jax.numpy as jnp -from jax.sharding import PartitionSpec from jax.ad_checkpoint import checkpoint_name from flax import nnx import flax.linen as nn diff --git a/src/maxdiffusion/pipelines/pipeline_flax_utils.py b/src/maxdiffusion/pipelines/pipeline_flax_utils.py index 8507d96e..da3a755b 100644 --- a/src/maxdiffusion/pipelines/pipeline_flax_utils.py +++ b/src/maxdiffusion/pipelines/pipeline_flax_utils.py @@ -473,7 +473,7 @@ def load_module(name, value): class_obj = import_flax_or_no_model(pipeline_module, class_name) importable_classes = ALL_IMPORTABLE_CLASSES - class_candidates = {c: class_obj for c in importable_classes.keys()} + class_candidates = dict.fromkeys(importable_classes.keys(), class_obj) else: # else we just import it from the library. diff --git a/src/maxdiffusion/train_wan.py b/src/maxdiffusion/train_wan.py index cc246797..2a289dfe 100644 --- a/src/maxdiffusion/train_wan.py +++ b/src/maxdiffusion/train_wan.py @@ -37,7 +37,7 @@ def main(argv: Sequence[str]) -> None: max_logging.log(f"Found {jax.device_count()} devices.") try: flax.config.update("flax_always_shard_variable", False) - except: + except LookupError: pass train(config)