diff --git a/CLAUDE.md b/CLAUDE.md index 770f345..6c9ebfc 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,6 +1,8 @@ # ltx.cpp -C++ inference engine for LTX-Video (LTX 2.3) — text-to-video and image-to-video generation using GGML backends (Metal, CUDA, CPU). +C++ inference engine for LTX-Video (LTX 2.3) — text-to-video, image-to-video, and **audio-video (AV)** generation using GGML backends (Metal, CUDA, CPU). + +**Branch `audio-video`**: same DiT sees concatenated video+audio latent; one denoise loop; output is video frames + WAV. See `docs/AV_PIPELINE.md` and README “Audio-video (AV)” section. ## Build @@ -23,15 +25,17 @@ cmake -B build -DLTX_HIP=ON # ROCm/AMD ## Models -Download with `./models.sh` (requires `huggingface-cli`): +Download with `./models.sh` (requires `curl` or `wget`): ```bash -./models.sh # DiT Q4_K_M + T5 Q8_0 + VAE + extras +./models.sh # Dev DiT (default) + T5 + VAE + extras +./models.sh --distilled # Distilled DiT (few-step 4–8, CFG=1) from same repo ./models.sh --minimal # DiT + T5 + VAE only ./models.sh --quant Q8_0 # different DiT quant ``` Models land flat under `models/`. Key files: -- `models/ltx-2.3-22b-dev-Q4_K_M.gguf` — DiT weights +- `models/ltx-2.3-22b-dev-Q4_K_M.gguf` — Dev DiT (default) +- `models/ltx-2.3-22b-distilled-Q4_K_M.gguf` — Distilled DiT (with `--distilled`) - `models/ltx-2.3-22b-dev_video_vae.safetensors` — VAE - `models/t5-v1_1-xxl-encoder-Q8_0.gguf` — T5 text encoder @@ -47,9 +51,14 @@ build/ltx-generate \ --steps 20 --out output/frame ``` +**Audio-video (AV):** add `--av` and optionally `--out-wav path.wav` to get video frames + WAV from the same run. Mux with ffmpeg: `ffmpeg -framerate 24 -i out_%04d.ppm -i out.wav -c:v libx264 -c:a aac -shortest out.mp4`. 
+ Useful flags: - `-v` — verbose per-step logging - `--perf` — print CPU%/RSS/free-RAM/GPU-MB to stderr every 10 s +- `--av` — enable audio+video path (concat latent → DiT → split → decode both) +- `--audio-vae path` — optional; for full audio VAE decoder when implemented +- `--out-wav path` — WAV output when `--av` (default: `{out_prefix}.wav`) - `--start-frame img.png` — image-to-video (I2V) - `--end-frame img.png` — keyframe interpolation - `--seed N`, `--cfg F`, `--shift F`, `--threads N` @@ -65,14 +74,15 @@ BIN=build_debug/ltx-generate bash scripts/test-gpu-migration.sh | File | Purpose | |------|---------| -| `src/ltx-generate.cpp` | Main binary: arg parsing, model loading, denoising loop | -| `src/ltx_dit.hpp` | DiT transformer (forward pass, block loop, Metal/CPU paths) | +| `src/ltx-generate.cpp` | Main binary: arg parsing, model loading, denoising loop; AV path (concat/split, WAV output) | +| `src/ltx_dit.hpp` | DiT transformer (forward pass, block loop); `patchify_audio` / `unpatchify_audio` for AV | | `src/video_vae.hpp` | VAE encoder/decoder (safetensors) | | `src/t5_encoder.hpp` | T5-XXL text encoder (GGUF) | | `src/scheduler.hpp` | RF flow scheduler (timesteps, Euler step, CFG) | | `src/ltx_perf.hpp` | Background perf monitor thread (CPU/RAM stats) | | `src/ltx_common.hpp` | Shared macros (`LTX_LOG`, `LTX_ERR`), GGML helpers | | `src/safetensors_loader.cpp` | safetensors file loader | +| `docs/AV_PIPELINE.md` | AV pipeline design (token concat, shapes, CLI) | ## Architecture notes | Variable | Default | Effect | |----------|---------|--------| | `LTX_MIGRATE_MAX_TENSOR_MB` | `6144` | Max single-tensor size for GPU migration | + +## Branch: audio-video + +- **AV path**: with `--av`, video and audio latents are patchified, concatenated (video then audio tokens), passed through one DiT forward, then split; Euler step on both; video decoded with existing VAE, audio turned into WAV via 
a latent→waveform fallback. +- **Full audio VAE** (safetensors decoder) is not yet implemented; audio quality uses the fallback. See `docs/AV_PIPELINE.md` and `DEV.md` §5. diff --git a/DEV.md b/DEV.md index 491fb49..b3f39bb 100644 --- a/DEV.md +++ b/DEV.md @@ -16,7 +16,8 @@ navigate the known limitations. - [Build configurations](#build-configurations) - [Obtaining model files](#obtaining-model-files) 4. [End-to-end data flow](#4-end-to-end-data-flow) -5. [Source file reference](#5-source-file-reference) +5. [Audio-video (AV) pipeline](#5-audio-video-av-pipeline) +6. [Source file reference](#6-source-file-reference) - [ltx\_common.hpp](#ltx_commonhpp) - [scheduler.hpp](#schedulerhpp) - [t5\_encoder.hpp](#t5_encoderhpp) @@ -25,25 +26,25 @@ navigate the known limitations. - [ltx-generate.cpp](#ltx-generatecpp) - [ltx-quantize.cpp](#ltx-quantizecpp) - [convert.py](#convertpy) -6. [GGUF model format conventions](#6-gguf-model-format-conventions) +7. [GGUF model format conventions](#7-gguf-model-format-conventions) - [DiT GGUF](#dit-gguf) - [VAE GGUF](#vae-gguf) - [T5 GGUF](#t5-gguf) -7. [Image-to-video (I2V) design](#7-image-to-video-i2v-design) +8. [Image-to-video (I2V) design](#8-image-to-video-i2v-design) - [VaeEncoder](#vaeencoder) - [Frame-conditioning schedule](#frame-conditioning-schedule) - [Hard-pinning at t=0](#hard-pinning-at-t0) -8. [Key algorithms and design decisions](#8-key-algorithms-and-design-decisions) +9. [Key algorithms and design decisions](#9-key-algorithms-and-design-decisions) - [Rectified Flow (RF) scheduling](#rectified-flow-rf-scheduling) - [Classifier-free guidance](#classifier-free-guidance) - [Patchify / unpatchify](#patchify--unpatchify) - [Latent dimension formulas](#latent-dimension-formulas) - [Tokenizer](#tokenizer) -9. [Adding a new backend (GPU/Metal/Vulkan)](#9-adding-a-new-backend-gpumetalvulkan) -10. [Known limitations and open tasks](#10-known-limitations-and-open-tasks) -11. [Coding conventions](#11-coding-conventions) -12. 
[Testing](#12-testing) -13. [Contributing](#13-contributing) +10. [Adding a new backend (GPU/Metal/Vulkan)](#10-adding-a-new-backend-gpumetalvulkan) +11. [Known limitations and open tasks](#11-known-limitations-and-open-tasks) +12. [Coding conventions](#12-coding-conventions) +13. [Testing](#13-testing) +14. [Contributing](#14-contributing) --- @@ -90,6 +91,9 @@ ltx.cpp/ ├── checkpoints.sh Download raw HF safetensors checkpoints ├── models.sh Download pre-quantised GGUF models from Unsloth/HF ├── quantize.sh Shell wrapper: run ltx-quantize on all BF16 GGUFs +├── docs/ +│ ├── AV_PIPELINE.md Audio-video pipeline design (token concat, shapes, CLI) +│ └── LTX_COMFY_REFERENCE.md ComfyUI workflow reference │ └── ggml/ Git submodule — GGML tensor library ``` @@ -246,12 +250,32 @@ CLI args → start_lat / end_lat [H_lat × W_lat × 128] These latents are blended into the live denoising latent after each Euler step - (see §7 for the full schedule). + (see §8 for the full schedule). ``` --- -## 5. Source file reference +## 5. Audio-video (AV) pipeline + +**Branch: `audio-video`.** The LTX 2.3 GGUF DiT is a full **audio-video** model: it expects a single sequence of **concatenated video + audio** tokens and outputs a combined velocity that is split back into video and audio. + +**Data flow when `--av`:** + +1. **Latent init**: Video latent `[T_lat, H_lat, W_lat, C]` (as today) plus audio latent `[T_lat, 8, 16]` (C_audio=8, mel_bins=16), both filled with noise. +2. **Per step**: + - `patchify()` → video tokens `[n_video_tok, 128]`; `patchify_audio()` → audio tokens `[T_lat, 128]`. + - Concat → `[n_video_tok + T_lat, 128]` = `[n_tok_total, Pd]`. + - `LtxDiT::forward(combined, n_tok_total, …)` → combined velocity. + - Split: first `n_video_tok` tokens → video velocity; remainder → audio velocity. + - Unpatchify both; Euler step on video latent and on audio latent. + - (Optional) frame conditioning on video only (unchanged). +3. **Decode**: Video VAE decode → PPM frames (unchanged). 
Audio: denoised audio latent → waveform via a **latent-to-waveform** path (fake mel + overlap-add sinusoids) → 16-bit WAV (16 kHz). A full **audio VAE decoder** (safetensors) can be integrated later for higher-quality audio. + +**Code:** `patchify_audio` / `unpatchify_audio` in `ltx_dit.hpp`; combined patch buffer, split, and dual Euler step in `ltx-generate.cpp`; `write_wav()` and `latent_to_waveform()` in `ltx-generate.cpp`. Design details: [docs/AV_PIPELINE.md](docs/AV_PIPELINE.md). + +--- + +## 6. Source file reference ### `ltx_common.hpp` @@ -332,7 +356,7 @@ Weights layout expected in the GGUF (prefix `vae.decoder.*`): `decode(latents, T_lat, H_lat, W_lat)` runs a simplified per-frame 2-D decode with nearest-neighbour temporal upsampling. Full causal 3-D conv decode is a -planned improvement (see §10). +planned improvement (see §11). #### `VaeEncoder` @@ -368,10 +392,12 @@ model.diffusion_model.norm_out.linear.{weight,bias} ``` Fallback names with prefix `dit.*` are also tried. +**Audio (AV pipeline)**: `patchify_audio(lat, T, C, F)` and `unpatchify_audio(tok, T, C, F)` in the same header convert audio latent `[T, 8, 16]` ↔ `[T, 128]` tokens for concatenation with video tokens before the single DiT forward. + **Forward pass** (per call to `LtxDiT::forward()`): 1. Sinusoidal timestep embedding → MLP → `hidden_size` vector 2. AdaLN-single linear → `6 × hidden_size` (scale/shift params; currently - stored but not yet fully applied per-block — see §10) + stored but not yet fully applied per-block — see §11) 3. Patchify projection: `[N_tok, patch_dim]` → `[N_tok, hidden_size]` 4. Caption projection: `[S, 4096]` → `[S, hidden_size]` 5. N × transformer blocks: @@ -411,11 +437,13 @@ Orchestrates the full inference pipeline. 
| `start_frame_path` | `--start-frame` | `""` (disabled) | | `end_frame_path` | `--end-frame` | `""` (disabled) | | `frame_strength` | `--frame-strength` | 1.0 | +| `av` | `--av` | false (enable audio+video path) | +| `audio_vae_path` | `--audio-vae` | `""` (optional; for full decoder when implemented) | +| `out_wav` | `--out-wav` | `""` (default: `{out_prefix}.wav` when `--av`) | | `threads` | `--threads` | 4 | | `verbose` | `-v` | false | -**Output**: frames are written as `{out_prefix}_{NNNN}.ppm`. The output -directory is created automatically (including intermediate directories). +**Output**: frames are written as `{out_prefix}_{NNNN}.ppm`. When `--av`, a WAV file is also written (default `{out_prefix}.wav`). The output directory is created automatically (including intermediate directories). --- @@ -430,7 +458,7 @@ Rules: - Everything else → quantised to `target_type` All GGUF KV metadata is copied verbatim. String arrays (e.g. the tokenizer -vocabulary) are not currently copied — this is a known limitation (see §10). +vocabulary) are not currently copied — this is a known limitation (see §11). Supported quant types: `Q4_K_M`, `Q5_K_M`, `Q6_K`, `Q8_0`, `BF16`, `F32`, `F16`. @@ -458,7 +486,7 @@ For T5, the HF tokenizer vocabulary can be embedded into the GGUF via --- -## 6. GGUF model format conventions +## 7. GGUF model format conventions ### DiT GGUF Architecture string: `"t5"` --- -## 7. Image-to-video (I2V) design +## 8. Image-to-video (I2V) design The I2V implementation does not modify the DiT architecture. Instead it works by conditioning the *latent* directly at the boundary frames before and appearance, regardless of any residual denoising drift. --- -## 8. Key algorithms and design decisions +## 9. 
Key algorithms and design decisions ### Rectified Flow (RF) scheduling @@ -587,17 +615,19 @@ The unconditional embedding is computed by encoding the `--neg` prompt ### Patchify / unpatchify -The DiT operates on *tokens*, not on the raw latent volume. The latent +The DiT operates on *tokens*, not on the raw latent volume. The **video** latent `[T_lat, H_lat, W_lat, C]` is chunked into non-overlapping patches of size `(pt=1, ph=2, pw=2)` along the temporal, height, and width dimensions: ``` -patch_dim = pt * ph * pw * C = 1 * 2 * 2 * 128 = 512 +patch_dim = pt * ph * pw * C = 1 * 2 * 2 * 128 = 512 (or 128 for C=32) N_tok = (T_lat/pt) * (H_lat/ph) * (W_lat/pw) ``` -`patchify()` and `unpatchify()` are helper functions called from -`ltx-generate.cpp`. Both are pure memory rearrangements with no arithmetic. +`patchify()` and `unpatchify()` are helper functions in `ltx_dit.hpp` called from +`ltx-generate.cpp`. For the **audio-video** path, `patchify_audio()` and +`unpatchify_audio()` convert audio latent `[T, 8, 16]` to/from `[T, 128]` tokens; +video and audio token sequences are concatenated before the DiT forward and split after. All are pure memory rearrangements with no arithmetic. ### Latent dimension formulas @@ -643,7 +673,7 @@ Scores are written by `convert.py --tokenizer` (via --- -## 9. Backends (GPU: Metal, CUDA, Vulkan, ROCm) +## 10. Backends (GPU: Metal, CUDA, Vulkan, ROCm) We follow the same pattern as [acestep.cpp](https://github.com/ServeurpersoCom/acestep.cpp): **the build command determines the backend**. One backend per build; no platform-specific divergence in code. @@ -658,7 +688,7 @@ The main performance bottleneck is the DiT `forward()` call, which rebuilds a `g --- -## 10. Known limitations and open tasks +## 11. Known limitations and open tasks These are the main areas where the implementation is deliberately simplified and where contributions are most welcome. @@ -677,10 +707,11 @@ and where contributions are most welcome. 
| 10 | **Threading** | `--threads` is parsed but not passed to `ggml_graph_compute_with_ctx` | Wire the thread count through to `ggml_graph_compute_with_ctx(ctx, gf, n_threads)` | | 11 | **Output formats** | Only binary PPM (P6) output | Add JPEG/PNG output via stb_image_write or a similar library | | 12 | **Windows `_mkdir`** | Only one level of directory is created on Windows | Implement recursive mkdir for Windows | +| 13 | **Audio VAE decoder** | With `--av`, audio is synthesized from the denoised latent via a fallback (fake mel + overlap-add); no full audio VAE decode yet | Load `ltx-2.3-22b-dev_audio_vae.safetensors` and implement 2D conv decoder (see docs/AV_PIPELINE.md) | --- -## 11. Coding conventions +## 12. Coding conventions - **Language**: C++17 throughout; no exceptions (use return codes). - **Headers only**: all modules live in `src/*.hpp`. Only the two `main()` @@ -704,7 +735,7 @@ and where contributions are most welcome. --- -## 12. Testing +## 13. Testing There is no formal test suite yet. Validation is currently done by: @@ -726,16 +757,16 @@ There is no formal test suite yet. Validation is currently done by: --- -## 13. Contributing +## 14. Contributing 1. **Fork** the repository and create a branch from `main`. -2. **Read §10** to find where help is most needed. +2. **Read §11** to find where help is most needed. 3. **Keep PRs focused** — one feature or fix per PR. -4. **Match the style** described in §11. +4. **Match the style** described in §12. 5. **Document** any new CLI flag in both `print_usage()` (in `ltx-generate.cpp`) and `README.md`. 6. **Update this file** (`DEV.md`) if you add a new module, change the GGUF - schema, or significantly alter the data flow. + schema, or significantly alter the data flow (e.g. AV pipeline in §5). 7. **No model weights** should ever be committed to the repo. 
For questions, open a GitHub Discussion or issue in the diff --git a/README.md b/README.md index 3cfa232..43244d1 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,8 @@ Portable C++17 inference of **LTX-Video** (Lightricks) using [GGML](https://github.com/ggml-org/ggml) / GGUF. Text-to-video generation runs on CPU, with build-time support for CUDA, ROCm, Metal, and Vulkan (same backend-per-build pattern as [acestep.cpp](https://github.com/ServeurpersoCom/acestep.cpp)). No Python at inference time. +**On the `audio-video` branch**: the same DiT is used for **audio+video** — concatenated video and audio latent tokens, one denoise loop, then split and decode to frames + WAV. See [Audio-video (AV)](#audio-video-av--video--wav-from-the-same-dit) and [docs/AV_PIPELINE.md](docs/AV_PIPELINE.md). + Inspired by [llama.cpp](https://github.com/ggml-org/llama.cpp) and [acestep.cpp](https://github.com/ServeurpersoCom/acestep.cpp). @@ -17,6 +19,7 @@ Inspired by [llama.cpp](https://github.com/ggml-org/llama.cpp) and - Quantised GGUF weights (Q4\_K\_M → Q8\_0 → BF16) - Classifier-free guidance + flow-shift Euler sampler - PPM frame output (pipe to ffmpeg for MP4) +- **Audio-video (AV) pipeline** — same DiT sees concatenated video+audio latent; output is video frames + WAV (see [docs/AV_PIPELINE.md](docs/AV_PIPELINE.md)) - Single `ltx-generate` binary — no Python at runtime --- @@ -70,9 +73,10 @@ Builds two binaries: ```bash pip install huggingface_hub # for hf_hub_download -./models.sh # Q8_0 (~7 GB total) +./models.sh # Dev DiT (default) + T5 + VAE + extras +./models.sh --distilled # Distilled DiT (few-step) instead of dev ./models.sh --quant Q4_K_M # smaller, faster -./models.sh --all # every quant +./models.sh --all # every quant (dev or distilled) ``` Downloads three GGUF files into `models/`: @@ -83,6 +87,8 @@ Downloads three GGUF files into `models/`: | `ltxv-vae-Q8_0.gguf` | CausalVideoVAE | ~400 MB | | `t5-xxl-Q8_0.gguf` | T5-XXL text encoder | ~4.6 GB | +**LTX-2.3 (22B)** 
— All from [unsloth/LTX-2.3-GGUF](https://huggingface.co/unsloth/LTX-2.3-GGUF): **DiT** (dev at repo root, or [distilled/](https://huggingface.co/unsloth/LTX-2.3-GGUF/tree/main/distilled)), **VAE** (`vae/` — video + audio safetensors), **text encoders** (`text_encoders/` — embeddings_connectors for Gemma). Use `./models.sh` for dev (default) or `./models.sh --distilled` for distilled DiT + matching VAE and connectors. See `docs/LTX_COMFY_REFERENCE.md` for the full file list. + ### Option B – Convert from safetensors ```bash @@ -173,6 +179,28 @@ Convert the PPM output frames to MP4: ffmpeg -framerate 24 -i output/frame_%04d.ppm -c:v libx264 -pix_fmt yuv420p output.mp4 ``` +### Audio-video (AV) — video + WAV from the same DiT + +The LTX 2.3 GGUF DiT is a full **audio-video** model: it expects **concatenated video + audio** latent tokens and outputs both. Use `--av` to run the full AV path (same denoise loop, then decode video and synthesize audio). + +```bash +./build/ltx-generate \ + --dit models/ltx-2.3-22b-dev-Q4_K_M.gguf \ + --vae models/ltx-2.3-22b-dev_video_vae.safetensors \ + --t5 models/t5-xxl-Q8_0.gguf \ + --av --out output/av --out-wav output/av.wav \ + --prompt "Ocean waves, seagulls, wind" \ + --frames 25 --height 480 --width 704 --steps 20 --cfg 4.0 +``` + +You get `output/av_0000.ppm` … and `output/av.wav`. Mux video + audio with ffmpeg: + +```bash +ffmpeg -framerate 24 -i output/av_%04d.ppm -i output/av.wav -c:v libx264 -c:a aac -shortest output_av.mp4 +``` + +Design details (token concat, shapes, audio VAE): [docs/AV_PIPELINE.md](docs/AV_PIPELINE.md). 
+ --- ## Command-Line Reference @@ -197,6 +225,11 @@ Generation: --seed RNG seed (default: 42) --out Output frame file prefix (default: output/frame) +Audio-video (AV) pipeline: + --av Enable audio+video (concat latent → DiT → split → decode both) + --audio-vae Audio VAE safetensors (optional with --av; for full decoder when implemented) + --out-wav Output WAV path (default: <out prefix>.wav when --av) + Image-to-video (I2V) conditioning: --start-frame PNG/JPG/BMP/TGA/PPM image: animate from this reference frame --end-frame PNG/JPG/BMP/TGA/PPM image: end at this frame (keyframe interp) diff --git a/docs/AV_PIPELINE.md b/docs/AV_PIPELINE.md new file mode 100644 index 0000000..5e553c0 --- /dev/null +++ b/docs/AV_PIPELINE.md @@ -0,0 +1,67 @@ +# Audio-Video pipeline design (ltx.cpp) + +This document describes how the **combined audio+video** path works so that ltx.cpp can produce both video frames and a WAV from the same GGUF DiT and VAEs used in ComfyUI. + +## Goal + +- **Input**: Text prompt, frame count, resolution (same as today). +- **Output**: Video frames (PPM/PNG) **and** a WAV file (optionally muxed with ffmpeg). +- **Models**: Same stack as [LTX_COMFY_REFERENCE.md](LTX_COMFY_REFERENCE.md): unsloth/LTX-2.3-GGUF DiT, video VAE, **audio VAE** (`ltx-2.3-22b-dev_audio_vae.safetensors`). + +## AV latent layout (token concatenation) + +The LTX 2.3 DiT is an **audio-video** model: it expects a **single sequence of tokens** formed by concatenating **video tokens** and **audio tokens** in that order. Each token has the same dimension **Pd = 128** (from patch_dim = patch_t × patch_h × patch_w × latent_channels; we use 1×2×2×32 or 128 for video, and 8×16=128 for audio). + +- **Video**: Latent shape `[T_lat, H_lat, W_lat, C]` with C=32 (DiT latent_channels). Patch size (1,2,2) → **n_video_tok** = (T_lat/1) × (H_lat/2) × (W_lat/2). +- **Audio**: Latent shape `[T_audio, C_audio, mel_bins]` with C_audio=8, mel_bins=16 (Lightricks `AudioLatentShape`). 
Patchify: `(B, C, T, F) → (B, T, C*F)` → **n_audio_tok** = T_audio. We align **T_audio = T_lat** so that one audio token per latent frame matches the Comfy “same length as video” rule. +- **Combined**: Input to DiT is `[n_video_tok + n_audio_tok, Pd]`; output is the same shape. We **split** the DiT output: first `n_video_tok` tokens → video velocity; remaining `n_audio_tok` tokens → audio velocity. + +References: + +- Comfy: “AV latent: model sees **concatenated video + audio latent**; after sampling, **LTXVSeparateAVLatent** splits back to video and audio.” +- Lightricks: `AudioPatchifier.get_token_count` = `tgt_shape.frames`; `patchify` is `b c t f → b t (c f)` so Pd_audio = C_audio × mel_bins = 128. + +## Pipeline steps + +1. **Latent init**: Video latent `[T_lat, H_lat, W_lat, C]` and audio latent `[T_lat, C_audio, mel_bins]` (i.e. `[T_lat, 8, 16]`) filled with noise. +2. **Per step**: + - Patchify video → `[n_video_tok, 128]`; patchify audio → `[n_audio_tok, 128]`. + - Concat → `[n_video_tok + n_audio_tok, 128]`. + - DiT forward on combined sequence. + - Split output: video part → unpatchify → video velocity; audio part → unpatchify_audio → audio velocity. + - Euler step on video latent; Euler step on audio latent. + - (Optional) Frame conditioning on video as today (start/end frame pinning). +3. **Decode**: + - **Video**: Existing video VAE decode → frames → write PPM/PNG. + - **Audio**: Audio VAE decode (latent → spectrogram) → vocoder or Griffin–Lim (spectrogram → waveform) → write WAV. + +## Audio VAE + +- **File**: `ltx-2.3-22b-dev_audio_vae.safetensors` (unsloth/LTX-2.3-GGUF, `vae/`). +- **Role**: Decoder maps audio latent `[B, C, T, F]` (C=8, F=16, T=T_lat) to spectrogram (e.g. `[B, 1, T×4, 64]` mel bins; exact from Lightricks `LATENT_DOWNSAMPLE_FACTOR=4` and `_adjust_output_shape`). +- **Implementation**: 2D conv decoder (conv_in → mid block → up blocks → norm_out → conv_out), loaded from safetensors; tensor names follow the Lightricks Decoder (e.g. 
`decoder.conv_in.*`, `decoder.mid.*`, `decoder.up.*`, `decoder.norm_out`, `decoder.conv_out`). Our loader may add a `vae.` prefix. + +## Spectrogram → WAV + +- **Preferred**: Full vocoder (spectrogram → waveform) as in Lightricks; if not available in C++, use **Griffin–Lim** (inverse STFT from magnitude) for a first milestone. +- **Params**: sample_rate 16000, hop_length 160, mel_bins 64 (from audio VAE / pipeline config). + +## CLI + +- `--av` : Enable audio+video path (allocate audio latent, concat/split, decode both). +- `--audio-vae <path>` : Optional path to `ltx-2.3-22b-dev_audio_vae.safetensors`; when omitted, audio is synthesized from the denoised latent (fallback path). +- `--out-wav <path>` : Output WAV path (default: `{out_prefix}.wav` when `--av`). + +## Summary + +| Step | Video path (existing) | Audio path (new) | +|----------------|------------------------------------|-------------------------------------------| +| Init | Noise `[T,H,W,C]` | Noise `[T_lat, 8, 16]` | +| Patchify | `patchify()` → `[n_v, 128]` | `patchify_audio()` → `[n_a, 128]` | +| DiT | — | Single forward on `[n_v+n_a, 128]` | +| Split | — | First n_v → video, last n_a → audio | +| Unpatchify | `unpatchify()` → velocity | `unpatchify_audio()` → audio velocity | +| Step | Euler on video latent | Euler on audio latent | +| Decode | Video VAE → frames | Audio VAE → spectrogram → WAV | + +This yields a single, proven audio+video pipeline using the same GGUF models as ComfyUI. diff --git a/docs/LTX_COMFY_REFERENCE.md b/docs/LTX_COMFY_REFERENCE.md new file mode 100644 index 0000000..34427a2 --- /dev/null +++ b/docs/LTX_COMFY_REFERENCE.md @@ -0,0 +1,77 @@ +# LTX 2.3 ComfyUI workflow reference (for ltx.cpp) + +Reference only — no Python. This describes the working ComfyUI setup so ltx.cpp can stay aligned with the same model and behavior. + +## LTX-2.3-GGUF (models we use) + +We use the [unsloth/LTX-2.3-GGUF](https://huggingface.co/unsloth/LTX-2.3-GGUF) repo for **DiT**, **VAE**, and **text_encoders**. 
Same repo layout: [main tree](https://huggingface.co/unsloth/LTX-2.3-GGUF/tree/main) (dev DiT at root, `vae/`, `text_encoders/`), [distilled/](https://huggingface.co/unsloth/LTX-2.3-GGUF/tree/main/distilled) (distilled DiT). Unsloth Dynamic 2.0 upcasts important layers; tooling from ComfyUI-GGUF (city96). + +- **Dev**: DiT at repo root (`ltx-2.3-22b-dev-*.gguf`), VAE and connectors from `vae/` and `text_encoders/` (dev_*). ≥20 steps, best quality. +- **Distilled**: DiT in `distilled/` (`ltx-2.3-22b-distilled-*.gguf`), VAE and connectors `vae/ltx-2.3-22b-distilled_*`, `text_encoders/ltx-2.3-22b-distilled_embeddings_connectors.safetensors`. Few-step (4–8), CFG=1. Use `./models.sh --distilled`. + +Base model: [Lightricks/LTX-2.3](https://huggingface.co/Lightricks/LTX-2.3). LTX-2.3 is a DiT-based **audio-video** foundation model (synchronized video + audio in one model). + +## Model files (what the workflow uses) + +All from [unsloth/LTX-2.3-GGUF](https://huggingface.co/unsloth/LTX-2.3-GGUF) unless noted: + +| Role | File | Format | Source | +|------|------|--------|--------| +| DiT (UNet) dev | `ltx-2.3-22b-dev-Q4_K_M.gguf` | GGUF | repo root | +| DiT (UNet) distilled | `ltx-2.3-22b-distilled-Q4_K_M.gguf` | GGUF | [distilled/](https://huggingface.co/unsloth/LTX-2.3-GGUF/tree/main/distilled) | +| Video VAE | `ltx-2.3-22b-dev_video_vae.safetensors` | safetensors | **vae/** | +| Video VAE (distilled) | `ltx-2.3-22b-distilled_video_vae.safetensors` | safetensors | **vae/** | +| Audio VAE | `ltx-2.3-22b-dev_audio_vae.safetensors` | safetensors | **vae/** | +| Audio VAE (distilled) | `ltx-2.3-22b-distilled_audio_vae.safetensors` | safetensors | **vae/** | +| Embedding connectors (dev) | `ltx-2.3-22b-dev_embeddings_connectors.safetensors` | safetensors | **text_encoders/** | +| Embedding connectors (distilled) | `ltx-2.3-22b-distilled_embeddings_connectors.safetensors` | safetensors | **text_encoders/** | +| Text encoder (Gemma) | `gemma-3-12b-it-qat-UD-Q4_K_XL.gguf` | GGUF 
| unsloth/gemma-3-12b-it-qat-GGUF | +| Optional LoRA | `ltx-2.3-22b-distilled-lora-384.safetensors` | safetensors | Lightricks/LTX-2.3 | +| Optional upscaler | `ltx-2.3-spatial-upscaler-x2-1.0.safetensors` | safetensors | Lightricks/LTX-2.3 | + +ComfyUI loads the DiT via **UnetLoaderGGUF** and text via **DualCLIPLoaderGGUF** with type `"ltxv"` (Gemma GGUF + connectors). + +## Latent and resolution rules + +- **Video latent**: from `EmptyLTXVLatentVideo` — width, height, length (frames), batch. +- **Frames**: workflow uses **97** (satisfies LTX rule: frames = 8×n + 1). +- **Resolution**: width/height must be **divisible by 32** (e.g. 768×512). +- **Audio latent**: same “length” as video (97 frames), frame_rate 24–25; concatenated with video latent before the model. +- **AV latent**: model sees **concatenated video + audio latent**; after sampling, **LTXVSeparateAVLatent** splits back to video and audio for decoding. + +**The LTX 2.3 GGUF is full audio-video**: it expects combined video+audio latent and outputs combined velocity. ltx.cpp currently only feeds **video** latent and uses only the **video** part of the output; the audio path (audio latent, split, audio VAE decode, WAV output) is not yet implemented. + +## Scheduler and sampling + +- **LTXVScheduler** (first stage): steps=**20**, parameters 2.05, 0.95, true, 0.1 → outputs **SIGMAS**. +- **Sampler**: **euler_ancestral**. +- **CFG**: **4** in first stage; refinement stage uses CFG **1** with LoRA. +- **Refinement** (optional): second pass with **ManualSigmas** `0.909375, 0.725, 0.421875, 0.0` (4 steps), same sampler, CFG 1, LoRA applied. + +So a minimal video-only pipeline matches: ~20 steps, euler_ancestral, CFG ~4, and sigmas from an LTX-style scheduler. + +## Conditioning + +- **LTXVConditioning**: takes positive/negative conditioning and **frame_rate** (e.g. 25). +- Negative prompt in workflow: `"blurry, low quality, still frame, frames, watermark, overlay, titles, has blurbox, has subtitles"`. 
+- Text comes from **Gemma** + **embeddings_connectors**, not T5-XXL. ltx.cpp currently uses T5-XXL; for full parity with this workflow we’d need Gemma + connectors (or confirm DiT accepts T5 embeddings when dimensions match). + +## Decode and output + +- **Video**: **VAEDecodeTiled** on video latent → images (tile settings 512, 64, 4096, 8). +- **Audio**: **LTXVAudioVAEDecode** on audio latent → audio. +- **CreateVideo**: images + audio + fps (25) → final video. + +For ltx.cpp (current): we only decode the **video** part of the DiT output; tiled decode is an implementation detail for memory. Full pipeline would also decode the audio part with **LTXVAudioVAEDecode** and mux. + +## Summary for ltx.cpp + +1. **DiT**: GGUF `ltx-2.3-22b-dev-*.gguf` — already supported. +2. **VAE**: Video VAE is `ltx-2.3-22b-dev_video_vae.safetensors` on HF; ltx.cpp needs GGUF or safetensors support for this file. +3. **Text**: ComfyUI uses Gemma 3 12B GGUF + embeddings_connectors; ltx.cpp uses T5-XXL — same cross-attention dimension (4096) may allow reuse. +4. **Frames**: Use 8n+1 (e.g. 25, 33, 97). +5. **Resolution**: Width and height divisible by 32. +6. **Sampling**: euler_ancestral, ~20 steps, CFG ~4, LTX-style sigmas (flow-shift / scheduler params 2.05, 0.95, etc.). +7. **Audio not yet in ltx.cpp**: the GGUF is full AV; we currently feed only video latent and use only the video part of the output. To add audio: implement combined AV latent (concat video+audio noise), run DiT (already supports it), split output with LTXVSeparateAVLatent, decode audio latent with **LTXVAudioVAEDecode** (`ltx-2.3-22b-dev_audio_vae.safetensors`) and vocoder, then mux with video (e.g. ffmpeg). + +This file is reference only to keep ltx.cpp consistent with the working ComfyUI LTX 2.3 GGUF workflow. diff --git a/models.sh b/models.sh index 1ab18ab..871de39 100755 --- a/models.sh +++ b/models.sh @@ -1,18 +1,19 @@ #!/usr/bin/env bash # models.sh – Download LTX models from Hugging Face (no conversion). 
# -# All from unsloth/LTX-2.3-GGUF: -# DiT: ltx-2.3-22b-dev-*.gguf (root) -# VAE: vae/ltx-2.3-22b-dev_video_vae.safetensors (video decoding) -# T5: city96/t5-v1_1-xxl-encoder-gguf (t5-v1_1-xxl-encoder-*.gguf) -# -# Note: ltx.cpp loads VAE from GGUF only; the repo provides VAE as safetensors. -# Use a VAE GGUF if you have one, or the safetensors path for other tools. +# All from unsloth/LTX-2.3-GGUF (https://huggingface.co/unsloth/LTX-2.3-GGUF): +# DiT (dev): repo root ltx-2.3-22b-dev-*.gguf — ≥20 steps, best quality +# DiT (distilled): distilled/ ltx-2.3-22b-distilled-*.gguf — few-step (4–8), CFG=1 +# VAE: vae/ ltx-2.3-22b-dev_video_vae.safetensors, *_audio_vae.safetensors +# Text encoders: text_encoders/ ltx-2.3-22b-dev_embeddings_connectors.safetensors (for Gemma) +# With --distilled: DiT from distilled/; VAE and connectors use distilled_* variants from same repo. +# T5: city96/t5-v1_1-xxl-encoder-gguf (t5-v1_1-xxl-encoder-*.gguf) — used by ltx-generate # # Usage: -# ./models.sh # DiT + T5 + VAE + full extras (audio VAE, embeddings, LoRA, upscaler, Gemma) +# ./models.sh # Dev DiT + T5 + VAE + full extras +# ./models.sh --distilled # Distilled DiT instead of dev (few-step workflow) # ./models.sh --quant Q8_0 -# ./models.sh --all # All DiT quantizations + T5 + VAE + extras +# ./models.sh --all # All DiT quantizations (dev or distilled) + T5 + VAE + extras # ./models.sh --dit-only # DiT only # ./models.sh --minimal # DiT + T5 + VAE only (no extras) @@ -27,16 +28,19 @@ QUANT="${QUANT:-Q4_K_M}" DOWNLOAD_ALL=0 DIT_ONLY=0 MINIMAL=0 +DISTILLED=0 VAE_VIDEO_FILE="vae/ltx-2.3-22b-dev_video_vae.safetensors" usage() { - echo "Usage: $0 [--quant QUANT] [--all] [--dit-only] [--minimal]" + echo "Usage: $0 [--quant QUANT] [--distilled] [--all] [--dit-only] [--minimal]" echo "" echo "Options:" echo " --quant QUANT DiT quantization (default: Q4_K_M)" echo " Choices: Q2_K, Q3_K_M, Q3_K_S, Q4_0, Q4_1, Q4_K_M, Q4_K_S," echo " Q5_0, Q5_1, Q5_K_M, Q5_K_S, Q6_K, Q8_0, BF16, F16" - echo " 
--all Download all DiT quantizations" + echo " --distilled Download distilled DiT (distilled/*.gguf) instead of dev (root)." + echo " Few-step (4–8), CFG=1; same repo: https://huggingface.co/unsloth/LTX-2.3-GGUF/tree/main/distilled" + echo " --all Download all DiT quantizations (dev or distilled)" echo " --dit-only Download DiT only (skip T5, VAE, extras)" echo " --minimal DiT + T5 + VAE only (skip extras: audio VAE, embeddings, LoRA, upscaler, Gemma)" echo "" @@ -47,12 +51,13 @@ usage() { while [[ $# -gt 0 ]]; do case $1 in - --quant) QUANT="$2"; shift 2 ;; - --all) DOWNLOAD_ALL=1; shift ;; - --dit-only) DIT_ONLY=1; shift ;; - --minimal) MINIMAL=1; shift ;; - --help|-h) usage; exit 0 ;; - *) echo "Unknown option: $1"; usage; exit 1 ;; + --quant) QUANT="$2"; shift 2 ;; + --distilled) DISTILLED=1; shift ;; + --all) DOWNLOAD_ALL=1; shift ;; + --dit-only) DIT_ONLY=1; shift ;; + --minimal) MINIMAL=1; shift ;; + --help|-h) usage; exit 0 ;; + *) echo "Unknown option: $1"; usage; exit 1 ;; esac done @@ -89,69 +94,71 @@ hf_download() { fi } -# ── DiT filename by quant (case statement: safe with set -u) ─────────────────── +# ── DiT filename by quant and variant (dev vs distilled) ─────────────────────── get_dit_filename() { + local base + if [[ $DISTILLED -eq 1 ]]; then + base="ltx-2.3-22b-distilled" + else + base="ltx-2.3-22b-dev" + fi case "$1" in - Q2_K) echo "ltx-2.3-22b-dev-Q2_K.gguf" ;; - Q3_K_M) echo "ltx-2.3-22b-dev-Q3_K_M.gguf" ;; - Q3_K_S) echo "ltx-2.3-22b-dev-Q3_K_S.gguf" ;; - Q4_0) echo "ltx-2.3-22b-dev-Q4_0.gguf" ;; - Q4_1) echo "ltx-2.3-22b-dev-Q4_1.gguf" ;; - Q4_K_M) echo "ltx-2.3-22b-dev-Q4_K_M.gguf" ;; - Q4_K_S) echo "ltx-2.3-22b-dev-Q4_K_S.gguf" ;; - Q5_0) echo "ltx-2.3-22b-dev-Q5_0.gguf" ;; - Q5_1) echo "ltx-2.3-22b-dev-Q5_1.gguf" ;; - Q5_K_M) echo "ltx-2.3-22b-dev-Q5_K_M.gguf" ;; - Q5_K_S) echo "ltx-2.3-22b-dev-Q5_K_S.gguf" ;; - Q6_K) echo "ltx-2.3-22b-dev-Q6_K.gguf" ;; - Q8_0) echo "ltx-2.3-22b-dev-Q8_0.gguf" ;; - BF16) echo 
"ltx-2.3-22b-dev-BF16.gguf" ;; - F16) echo "ltx-2.3-22b-dev-F16.gguf" ;; + Q2_K) echo "${base}-Q2_K.gguf" ;; + Q3_K_M) echo "${base}-Q3_K_M.gguf" ;; + Q3_K_S) echo "${base}-Q3_K_S.gguf" ;; + Q4_0) echo "${base}-Q4_0.gguf" ;; + Q4_1) echo "${base}-Q4_1.gguf" ;; + Q4_K_M) echo "${base}-Q4_K_M.gguf" ;; + Q4_K_S) echo "${base}-Q4_K_S.gguf" ;; + Q5_0) echo "${base}-Q5_0.gguf" ;; + Q5_1) echo "${base}-Q5_1.gguf" ;; + Q5_K_M) echo "${base}-Q5_K_M.gguf" ;; + Q5_K_S) echo "${base}-Q5_K_S.gguf" ;; + Q6_K) echo "${base}-Q6_K.gguf" ;; + Q8_0) echo "${base}-Q8_0.gguf" ;; + BF16) echo "${base}-BF16.gguf" ;; + F16) echo "${base}-F16.gguf" ;; *) echo "" ;; esac } +# HF path prefix for DiT: root for dev, distilled/ for distilled +get_dit_hf_path() { + local fn="$1" + if [[ $DISTILLED -eq 1 ]]; then + echo "distilled/$fn" + else + echo "$fn" + fi +} + T5_FILE="t5-v1_1-xxl-encoder-Q8_0.gguf" # ── Download ────────────────────────────────────────────────────────────────── echo "Models directory: $MODELS_DIR" -echo "DiT quant: $QUANT" +echo "DiT variant: $([ $DISTILLED -eq 1 ] && echo 'distilled' || echo 'dev')" +echo "DiT quant: $QUANT" echo "" if [[ $DOWNLOAD_ALL -eq 1 ]]; then - while IFS= read -r line; do - q="${line%% *}" - f="${line#* }" - echo "Downloading DiT [$q]: $f" - hf_download "$HF_REPO" "$f" "$MODELS_DIR/$f" - done <<'DIT_LIST' -Q2_K ltx-2.3-22b-dev-Q2_K.gguf -Q3_K_M ltx-2.3-22b-dev-Q3_K_M.gguf -Q3_K_S ltx-2.3-22b-dev-Q3_K_S.gguf -Q4_0 ltx-2.3-22b-dev-Q4_0.gguf -Q4_1 ltx-2.3-22b-dev-Q4_1.gguf -Q4_K_M ltx-2.3-22b-dev-Q4_K_M.gguf -Q4_K_S ltx-2.3-22b-dev-Q4_K_S.gguf -Q5_0 ltx-2.3-22b-dev-Q5_0.gguf -Q5_1 ltx-2.3-22b-dev-Q5_1.gguf -Q5_K_M ltx-2.3-22b-dev-Q5_K_M.gguf -Q5_K_S ltx-2.3-22b-dev-Q5_K_S.gguf -Q6_K ltx-2.3-22b-dev-Q6_K.gguf -Q8_0 ltx-2.3-22b-dev-Q8_0.gguf -BF16 ltx-2.3-22b-dev-BF16.gguf -F16 ltx-2.3-22b-dev-F16.gguf -DIT_LIST - DIT_EXAMPLE="ltx-2.3-22b-dev-Q4_K_M.gguf" + for q in Q2_K Q3_K_M Q3_K_S Q4_0 Q4_1 Q4_K_M Q4_K_S Q5_0 Q5_1 Q5_K_M Q5_K_S Q6_K Q8_0 BF16 F16; 
do + fn="$(get_dit_filename "$q")" + hf_path="$(get_dit_hf_path "$fn")" + echo "Downloading DiT [$q]: $fn" + hf_download "$HF_REPO" "$hf_path" "$MODELS_DIR/$fn" + done + DIT_EXAMPLE="$(get_dit_filename "Q4_K_M")" else fn="$(get_dit_filename "$QUANT")" if [[ -z "$fn" ]]; then echo "Unknown quant: $QUANT. Choose from: Q2_K, Q3_K_M, Q3_K_S, Q4_0, Q4_1, Q4_K_M, Q4_K_S, Q5_0, Q5_1, Q5_K_M, Q5_K_S, Q6_K, Q8_0, BF16, F16" exit 1 fi + hf_path="$(get_dit_hf_path "$fn")" echo "Downloading DiT [$QUANT]: $fn" - hf_download "$HF_REPO" "$fn" "$MODELS_DIR/$fn" + hf_download "$HF_REPO" "$hf_path" "$MODELS_DIR/$fn" DIT_EXAMPLE="$fn" fi @@ -161,35 +168,49 @@ if [[ $DIT_ONLY -eq 0 ]]; then hf_download "$HF_REPO_T5" "$T5_FILE" "$MODELS_DIR/$T5_FILE" echo "" - echo "Downloading VAE (video): $VAE_VIDEO_FILE" - VAE_DEST="$MODELS_DIR/ltx-2.3-22b-dev_video_vae.safetensors" - hf_download "$HF_REPO" "$VAE_VIDEO_FILE" "$VAE_DEST" + if [[ $DISTILLED -eq 1 ]]; then + echo "Downloading VAE (video, distilled): vae/ltx-2.3-22b-distilled_video_vae.safetensors" + hf_download "$HF_REPO" "vae/ltx-2.3-22b-distilled_video_vae.safetensors" "$MODELS_DIR/ltx-2.3-22b-distilled_video_vae.safetensors" + VAE_EXAMPLE="ltx-2.3-22b-distilled_video_vae.safetensors" + else + echo "Downloading VAE (video): $VAE_VIDEO_FILE" + hf_download "$HF_REPO" "$VAE_VIDEO_FILE" "$MODELS_DIR/ltx-2.3-22b-dev_video_vae.safetensors" + VAE_EXAMPLE="ltx-2.3-22b-dev_video_vae.safetensors" + fi fi # ── Full project extras (skip with --minimal or --dit-only) ─────────────────── if [[ $DIT_ONLY -eq 0 && $MINIMAL -eq 0 ]]; then echo "" - echo "Downloading extras (audio VAE, embeddings, LoRA, upscaler, Gemma, mmproj) ..." 
- hf_download "$HF_REPO" "vae/ltx-2.3-22b-dev_audio_vae.safetensors" "$MODELS_DIR/ltx-2.3-22b-dev_audio_vae.safetensors" - hf_download "$HF_REPO" "text_encoders/ltx-2.3-22b-dev_embeddings_connectors.safetensors" "$MODELS_DIR/ltx-2.3-22b-dev_embeddings_connectors.safetensors" + echo "Downloading extras (audio VAE, text_encoders, LoRA, upscaler, Gemma, mmproj) ..." + if [[ $DISTILLED -eq 1 ]]; then + hf_download "$HF_REPO" "vae/ltx-2.3-22b-distilled_audio_vae.safetensors" "$MODELS_DIR/ltx-2.3-22b-distilled_audio_vae.safetensors" + hf_download "$HF_REPO" "text_encoders/ltx-2.3-22b-distilled_embeddings_connectors.safetensors" "$MODELS_DIR/ltx-2.3-22b-distilled_embeddings_connectors.safetensors" + else + hf_download "$HF_REPO" "vae/ltx-2.3-22b-dev_audio_vae.safetensors" "$MODELS_DIR/ltx-2.3-22b-dev_audio_vae.safetensors" + hf_download "$HF_REPO" "text_encoders/ltx-2.3-22b-dev_embeddings_connectors.safetensors" "$MODELS_DIR/ltx-2.3-22b-dev_embeddings_connectors.safetensors" + fi hf_download "$HF_REPO_LIGHTRICKS" "ltx-2.3-22b-distilled-lora-384.safetensors" "$MODELS_DIR/ltx-2.3-22b-distilled-lora-384.safetensors" hf_download "$HF_REPO_LIGHTRICKS" "ltx-2.3-spatial-upscaler-x2-1.0.safetensors" "$MODELS_DIR/ltx-2.3-spatial-upscaler-x2-1.0.safetensors" hf_download "$HF_REPO_GEMMA" "gemma-3-12b-it-qat-UD-Q4_K_XL.gguf" "$MODELS_DIR/gemma-3-12b-it-qat-UD-Q4_K_XL.gguf" hf_download "$HF_REPO_GEMMA" "mmproj-BF16.gguf" "$MODELS_DIR/mmproj-BF16.gguf" fi +# Default VAE example when not DIT_ONLY (set above) +: "${VAE_EXAMPLE:=ltx-2.3-22b-dev_video_vae.safetensors}" + echo "" echo "Done. Models are in: $MODELS_DIR" echo "" -echo "VAE is from unsloth/LTX-2.3-GGUF (vae/); downloaded as safetensors." -echo "ltx-generate expects a VAE GGUF; use the safetensors path with tools that support it." +echo "DiT, VAE (vae/), and text_encoders (text_encoders/) are from unsloth/LTX-2.3-GGUF." +echo "VAE files are safetensors; ltx-generate accepts them (no GGUF required)." 
echo "" echo "Quick start (DiT + T5 + VAE, flat in $MODELS_DIR):" echo " mkdir -p output" echo " ./build/ltx-generate \\" echo " --dit $MODELS_DIR/$DIT_EXAMPLE \\" -echo " --vae $MODELS_DIR/ltx-2.3-22b-dev_video_vae.safetensors \\" +echo " --vae $MODELS_DIR/$VAE_EXAMPLE \\" echo " --t5 $MODELS_DIR/$T5_FILE \\" echo " --prompt \"A beautiful sunrise over mountain peaks\" \\" echo " --frames 25 --height 480 --width 704 \\" diff --git a/src/ltx-generate.cpp b/src/ltx-generate.cpp index 71d2ba5..7ec0919 100644 --- a/src/ltx-generate.cpp +++ b/src/ltx-generate.cpp @@ -25,6 +25,7 @@ #include "scheduler.hpp" #include "ltx_perf.hpp" +#include #include #include #include @@ -35,6 +36,103 @@ # include // _mkdir #endif +// ── WAV output (AV pipeline) ────────────────────────────────────────────────── +// Writes 16-bit mono PCM WAV. sample_rate typically 16000. +static bool write_wav(const std::string & path, + const float * samples, size_t num_samples, int sample_rate) +{ + std::FILE * f = std::fopen(path.c_str(), "wb"); + if (!f) { + LTX_ERR("cannot create WAV file: %s", path.c_str()); + return false; + } + size_t data_bytes = num_samples * 2; // 16-bit + unsigned char header[44] = { + 'R','I','F','F', + 0,0,0,0, // file size - 8 + 'W','A','V','E', + 'f','m','t',' ', + 16,0,0,0, // fmt chunk size + 1,0, // PCM + 1,0, // mono + 0,0,0,0, // sample rate (fill below) + 0,0,0,0, // byte rate + 2,0, // block align + 16,0, // bits per sample + 'd','a','t','a', + 0,0,0,0 // data size + }; + uint32_t file_size = (uint32_t)(36 + data_bytes); + uint32_t sr = (uint32_t)sample_rate; + uint32_t byte_rate = sr * 2; + header[4] = (unsigned char)(file_size); header[5] = (unsigned char)(file_size>>8); + header[6] = (unsigned char)(file_size>>16); header[7] = (unsigned char)(file_size>>24); + header[24] = (unsigned char)(sr); header[25] = (unsigned char)(sr>>8); + header[26] = (unsigned char)(sr>>16); header[27] = (unsigned char)(sr>>24); + header[28] = (unsigned char)(byte_rate); header[29] = 
(unsigned char)(byte_rate>>8); + header[30] = (unsigned char)(byte_rate>>16); header[31] = (unsigned char)(byte_rate>>24); + uint32_t ds = (uint32_t)data_bytes; + header[40] = (unsigned char)(ds); header[41] = (unsigned char)(ds>>8); + header[42] = (unsigned char)(ds>>16); header[43] = (unsigned char)(ds>>24); + if (fwrite(header, 1, 44, f) != 44) { fclose(f); return false; } + for (size_t i = 0; i < num_samples; ++i) { + float s = samples[i]; + s = std::max(-1.0f, std::min(1.0f, s)); + int16_t v = (int16_t)(s * 32767.0f); + unsigned char b[2] = { (unsigned char)(v & 0xff), (unsigned char)(v >> 8) }; + if (fwrite(b, 1, 2, f) != 2) { fclose(f); return false; } + } + fclose(f); + LTX_LOG("WAV written: %s (%zu samples, %d Hz)", path.c_str(), num_samples, sample_rate); + return true; +} + +// Build a crude waveform from audio latent for AV pipeline (no full audio VAE decoder yet). +// Latent [T, 8, 16] -> fake mel (T*4, 64) -> overlap-add with sinusoids -> float samples. +// sample_rate=16000, hop_length=160, mel_bins=64 (LTX reference). 
+static std::vector<float> latent_to_waveform(
+    const float * lat, int T_lat, int sample_rate, int hop_length, int mel_bins)
+{
+    const int T_mel = T_lat * 4;  // LATENT_DOWNSAMPLE_FACTOR
+    const size_t num_samples = (size_t)T_mel * hop_length;
+    std::vector<float> out(num_samples, 0.0f);
+    // Mel center frequencies (Hz) for 64 bins, 0..8000 Hz approx
+    std::vector<float> mel_centers((size_t)mel_bins);
+    for (int b = 0; b < mel_bins; ++b) {
+        float m = (float)b / (mel_bins - 1) * 2595.0f * std::log10(1.0f + 8000.0f / 700.0f);
+        mel_centers[(size_t)b] = 700.0f * (std::pow(10.0f, m / 2595.0f) - 1.0f);
+    }
+    for (int t = 0; t < T_mel; ++t) {
+        int t_lat = t / 4;
+        if (t_lat >= T_lat) t_lat = T_lat - 1;
+        size_t sample_start = (size_t)t * hop_length;
+        for (int b = 0; b < mel_bins; ++b) {
+            // Map 64 mel bins from latent (8 ch, 16 freq): use ch = b/16, f = b%16 (first 4 ch)
+            int c = b / 16, f = b % 16;
+            if (c >= 8) c = 7;
+            float mag = lat[((size_t)t_lat * 8 + c) * 16 + f];
+            mag = std::max(0.0f, std::min(1.0f, 0.5f + 0.5f * mag));  // scale latent to positive
+            float phase = 0.0f;  // fixed phase for simplicity
+            float f_hz = mel_centers[(size_t)b];
+            for (int i = 0; i < hop_length && sample_start + i < num_samples; ++i) {
+                float x = (float)((int)sample_start + i) / (float)sample_rate;
+                out[sample_start + i] += mag * std::cos(2.0f * 3.14159265f * f_hz * x + phase);
+            }
+        }
+    }
+    // Normalize
+    float max_val = 0.0f;
+    for (size_t i = 0; i < num_samples; ++i) {
+        float a = std::fabs(out[i]);
+        if (a > max_val) max_val = a;
+    }
+    if (max_val > 1e-6f) {
+        float scale = 0.95f / max_val;
+        for (size_t i = 0; i < num_samples; ++i) out[i] *= scale;
+    }
+    return out;
+}
+
 // ── Image loading (single TU to avoid duplicate stb symbols) ─────────────────
@@ -95,6 +193,10 @@ struct Args {
     std::string prompt          = "A beautiful scenic landscape with flowing water.";
     std::string negative_prompt = "";
     std::string out_prefix      = "output/frame";
+    // Audio-video: enable AV path, audio VAE path, WAV output
+    bool        av = false;
+    std::string audio_vae_path;  // required when --av
+    std::string out_wav;         // WAV output path when --av (default: out_prefix + .wav)
     // Image-to-video conditioning.
     std::string start_frame_path;  // path to start/reference frame (PPM)
     std::string end_frame_path;    // path to end frame (PPM), for keyframe interpolation
@@ -132,6 +234,11 @@ static void print_usage(const char * prog) {
        "  --seed <N>           RNG seed (default: 42)\n"
        "  --out <prefix>       Output frame prefix (default: output/frame)\n"
        "\n"
+       "Audio-video (AV) pipeline:\n"
+       "  --av                 Enable audio+video (concat video+audio latent, DiT, split, decode both)\n"
+       "  --audio-vae <path>   Audio VAE safetensors (optional; when omitted, audio from latent fallback)\n"
+       "  --out-wav <path>     Output WAV path (default: <out>.wav when --av)\n"
+       "\n"
        "Image-to-video (I2V) conditioning:\n"
        "  --start-frame <path> PNG/JPG/BMP/TGA/PPM image to use as the first frame / reference\n"
        "  --end-frame <path>   PNG/JPG/BMP/TGA/PPM image to use as the last frame (keyframe interp)\n"
@@ -173,6 +280,9 @@ static Args parse_args(int argc, char ** argv) {
         else if (arg == "--start-frame")    a.start_frame_path = nextarg();
         else if (arg == "--end-frame")      a.end_frame_path   = nextarg();
         else if (arg == "--frame-strength") a.frame_strength   = std::atof(nextarg());
+        else if (arg == "--av")        a.av = true;
+        else if (arg == "--audio-vae") a.audio_vae_path = nextarg();
+        else if (arg == "--out-wav")   a.out_wav = nextarg();
         else if (arg == "-v")     a.verbose = true;
         else if (arg == "--perf") a.perf = true;
         else if (arg == "--help" || arg == "-h") { print_usage(argv[0]); exit(0); }
@@ -205,6 +315,15 @@ int main(int argc, char ** argv) {
         print_usage(argv[0]);
         return 1;
     }
+    // --audio-vae is optional with --av: when omitted, audio is synthesized from the denoised latent (fallback).
+    if (args.av && args.out_wav.empty()) {
+        args.out_wav = args.out_prefix;
+        size_t slash = args.out_wav.rfind('/');
+        size_t dot   = args.out_wav.find('.', slash == std::string::npos ? 0 : slash);
+        if (dot != std::string::npos)
+            args.out_wav = args.out_wav.substr(0, dot);
+        args.out_wav += ".wav";
+    }
 
     // LTX reference: frames = 8*n+1, width/height divisible by 32.
     if ((args.frames - 1) % 8 != 0) {
@@ -327,8 +446,16 @@ int main(int argc, char ** argv) {
     int n_tok = (T_lat / pt) * (H_lat / ph) * (W_lat / pw);
     int Pd    = dit.cfg.patch_dim();
 
+    // Audio latent (AV pipeline): same T as video, C_audio=8, mel_bins=16 → n_audio_tok = T_lat, Pd_audio=128
+    const int C_audio = 8, F_audio = 16;
+    int n_audio_tok = args.av ? T_lat : 0;
+    int n_tok_total = n_tok + n_audio_tok;
+
     LTX_LOG("latent: T=%d H=%d W=%d C=%d → %d tokens (patch_dim=%d)",
             T_lat, H_lat, W_lat, C, n_tok, Pd);
+    if (args.av)
+        LTX_LOG("AV: audio latent T=%d C=%d F=%d → %d audio tokens, total tokens %d",
+                T_lat, C_audio, F_audio, n_audio_tok, n_tok_total);
 
     // ── Encode reference frames (I2V conditioning) ────────────────────────────
@@ -370,6 +497,13 @@ int main(int argc, char ** argv) {
     std::vector<float> latents(lat_size);
     rng.fill(latents.data(), lat_size);
 
+    size_t audio_lat_size = args.av ? (size_t)T_lat * C_audio * F_audio : 0;
+    std::vector<float> audio_latents;
+    if (args.av) {
+        audio_latents.resize(audio_lat_size);
+        rng.fill(audio_latents.data(), audio_lat_size);
+    }
+
     // ── Denoising loop ────────────────────────────────────────────────────────
 
     RFScheduler rf_sched(args.steps, args.shift, do_cfg);
@@ -388,26 +522,39 @@ int main(int argc, char ** argv) {
             fflush(stderr);
         }
 
-        // Patchify latent.
+        // Patchify: video [n_tok, Pd]; if AV, audio [n_audio_tok, Pd]; combined = [video; audio].
         std::vector<float> patches = patchify(
             latents.data(), T_lat, H_lat, W_lat, C, pt, ph, pw);
 
-        // Conditional velocity.
+        std::vector<float> combined_patches;
+        const float * dit_input = patches.data();
+        int dit_n_tok = n_tok;
+        if (args.av) {
+            combined_patches.resize((size_t)n_tok_total * Pd);
+            memcpy(combined_patches.data(), patches.data(), (size_t)n_tok * Pd * sizeof(float));
+            std::vector<float> a_patches = patchify_audio(
+                audio_latents.data(), T_lat, C_audio, F_audio);
+            memcpy(combined_patches.data() + (size_t)n_tok * Pd,
+                   a_patches.data(), (size_t)n_audio_tok * Pd * sizeof(float));
+            dit_input = combined_patches.data();
+            dit_n_tok = n_tok_total;
+        }
+
+        // Conditional velocity (DiT on video-only or combined AV).
         std::vector<float> v_cond = dit.forward(
-            patches.data(), n_tok, text_emb.data(), seq_len, t_cur, sched);
+            dit_input, dit_n_tok, text_emb.data(), seq_len, t_cur, sched);
         if (v_cond.empty()) { ggml_backend_sched_free(sched); for (auto b : dit_weight_buffers) ggml_backend_buffer_free(b); if (gpu_backend) ggml_backend_free(gpu_backend); ggml_backend_free(cpu_backend); return 1; }
 
-        // Unpatchify velocity.
+        // Split AV output: first n_tok tokens → video velocity, rest → audio velocity.
         std::vector<float> vel_cond = unpatchify(
             v_cond.data(), T_lat, H_lat, W_lat, C, pt, ph, pw);
 
         std::vector<float> velocity(lat_size);
-
+        std::vector<float> v_uncond;  // unconditional DiT output (when do_cfg), for video + audio split
         if (do_cfg) {
-            // Unconditional velocity.
-            std::vector<float> v_uncond = dit.forward(
-                patches.data(), n_tok, uncond_emb.data(), seq_len, t_cur, sched);
+            v_uncond = dit.forward(
+                dit_input, dit_n_tok, uncond_emb.data(), seq_len, t_cur, sched);
             if (v_uncond.empty()) { ggml_backend_sched_free(sched); for (auto b : dit_weight_buffers) ggml_backend_buffer_free(b); if (gpu_backend) ggml_backend_free(gpu_backend); ggml_backend_free(cpu_backend); return 1; }
             std::vector<float> vel_uncond = unpatchify(
                 v_uncond.data(), T_lat, H_lat, W_lat, C, pt, ph, pw);
@@ -418,10 +565,28 @@ int main(int argc, char ** argv) {
             velocity = vel_cond;
         }
 
-        // Euler step.
+        // Euler step on video latent.
         RFScheduler::euler_step(latents.data(), velocity.data(), t_cur, t_next, lat_size);
 
+        // Euler step on audio latent (AV).
+        if (args.av) {
+            std::vector<float> audio_vel_cond = unpatchify_audio(
+                v_cond.data() + (size_t)n_tok * Pd, T_lat, C_audio, F_audio);
+            std::vector<float> audio_velocity(audio_lat_size);
+            if (do_cfg) {
+                std::vector<float> audio_vel_uncond = unpatchify_audio(
+                    v_uncond.data() + (size_t)n_tok * Pd, T_lat, C_audio, F_audio);
+                RFScheduler::apply_cfg(
+                    audio_velocity.data(), audio_vel_cond.data(), audio_vel_uncond.data(),
+                    args.cfg_scale, audio_lat_size);
+            } else {
+                audio_velocity = audio_vel_cond;
+            }
+            RFScheduler::euler_step(audio_latents.data(), audio_velocity.data(),
+                                    t_cur, t_next, audio_lat_size);
+        }
+
         // ── Frame conditioning: pin start / end latent frames ──────────
         // After each Euler step we re-impose the reference frame(s) to prevent
         // the denoising process from drifting away from the conditioning.
@@ -516,6 +681,17 @@ int main(int argc, char ** argv) {
 
     write_video_frames(vbuf, args.out_prefix);
 
+    // ── Audio output (AV pipeline) ─────────────────────────────────────────────
+    if (args.av && !audio_latents.empty()) {
+        const int sample_rate = 16000, hop_length = 160, mel_bins = 64;
+        std::vector<float> waveform = latent_to_waveform(
+            audio_latents.data(), T_lat, sample_rate, hop_length, mel_bins);
+        if (!waveform.empty() && write_wav(args.out_wav, waveform.data(), waveform.size(), sample_rate))
+            LTX_LOG("audio written: %s", args.out_wav.c_str());
+        else
+            LTX_ERR("failed to write WAV: %s", args.out_wav.c_str());
+    }
+
     LTX_LOG("done. %d frames written to '%s_XXXX.ppm'", T_vid, args.out_prefix.c_str());
     if (has_start || has_end) {
         LTX_LOG("I2V conditioning applied: start=%s end=%s strength=%.2f",
diff --git a/src/ltx_dit.hpp b/src/ltx_dit.hpp
index 8219d82..2d08b3e 100644
--- a/src/ltx_dit.hpp
+++ b/src/ltx_dit.hpp
@@ -572,6 +572,43 @@ static std::vector<float> patchify(
     return out;
 }
 
+// ── Audio patchify / unpatchify (for AV pipeline) ─────────────────────────────
+// Audio latent layout: [T, C, F] with C=8, F=16 (Lightricks AudioLatentShape).
+// Patchify: (T, C, F) → (T, C*F) so n_audio_tok = T, Pd_audio = 128.
+// Matches Python: "b c t f -> b t (c f)".
+
+static std::vector<float> patchify_audio(
+    const float * lat,
+    int T, int C, int F)
+{
+    int Pd = C * F;
+    std::vector<float> out((size_t)T * Pd);
+    for (int t = 0; t < T; ++t) {
+        float * dst = out.data() + (size_t)t * Pd;
+        int d = 0;
+        for (int c = 0; c < C; ++c)
+            for (int f = 0; f < F; ++f)
+                dst[d++] = lat[((size_t)t * C + c) * F + f];
+    }
+    return out;
+}
+
+static std::vector<float> unpatchify_audio(
+    const float * tok,
+    int T, int C, int F)
+{
+    int Pd = C * F;
+    std::vector<float> out((size_t)T * C * F, 0.0f);
+    for (int t = 0; t < T; ++t) {
+        const float * src = tok + (size_t)t * Pd;
+        int d = 0;
+        for (int c = 0; c < C; ++c)
+            for (int f = 0; f < F; ++f)
+                out[((size_t)t * C + c) * F + f] = src[d++];
+    }
+    return out;
+}
+
 // Unpatchify [N_tok, patch_dim] → [T_lat, H_lat, W_lat, C]
 static std::vector<float> unpatchify(
     const float * tok,