From 1c6c1bc336a37c62628718bb0b12a1e1aa37a7af Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Fri, 24 Apr 2026 10:41:15 +0100 Subject: [PATCH 01/19] refactor(block): expand ConfigSpace to full virtio-blk layout The virtio-blk spec defines a 56-byte config space with fields for geometry, topology, writeback, and discard at fixed offsets. The previous struct only held `capacity` (8 bytes), so the driver never saw the fields beyond offset 8. The full layout is required to advertise VIRTIO_BLK_F_DISCARD: the spec mandates that `max_discard_sectors` (offset 40) and `max_discard_seg` (offset 44) be set before the feature can be used by the guest. Without the full struct, there is nowhere to write those values. All new fields default to zero, so there is no behavioral change in this commit. A compile-time size assertion (`assert!(size_of::() == 56)`) guards against future accidental padding changes. Tests are updated to use `..Default::default()` for forward-compatible ConfigSpace init. Signed-off-by: Nikita Kalyazin --- .../src/devices/virtio/block/virtio/device.rs | 41 ++++++++++++++++--- .../devices/virtio/block/virtio/persist.rs | 1 + 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index ecdd8ee4f6d..3f09a19c507 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -163,10 +163,27 @@ impl DiskProperties { #[derive(Debug, Default, Clone, Copy, Eq, PartialEq)] #[repr(C)] pub struct ConfigSpace { - pub capacity: u64, + pub capacity: u64, // offset 0 + pub size_max: u32, // offset 8 + pub seg_max: u32, // offset 12 + pub geometry_cylinders: u16, // offset 16 + pub geometry_heads: u8, // offset 18 + pub geometry_sectors: u8, // offset 19 + pub blk_size: u32, // offset 20 + pub topology_physical_block_exp: u8, // offset 24 + pub topology_alignment_offset: u8, // offset 25 + pub 
topology_min_io_size: u16, // offset 26 + pub topology_opt_io_size: u32, // offset 28 + pub writeback: u8, // offset 32 + pub(crate) _unused0: [u8; 3], // offset 33 + pub num_queues: u32, // offset 36 + pub max_discard_sectors: u32, // offset 40 + pub max_discard_seg: u32, // offset 44 + pub discard_sector_alignment: u32, // offset 48 + pub(crate) _unused1: [u8; 4], // offset 52 (alignment padding to 56) } - -// SAFETY: `ConfigSpace` contains only PODs in `repr(C)` or `repr(transparent)`, without padding. +const _: () = assert!(std::mem::size_of::<ConfigSpace>() == 56); +// SAFETY: repr(C), all POD fields, explicit padding — no implicit padding bytes. unsafe impl ByteValued for ConfigSpace {} /// Use this structure to set up the Block Device before booting the kernel. @@ -314,6 +331,7 @@ impl VirtioBlock { let config_space = ConfigSpace { capacity: disk_properties.nsectors.to_le(), + ..Default::default() }; Ok(VirtioBlock { @@ -821,11 +839,17 @@ mod tests { // This will read the number of sectors. // The block's backing file size is 0x1000, so there are 8 (4096/512) sectors. // The config space is little endian. - let expected_config_space = ConfigSpace { capacity: 8 }; + let expected_config_space = ConfigSpace { + capacity: 8, + ..Default::default() + }; assert_eq!(actual_config_space, expected_config_space); // Invalid read. 
- let expected_config_space = ConfigSpace { capacity: 696969 }; + let expected_config_space = ConfigSpace { + capacity: 696969, + ..Default::default() + }; actual_config_space = expected_config_space; block.read_config( std::mem::size_of::<ConfigSpace>() as u64 + 1, @@ -842,7 +866,10 @@ for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); - let expected_config_space = ConfigSpace { capacity: 696969 }; + let expected_config_space = ConfigSpace { + capacity: 696969, + ..Default::default() + }; block.write_config(0, expected_config_space.as_slice()); let mut actual_config_space = ConfigSpace::default(); @@ -852,6 +879,7 @@ // If privileged user writes to `/dev/mem`, in block config space - byte by byte. let expected_config_space = ConfigSpace { capacity: 0x1122334455667788, + ..Default::default() }; let expected_config_space_slice = expected_config_space.as_slice(); for (i, b) in expected_config_space_slice.iter().enumerate() { @@ -863,6 +891,7 @@ // Invalid write. let new_config_space = ConfigSpace { capacity: 0xDEADBEEF, + ..Default::default() }; block.write_config(5, new_config_space.as_slice()); // Make sure nothing got written. 
diff --git a/src/vmm/src/devices/virtio/block/virtio/persist.rs b/src/vmm/src/devices/virtio/block/virtio/persist.rs index 98f17c258ad..05e00b1b1fa 100644 --- a/src/vmm/src/devices/virtio/block/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/block/virtio/persist.rs @@ -113,6 +113,7 @@ impl Persist<'_> for VirtioBlock { let config_space = ConfigSpace { capacity: disk_properties.nsectors.to_le(), + ..Default::default() }; Ok(VirtioBlock { From a0adea0b7337bb8ecc26e9924f0c657021f82df5 Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Fri, 24 Apr 2026 10:45:00 +0100 Subject: [PATCH 02/19] refactor(block): add Discard request type and discard segment struct The virtio-blk spec (section 5.2.6.14) defines VIRTIO_BLK_T_DISCARD (type 11) as a request the guest driver issues to free sectors it no longer needs. To handle it, the VMM needs three things introduced here: - `RequestType::Discard` so the dispatcher can route the request type. - `DiscardWriteZeroes` (16 bytes, ByteValued) matching the layout of a single discard segment that the guest places in the data descriptor. - `DISCARD_SEGMENT_SIZE` constant (16) used in parse() to validate that the data buffer length is a non-zero multiple of the segment size. - `IoErr::InvalidFlags` / `IoErr::InvalidOffset` for per-segment validation errors added in the processing step. Placeholder arms in `finish()` and `process()` keep the build green; they are replaced with the real implementation in the next commit. 
Signed-off-by: Nikita Kalyazin --- .../devices/virtio/block/virtio/request.rs | 29 ++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/src/vmm/src/devices/virtio/block/virtio/request.rs b/src/vmm/src/devices/virtio/block/virtio/request.rs index 8fc83cf43da..e8a34ec5d2b 100644 --- a/src/vmm/src/devices/virtio/block/virtio/request.rs +++ b/src/vmm/src/devices/virtio/block/virtio/request.rs @@ -14,18 +14,35 @@ use crate::devices::virtio::block::virtio::device::DiskProperties; use crate::devices::virtio::block::virtio::metrics::BlockDeviceMetrics; pub use crate::devices::virtio::generated::virtio_blk::{ VIRTIO_BLK_ID_BYTES, VIRTIO_BLK_S_IOERR, VIRTIO_BLK_S_OK, VIRTIO_BLK_S_UNSUPP, - VIRTIO_BLK_T_FLUSH, VIRTIO_BLK_T_GET_ID, VIRTIO_BLK_T_IN, VIRTIO_BLK_T_OUT, + VIRTIO_BLK_T_DISCARD, VIRTIO_BLK_T_FLUSH, VIRTIO_BLK_T_GET_ID, VIRTIO_BLK_T_IN, + VIRTIO_BLK_T_OUT, }; use crate::devices::virtio::queue::DescriptorChain; use crate::logger::{IncMetric, error}; use crate::rate_limiter::{RateLimiter, TokenType}; use crate::vstate::memory::{ByteValued, Bytes, GuestAddress, GuestMemoryMmap}; +/// One virtio-blk discard/write_zeroes segment — virtio spec §5.2.6.14. +#[derive(Debug, Default, Copy, Clone)] +#[repr(C)] +pub struct DiscardWriteZeroes { + pub sector: u64, + pub num_sectors: u32, + pub flags: u32, +} +// SAFETY: repr(C), all POD fields, no implicit padding. 
+unsafe impl ByteValued for DiscardWriteZeroes {} + +pub const DISCARD_SEGMENT_SIZE: u32 = 16; +const _: () = assert!(std::mem::size_of::<DiscardWriteZeroes>() == DISCARD_SEGMENT_SIZE as usize); + #[derive(Debug, derive_more::From)] pub enum IoErr { GetId(GuestMemoryError), PartialTransfer { completed: u32, expected: u32 }, FileEngine(block_io::BlockIoError), + InvalidOffset, + InvalidFlags, } #[derive(Clone, Copy, Debug, PartialEq, Eq)] @@ -34,6 +51,7 @@ pub enum RequestType { Out, Flush, GetDeviceID, + Discard, Unsupported(u32), } @@ -44,6 +62,7 @@ impl From<u32> for RequestType { VIRTIO_BLK_T_OUT => RequestType::Out, VIRTIO_BLK_T_FLUSH => RequestType::Flush, VIRTIO_BLK_T_GET_ID => RequestType::GetDeviceID, + VIRTIO_BLK_T_DISCARD => RequestType::Discard, t => RequestType::Unsupported(t), } } @@ -176,6 +195,9 @@ impl PendingRequest { (Ok(transferred_data_len), RequestType::GetDeviceID) => { Status::from_data(self.data_len, transferred_data_len, true) } + (Ok(_), RequestType::Discard) => Status::Unsupported { + op: VIRTIO_BLK_T_DISCARD, + }, (_, RequestType::Unsupported(op)) => Status::Unsupported { op }, (Err(err), _) => Status::IoErr { num_bytes_to_mem: 0, @@ -390,6 +412,9 @@ impl Request { .map_err(IoErr::GetId); return ProcessingResult::Executed(pending.finish(mem, res, block_metrics)); } + RequestType::Discard => { + return ProcessingResult::Executed(pending.finish(mem, Ok(0), block_metrics)); + } RequestType::Unsupported(_) => { return ProcessingResult::Executed(pending.finish(mem, Ok(0), block_metrics)); } @@ -731,6 +756,7 @@ mod tests { RequestType::Out => VIRTIO_BLK_T_OUT, RequestType::Flush => VIRTIO_BLK_T_FLUSH, RequestType::GetDeviceID => VIRTIO_BLK_T_GET_ID, + RequestType::Discard => VIRTIO_BLK_T_DISCARD, RequestType::Unsupported(id) => id, } } @@ -743,6 +769,7 @@ RequestType::Out => VIRTQ_DESC_F_NEXT, RequestType::Flush => VIRTQ_DESC_F_NEXT, RequestType::GetDeviceID => VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, + RequestType::Discard => VIRTQ_DESC_F_NEXT, 
RequestType::Unsupported(_) => VIRTQ_DESC_F_NEXT, } } From 08fb060e57ec274c56db5de406bc199ba4180504 Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Fri, 24 Apr 2026 15:22:23 +0100 Subject: [PATCH 03/19] feat(block): add discard method to FileEngine using fallocate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add SyncIoError::Discard, SyncFileEngine::discard() and AsyncFileEngine::discard() (synchronous fallback — discard is not latency-critical and IORING_OP_FALLOCATE is not in the io_uring allowlist). Wire up FileEngine::discard() dispatching to both engines. Signed-off-by: Nikita Kalyazin --- .../virtio/block/virtio/io/async_io.rs | 20 +++++++++++++++- .../src/devices/virtio/block/virtio/io/mod.rs | 24 +++++++++++++++++++ .../devices/virtio/block/virtio/io/sync_io.rs | 20 ++++++++++++++++ src/vmm/src/io_uring/operation/mod.rs | 23 ++++++++++++++++++ 4 files changed, 86 insertions(+), 1 deletion(-) diff --git a/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs b/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs index f83d9dea1df..ac9df7ea77c 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs @@ -4,7 +4,7 @@ use std::fmt::Debug; use std::fs::File; use std::os::fd::RawFd; -use std::os::unix::io::AsRawFd; +use std::os::unix::io::AsRawFd as _; use vm_memory::GuestMemoryError; use vmm_sys_util::eventfd::EventFd; @@ -82,6 +82,7 @@ impl AsyncFileEngine { Restriction::AllowOpCode(OpCode::Read), Restriction::AllowOpCode(OpCode::Write), Restriction::AllowOpCode(OpCode::Fsync), + Restriction::AllowOpCode(OpCode::Fallocate), ], Some(completion_fd), ) @@ -119,6 +120,23 @@ impl AsyncFileEngine { &self.completion_evt } + pub fn push_discard( + &mut self, + offset: u64, + len: u64, + req: PendingRequest, + ) -> Result<(), RequestError> { + // FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE = 3 + const PUNCH_HOLE_MODE: u32 = 3; + let wrapped = 
WrappedRequest::new(req); + self.ring + .push(Operation::fallocate(0, offset, len, PUNCH_HOLE_MODE, wrapped)) + .map_err(|(io_uring_error, data)| RequestError { + req: data.req, + error: AsyncIoError::IoUring(io_uring_error), + }) + } + pub fn push_read( &mut self, offset: u64, diff --git a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs index b7aa8061d76..026447aa7f7 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs @@ -157,6 +157,30 @@ impl FileEngine { } } + pub fn discard( + &mut self, + offset: u64, + len: u64, + req: PendingRequest, + ) -> Result> { + match self { + FileEngine::Async(engine) => match engine.push_discard(offset, len, req) { + Ok(()) => Ok(FileEngineOk::Submitted), + Err(err) => Err(RequestError { + req: err.req, + error: BlockIoError::Async(err.error), + }), + }, + FileEngine::Sync(engine) => match engine.discard(offset, len) { + Ok(()) => Ok(FileEngineOk::Executed(RequestOk { req, count: 0 })), + Err(err) => Err(RequestError { + req, + error: BlockIoError::Sync(err), + }), + }, + } + } + pub fn drain(&mut self, discard: bool) -> Result<(), BlockIoError> { match self { FileEngine::Async(engine) => engine.drain(discard).map_err(BlockIoError::Async), diff --git a/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs b/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs index eec3b3d8b8d..771f17d48c5 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs @@ -3,6 +3,7 @@ use std::fs::File; use std::io::{Seek, SeekFrom, Write}; +use std::os::unix::io::AsRawFd; use vm_memory::{GuestMemoryError, ReadVolatile, WriteVolatile}; @@ -10,6 +11,8 @@ use crate::vstate::memory::{GuestAddress, GuestMemory, GuestMemoryMmap}; #[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum SyncIoError { + /// Discard: {0} + Discard(std::io::Error), /// Flush: {0} 
Flush(std::io::Error), /// Seek: {0} @@ -75,6 +78,23 @@ impl SyncFileEngine { Ok(count) } + pub fn discard(&mut self, offset: u64, len: u64) -> Result<(), SyncIoError> { + // SAFETY: self.file is a valid fd; offset and len are caller-validated. + let ret = unsafe { + libc::fallocate( + self.file.as_raw_fd(), + libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE, + i64::try_from(offset).expect("discard offset fits in i64"), + i64::try_from(len).expect("discard length fits in i64"), + ) + }; + if ret < 0 { + Err(SyncIoError::Discard(std::io::Error::last_os_error())) + } else { + Ok(()) + } + } + pub fn flush(&mut self) -> Result<(), SyncIoError> { // flush() first to force any cached data out of rust buffers. self.file.flush().map_err(SyncIoError::Flush)?; diff --git a/src/vmm/src/io_uring/operation/mod.rs b/src/vmm/src/io_uring/operation/mod.rs index ead96df5ba5..48f2a89c208 100644 --- a/src/vmm/src/io_uring/operation/mod.rs +++ b/src/vmm/src/io_uring/operation/mod.rs @@ -29,6 +29,8 @@ pub enum OpCode { Write = io_uring_op::IORING_OP_WRITE as u8, /// Fsync operation. Fsync = io_uring_op::IORING_OP_FSYNC as u8, + /// Fallocate operation. + Fallocate = io_uring_op::IORING_OP_FALLOCATE as u8, } // Useful for outputting errors. @@ -38,6 +40,7 @@ impl From for &'static str { OpCode::Read => "read", OpCode::Write => "write", OpCode::Fsync => "fsync", + OpCode::Fallocate => "fallocate", } } } @@ -100,6 +103,26 @@ impl Operation { } } + /// Construct a fallocate operation. + /// + /// For IORING_OP_FALLOCATE the kernel reads the length from sqe->addr (u64) + /// and the mode flags from sqe->len (u32), with the file offset in sqe->off. + /// The existing `addr` and `len` struct fields map to exactly those SQE slots, + /// so no changes to `into_sqe` are needed. + // Firecracker targets 64-bit only; usize == u64 so this cast never truncates. 
+ #[allow(clippy::cast_possible_truncation)] + pub fn fallocate(fd: FixedFd, offset: u64, len: u64, mode: u32, user_data: T) -> Self { + Self { + fd, + opcode: OpCode::Fallocate, + addr: Some(len as usize), // sqe->addr = fallocate length (u64) + len: Some(mode), // sqe->len = fallocate mode flags + flags: 0, + offset: Some(offset), + user_data, + } + } + /// Construct a fsync operation. pub fn fsync(fd: FixedFd, user_data: T) -> Self { Self { From cb2ef057dea33d1b5b2ddd3eccb4f118ef9f10ec Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Fri, 24 Apr 2026 15:29:49 +0100 Subject: [PATCH 04/19] style(seccomp): fix indentation and trailing whitespace in filter files Fix 2-space indented getrandom entries (should be 4 spaces like all other entries) and remove trailing whitespace from madvise comment. Signed-off-by: Nikita Kalyazin --- resources/seccomp/aarch64-unknown-linux-musl.json | 10 +++++----- resources/seccomp/x86_64-unknown-linux-musl.json | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/resources/seccomp/aarch64-unknown-linux-musl.json b/resources/seccomp/aarch64-unknown-linux-musl.json index 26dd661e46b..04c6724caa9 100644 --- a/resources/seccomp/aarch64-unknown-linux-musl.json +++ b/resources/seccomp/aarch64-unknown-linux-musl.json @@ -110,8 +110,8 @@ "comment": "sigaltstack is used by Rust stdlib to remove alternative signal stack during thread teardown." }, { - "syscall": "getrandom", - "comment": "getrandom is used by aws-lc library which we consume in virtio-rng" + "syscall": "getrandom", + "comment": "getrandom is used by aws-lc library which we consume in virtio-rng" }, { "syscall": "accept4", @@ -213,7 +213,7 @@ }, { "syscall": "madvise", - "comment": "Used by the VirtIO balloon device and by musl for some customer workloads. It is also used by aws-lc during random number generation. They setup a memory page that mark with MADV_WIPEONFORK to be able to detect forks. 
They also call it with -1 to see if madvise is supported in certain platforms." + "comment": "Used by the VirtIO balloon device and by musl for some customer workloads. It is also used by aws-lc during random number generation. They setup a memory page that mark with MADV_WIPEONFORK to be able to detect forks. They also call it with -1 to see if madvise is supported in certain platforms." }, { "syscall": "msync", @@ -544,8 +544,8 @@ "comment": "sigaltstack is used by Rust stdlib to remove alternative signal stack during thread teardown." }, { - "syscall": "getrandom", - "comment": "getrandom is used by `HttpServer` to reinialize `HashMap` after moving to the API thread" + "syscall": "getrandom", + "comment": "getrandom is used by `HttpServer` to reinialize `HashMap` after moving to the API thread" }, { "syscall": "accept4", diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json index 1eb2d83e0f2..1326a8489e6 100644 --- a/resources/seccomp/x86_64-unknown-linux-musl.json +++ b/resources/seccomp/x86_64-unknown-linux-musl.json @@ -559,8 +559,8 @@ "comment": "sigaltstack is used by Rust stdlib to remove alternative signal stack during thread teardown." }, { - "syscall": "getrandom", - "comment": "getrandom is used by `HttpServer` to reinialize `HashMap` after moving to the API thread" + "syscall": "getrandom", + "comment": "getrandom is used by `HttpServer` to reinialize `HashMap` after moving to the API thread" }, { "syscall": "accept4", From 9321e95a0bd2584a592df0eb681191520aa3a750 Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Fri, 24 Apr 2026 15:30:05 +0100 Subject: [PATCH 05/19] feat(seccomp): allow fallocate syscall in vmm thread filter VIRTIO_BLK_F_DISCARD issues fallocate(FALLOC_FL_PUNCH_HOLE) on the backing file. Add fallocate to the vmm thread seccomp filter on both x86_64 and aarch64 so the syscall is not blocked at runtime. 
Signed-off-by: Nikita Kalyazin --- resources/seccomp/aarch64-unknown-linux-musl.json | 4 ++++ resources/seccomp/x86_64-unknown-linux-musl.json | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/resources/seccomp/aarch64-unknown-linux-musl.json b/resources/seccomp/aarch64-unknown-linux-musl.json index 04c6724caa9..487c0346181 100644 --- a/resources/seccomp/aarch64-unknown-linux-musl.json +++ b/resources/seccomp/aarch64-unknown-linux-musl.json @@ -42,6 +42,10 @@ { "syscall": "fsync" }, + { + "syscall": "fallocate", + "comment": "Used by the block device for VIRTIO_BLK_F_DISCARD (FALLOC_FL_PUNCH_HOLE)" + }, { "syscall": "close" }, diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json index 1326a8489e6..988967a4e22 100644 --- a/resources/seccomp/x86_64-unknown-linux-musl.json +++ b/resources/seccomp/x86_64-unknown-linux-musl.json @@ -45,6 +45,10 @@ { "syscall": "fsync" }, + { + "syscall": "fallocate", + "comment": "Used by the block device for VIRTIO_BLK_F_DISCARD (FALLOC_FL_PUNCH_HOLE)" + }, { "syscall": "close" }, From c06070a730424503ccc2e4dc4c79c4dd94d958b5 Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Fri, 24 Apr 2026 10:48:31 +0100 Subject: [PATCH 06/19] feat(block): handle VIRTIO_BLK_T_DISCARD requests Add parse() validation: discard data descriptor must be read-only and data_len must be a non-zero multiple of DISCARD_SEGMENT_SIZE (16 bytes). Add process_discard_segments(): validates flags==0, num_sectors>0, and sector bounds per segment, then calls FileEngine::discard() using fallocate(FALLOC_FL_PUNCH_HOLE). Returns VIRTIO_BLK_S_IOERR on any validation or syscall failure; VIRTIO_BLK_S_OK on success. Add discard_count metric incremented on each successful discard. 
Signed-off-by: Nikita Kalyazin --- .../src/devices/virtio/block/virtio/device.rs | 99 +++++++++++++++-- .../virtio/block/virtio/io/async_io.rs | 8 +- .../src/devices/virtio/block/virtio/io/mod.rs | 8 ++ .../devices/virtio/block/virtio/metrics.rs | 3 + .../devices/virtio/block/virtio/request.rs | 102 ++++++++++++++++-- 5 files changed, 198 insertions(+), 22 deletions(-) diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index 3f09a19c507..638934ac076 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -59,6 +59,8 @@ pub struct DiskProperties { pub file_engine: FileEngine, pub nsectors: u64, pub image_id: [u8; VIRTIO_BLK_ID_BYTES as usize], + /// Set on first EOPNOTSUPP from fallocate; subsequent discards are no-ops. Not persisted. + pub discard_unsupported: bool, } impl DiskProperties { @@ -106,6 +108,7 @@ impl DiskProperties { .map_err(VirtioBlockError::FileEngine)?, nsectors: disk_size >> SECTOR_SHIFT, image_id, + discard_unsupported: false, }) } @@ -501,17 +504,42 @@ impl VirtioBlock { } Ok(None) => break, Ok(Some(cqe)) => { - let res = cqe.result(); - let user_data = cqe.user_data(); - - let (pending, res) = match res { - Ok(count) => (user_data, Ok(count)), - Err(error) => ( - user_data, - Err(IoErr::FileEngine(block_io::BlockIoError::Async( - async_io::AsyncIoError::IO(error), - ))), - ), + let cqe_result = cqe.result(); + let pending = cqe.user_data(); + + // io_uring CQE errors use negated errno, so EOPNOTSUPP is -95. 
+ let is_eopnotsupp = matches!( + &cqe_result, + Err(e) if e.raw_os_error() == Some(-libc::EOPNOTSUPP) + ); + if is_eopnotsupp && pending.request_type() == RequestType::Discard { + if !self.disk.discard_unsupported { + warn!( + "Block discard not supported by host filesystem; disabling discard" + ); + self.disk.discard_unsupported = true; + } + let finished = pending.finish( + &active_state.mem, + Err(IoErr::DiscardUnsupported), + &self.metrics, + ); + queue + .add_used(finished.desc_idx, finished.num_bytes_to_mem) + .unwrap_or_else(|err| { + error!( + "Failed to add available descriptor head {}: {}", + finished.desc_idx, err + ) + }); + continue; + } + + let res = match cqe_result { + Ok(count) => Ok(count), + Err(error) => Err(IoErr::FileEngine(block_io::BlockIoError::Async( + async_io::AsyncIoError::IO(error), + ))), }; let finished = pending.finish(&active_state.mem, res, &self.metrics); queue @@ -1893,4 +1921,53 @@ mod tests { assert_eq!(block.disk.image_id, id.as_slice()); } } + + #[test] + fn test_discard_unsupported_cached() { + for engine in [FileEngineType::Sync, FileEngineType::Async] { + let mut block = default_block(engine); + let mem = default_mem(); + let interrupt = default_interrupt(); + let vq = VirtQueue::new(GuestAddress(0), &mem, 16); + set_queue(&mut block, 0, vq.create_queue()); + block.activate(mem.clone(), interrupt).unwrap(); + read_blk_req_descriptors(&vq); + + let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); + let data_addr = GuestAddress(vq.dtable[1].addr.get()); + let status_addr = GuestAddress(vq.dtable[2].addr.get()); + + vq.dtable[1].flags.set(VIRTQ_DESC_F_NEXT); + vq.dtable[1].len.set(DISCARD_SEGMENT_SIZE); + + mem.write_obj::(VIRTIO_BLK_T_DISCARD, request_type_addr) + .unwrap(); + mem.write_obj( + DiscardWriteZeroes { + sector: 0, + num_sectors: 4, + flags: 0, + }, + data_addr, + ) + .unwrap(); + + // Pre-set the flag as if a previous EOPNOTSUPP was already detected. 
+ block.disk.discard_unsupported = true; + + // Neither discard_count nor invalid_reqs_count should increment. + let discard_before = block.metrics.discard_count.count(); + let invalid_before = block.metrics.invalid_reqs_count.count(); + simulate_queue_and_async_completion_events(&mut block, true); + assert_eq!(block.metrics.discard_count.count(), discard_before); + assert_eq!(block.metrics.invalid_reqs_count.count(), invalid_before); + + assert_eq!(vq.used.idx.get(), 1); + assert_eq!(vq.used.ring[0].get().len, 1); // status byte only + assert_eq!( + u32::from(mem.read_obj::(status_addr).unwrap()), + VIRTIO_BLK_S_UNSUPP + ); + } + } } diff --git a/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs b/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs index ac9df7ea77c..953144ac51d 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs @@ -130,7 +130,13 @@ impl AsyncFileEngine { const PUNCH_HOLE_MODE: u32 = 3; let wrapped = WrappedRequest::new(req); self.ring - .push(Operation::fallocate(0, offset, len, PUNCH_HOLE_MODE, wrapped)) + .push(Operation::fallocate( + 0, + offset, + len, + PUNCH_HOLE_MODE, + wrapped, + )) .map_err(|(io_uring_error, data)| RequestError { req: data.req, error: AsyncIoError::IoUring(io_uring_error), diff --git a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs index 026447aa7f7..9c4934d7628 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs @@ -40,6 +40,14 @@ impl BlockIoError { _ => false, } } + + pub fn is_eopnotsupp(&self) -> bool { + matches!( + self, + BlockIoError::Sync(SyncIoError::Discard(e)) + if e.raw_os_error() == Some(libc::EOPNOTSUPP) + ) + } } #[derive(Debug)] diff --git a/src/vmm/src/devices/virtio/block/virtio/metrics.rs b/src/vmm/src/devices/virtio/block/virtio/metrics.rs index 0abe2a58963..d1328895833 100644 --- 
a/src/vmm/src/devices/virtio/block/virtio/metrics.rs +++ b/src/vmm/src/devices/virtio/block/virtio/metrics.rs @@ -184,6 +184,8 @@ pub struct BlockDeviceMetrics { pub io_engine_throttled_events: SharedIncMetric, /// Number of remaining requests in the queue. pub remaining_reqs_count: SharedIncMetric, + /// Number of successful discard (TRIM) operations. + pub discard_count: SharedIncMetric, } impl BlockDeviceMetrics { @@ -230,6 +232,7 @@ impl BlockDeviceMetrics { .add(other.io_engine_throttled_events.fetch_diff()); self.remaining_reqs_count .add(other.remaining_reqs_count.fetch_diff()); + self.discard_count.add(other.discard_count.fetch_diff()); } } diff --git a/src/vmm/src/devices/virtio/block/virtio/request.rs b/src/vmm/src/devices/virtio/block/virtio/request.rs index e8a34ec5d2b..70960ba6269 100644 --- a/src/vmm/src/devices/virtio/block/virtio/request.rs +++ b/src/vmm/src/devices/virtio/block/virtio/request.rs @@ -18,9 +18,9 @@ pub use crate::devices::virtio::generated::virtio_blk::{ VIRTIO_BLK_T_OUT, }; use crate::devices::virtio::queue::DescriptorChain; -use crate::logger::{IncMetric, error}; +use crate::logger::{IncMetric, error, warn}; use crate::rate_limiter::{RateLimiter, TokenType}; -use crate::vstate::memory::{ByteValued, Bytes, GuestAddress, GuestMemoryMmap}; +use crate::vstate::memory::{Address, ByteValued, Bytes, GuestAddress, GuestMemoryMmap}; /// One virtio-blk discard/write_zeroes segment — virtio spec §5.2.6.14. #[derive(Debug, Default, Copy, Clone)] @@ -39,10 +39,15 @@ const _: () = assert!(std::mem::size_of::() == DISCARD_SEGME #[derive(Debug, derive_more::From)] pub enum IoErr { GetId(GuestMemoryError), - PartialTransfer { completed: u32, expected: u32 }, + PartialTransfer { + completed: u32, + expected: u32, + }, FileEngine(block_io::BlockIoError), InvalidOffset, InvalidFlags, + /// Discard not supported by the host filesystem; cached after first EOPNOTSUPP. 
+ DiscardUnsupported, } #[derive(Clone, Copy, Debug, PartialEq, Eq)] @@ -83,9 +88,18 @@ pub struct FinishedRequest { #[derive(Debug)] enum Status { - Ok { num_bytes_to_mem: u32 }, - IoErr { num_bytes_to_mem: u32, err: IoErr }, - Unsupported { op: u32 }, + Ok { + num_bytes_to_mem: u32, + }, + IoErr { + num_bytes_to_mem: u32, + err: IoErr, + }, + Unsupported { + op: u32, + }, + /// Discard silently unsupported — returns VIRTIO_BLK_S_UNSUPP without logging or metrics. + DiscardUnsupported, } impl Status { @@ -117,6 +131,10 @@ pub struct PendingRequest { } impl PendingRequest { + pub fn request_type(&self) -> RequestType { + self.r#type + } + fn write_status_and_finish( self, status: &Status, @@ -143,6 +161,7 @@ impl PendingRequest { error!("Received unsupported virtio block request: {}", op); (0, u8::try_from(VIRTIO_BLK_S_UNSUPP).unwrap()) } + Status::DiscardUnsupported => (0, u8::try_from(VIRTIO_BLK_S_UNSUPP).unwrap()), }; let num_bytes_to_mem = mem @@ -195,9 +214,13 @@ impl PendingRequest { (Ok(transferred_data_len), RequestType::GetDeviceID) => { Status::from_data(self.data_len, transferred_data_len, true) } - (Ok(_), RequestType::Discard) => Status::Unsupported { - op: VIRTIO_BLK_T_DISCARD, - }, + (Ok(_), RequestType::Discard) => { + block_metrics.discard_count.inc(); + Status::Ok { + num_bytes_to_mem: 0, + } + } + (Err(IoErr::DiscardUnsupported), _) => Status::DiscardUnsupported, (_, RequestType::Unsupported(op)) => Status::Unsupported { op }, (Err(err), _) => Status::IoErr { num_bytes_to_mem: 0, @@ -209,6 +232,29 @@ impl PendingRequest { } } +fn parse_discard_segment( + data_addr: GuestAddress, + nsectors: u64, + mem: &GuestMemoryMmap, +) -> Result<(u64, u64), IoErr> { + // max_discard_seg = 1 guarantees exactly one segment per request. 
+ let seg: DiscardWriteZeroes = mem.read_obj(data_addr).map_err(IoErr::GetId)?; + if seg.flags != 0 { + return Err(IoErr::InvalidFlags); + } + if seg.num_sectors == 0 { + return Err(IoErr::InvalidOffset); + } + seg.sector + .checked_add(u64::from(seg.num_sectors)) + .filter(|&top| top <= nsectors) + .ok_or(IoErr::InvalidOffset)?; + Ok(( + seg.sector << SECTOR_SHIFT, + u64::from(seg.num_sectors) << SECTOR_SHIFT, + )) +} + /// The request header represents the mandatory fields of each block device request. /// /// A request header contains the following fields: @@ -303,6 +349,9 @@ impl Request { if data_desc.is_write_only() && req.r#type == RequestType::Out { return Err(VirtioBlockError::UnexpectedWriteOnlyDescriptor); } + if data_desc.is_write_only() && req.r#type == RequestType::Discard { + return Err(VirtioBlockError::UnexpectedWriteOnlyDescriptor); + } if !data_desc.is_write_only() && req.r#type == RequestType::In { return Err(VirtioBlockError::UnexpectedReadOnlyDescriptor); } @@ -335,6 +384,11 @@ impl Request { return Err(VirtioBlockError::InvalidDataLength); } } + RequestType::Discard => { + if req.data_len == 0 || !req.data_len.is_multiple_of(DISCARD_SEGMENT_SIZE) { + return Err(VirtioBlockError::InvalidDataLength); + } + } _ => {} } @@ -413,7 +467,23 @@ impl Request { return ProcessingResult::Executed(pending.finish(mem, res, block_metrics)); } RequestType::Discard => { - return ProcessingResult::Executed(pending.finish(mem, Ok(0), block_metrics)); + if disk.discard_unsupported { + return ProcessingResult::Executed(pending.finish( + mem, + Err(IoErr::DiscardUnsupported), + block_metrics, + )); + } + match parse_discard_segment(self.data_addr, disk.nsectors, mem) { + Err(io_err) => { + return ProcessingResult::Executed(pending.finish( + mem, + Err(io_err), + block_metrics, + )); + } + Ok((offset, len)) => disk.file_engine.discard(offset, len, pending), + } } RequestType::Unsupported(_) => { return ProcessingResult::Executed(pending.finish(mem, Ok(0), 
block_metrics)); @@ -428,6 +498,18 @@ impl Request { Err(err) => { if err.error.is_throttling_err() { ProcessingResult::Throttled + } else if err.error.is_eopnotsupp() + && err.req.request_type() == RequestType::Discard + { + if !disk.discard_unsupported { + warn!("Block discard not supported by host filesystem; disabling discard"); + disk.discard_unsupported = true; + } + ProcessingResult::Executed(err.req.finish( + mem, + Err(IoErr::DiscardUnsupported), + block_metrics, + )) } else { ProcessingResult::Executed(err.req.finish( mem, From 4b3b62d6d6dfc3d0405815c4a17c5eb7f7127403 Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Fri, 24 Apr 2026 10:51:01 +0100 Subject: [PATCH 07/19] chore(snapshot): fix ConfigSpace restore for VIRTIO_BLK_F_DISCARD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Populate max_discard_sectors and max_discard_seg in restore() to match what new() produces for the expanded ConfigSpace. No version bump needed — main already carried 9.0.0 → 10.0.0 in this release cycle. 
Signed-off-by: Nikita Kalyazin --- src/vmm/src/devices/virtio/block/virtio/persist.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/vmm/src/devices/virtio/block/virtio/persist.rs b/src/vmm/src/devices/virtio/block/virtio/persist.rs index 05e00b1b1fa..3326caeb9c3 100644 --- a/src/vmm/src/devices/virtio/block/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/block/virtio/persist.rs @@ -111,8 +111,11 @@ impl Persist<'_> for VirtioBlock { let avail_features = state.virtio_state.avail_features; let acked_features = state.virtio_state.acked_features; + let discard_sectors = u32::try_from(disk_properties.nsectors).unwrap_or(u32::MAX); let config_space = ConfigSpace { capacity: disk_properties.nsectors.to_le(), + max_discard_sectors: if !is_read_only { discard_sectors } else { 0 }, + max_discard_seg: if !is_read_only { 1 } else { 0 }, ..Default::default() }; From 4ca982a26343b5c6b75eb83c79c2ad1863656e13 Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Fri, 24 Apr 2026 10:49:13 +0100 Subject: [PATCH 08/19] feat(block): advertise VIRTIO_BLK_F_DISCARD for non-read-only devices Set VIRTIO_BLK_F_DISCARD in avail_features for all writable block devices. Populate max_discard_sectors (capped to disk size) and max_discard_seg=1 in the config space. Read-only devices advertise neither the feature nor non-zero discard config fields. This is a behavioral change for existing writable block device configurations: guests will now see and may negotiate the discard feature. See docs/api_requests/block-discard.md for host filesystem requirements and recommendations. 
Signed-off-by: Nikita Kalyazin --- src/vmm/src/devices/virtio/block/virtio/device.rs | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index 638934ac076..b5b42985753 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -27,7 +27,7 @@ use crate::devices::virtio::block::CacheType; use crate::devices::virtio::block::virtio::metrics::{BlockDeviceMetrics, BlockMetricsPerDevice}; use crate::devices::virtio::device::{ActiveState, DeviceState, VirtioDevice}; use crate::devices::virtio::generated::virtio_blk::{ - VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO, VIRTIO_BLK_ID_BYTES, + VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO, VIRTIO_BLK_ID_BYTES, }; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::generated::virtio_ids::VIRTIO_ID_BLOCK; @@ -326,14 +326,23 @@ impl VirtioBlock { if config.is_read_only { avail_features |= 1u64 << VIRTIO_BLK_F_RO; - }; + } else { + avail_features |= 1u64 << VIRTIO_BLK_F_DISCARD; + } let queue_evts = [EventFd::new(libc::EFD_NONBLOCK).map_err(VirtioBlockError::EventFd)?]; let queues = BLOCK_QUEUE_SIZES.iter().map(|&s| Queue::new(s)).collect(); + let discard_sectors = u32::try_from(disk_properties.nsectors).unwrap_or(u32::MAX); let config_space = ConfigSpace { capacity: disk_properties.nsectors.to_le(), + max_discard_sectors: if !config.is_read_only { + discard_sectors + } else { + 0 + }, + max_discard_seg: if !config.is_read_only { 1 } else { 0 }, ..Default::default() }; From 83d35bd2d275e87d672c83d3710cff2a54953415 Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Fri, 24 Apr 2026 11:44:56 +0100 Subject: [PATCH 09/19] doc(block): document VIRTIO_BLK_F_DISCARD discard support Explain how discard works, host requirements, limitations, and snapshot compatibility impact. 
Signed-off-by: Nikita Kalyazin --- docs/api_requests/block-discard.md | 44 ++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 docs/api_requests/block-discard.md diff --git a/docs/api_requests/block-discard.md b/docs/api_requests/block-discard.md new file mode 100644 index 00000000000..792ca4fb8d1 --- /dev/null +++ b/docs/api_requests/block-discard.md @@ -0,0 +1,44 @@ +# Block device discard (TRIM) + +Firecracker supports the `VIRTIO_BLK_F_DISCARD` feature, which allows the guest +to issue discard (TRIM) requests to the block device. Discard requests tell the +host that a range of sectors is no longer needed, enabling the host to reclaim +space on sparse or thin-provisioned backing files. + +## How it works + +For all non-read-only block devices, Firecracker automatically advertises the +`VIRTIO_BLK_F_DISCARD` feature to the guest driver. No API configuration is +required — discard support is always-on for writable drives. + +When the guest driver issues a `VIRTIO_BLK_T_DISCARD` request, Firecracker calls +`fallocate(2)` with `FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE` on the backing +file for each discard segment. This punches a hole in the file, freeing the +underlying disk blocks without changing the file size. + +Guest tools that trigger discard include: + +- `fstrim -v /` — manually trim a mounted filesystem +- `discard` mount option — automatic discard on file deletion +- `blkdiscard /dev/vda` — discard the entire block device + +## Host requirements + +The backing file must reside on a filesystem and kernel that support +`FALLOC_FL_PUNCH_HOLE`. This is supported on ext4, xfs, btrfs, and tmpfs on +Linux 3.5+. On filesystems that do not support hole-punching, `fallocate` +returns `EOPNOTSUPP`. Firecracker detects this on the first discard, logs a +one-time warning, and replies to the guest with `VIRTIO_BLK_S_UNSUPP`. 
The Linux +virtio-blk driver propagates `VIRTIO_BLK_S_UNSUPP` through the block layer and +stops issuing further discard requests. Firecracker short-circuits any remaining +discard requests with `VIRTIO_BLK_S_UNSUPP` immediately — no additional +`fallocate` calls are made. + +## Limitations + +- Discard is only available for non-read-only block devices. +- At most one discard segment per request is supported (`max_discard_seg = 1`). +- The discard segment flags field must be zero; non-zero flags are rejected with + an I/O error. +- The `write_zeroes` variant of the feature (`VIRTIO_BLK_T_WRITE_ZEROES`) is not + supported. From 6de6d87c2376ce745fa7d916f3b973ba3e64fb57 Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Fri, 24 Apr 2026 11:40:59 +0100 Subject: [PATCH 10/19] test(block): add unit tests for VIRTIO_BLK_F_DISCARD Test feature flag advertisement, config space discard fields, and end-to-end discard request processing (success and error paths). Update test_virtio_features and test_virtio_read_config to account for the new VIRTIO_BLK_F_DISCARD bit and config space fields. Signed-off-by: Nikita Kalyazin --- .../src/devices/virtio/block/virtio/device.rs | 155 +++++++++++++++++- 1 file changed, 152 insertions(+), 3 deletions(-) diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index b5b42985753..c6b65d0f8c6 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -847,7 +847,10 @@ mod tests { assert_eq!(block.device_type(), VIRTIO_ID_BLOCK); - let features: u64 = (1u64 << VIRTIO_F_VERSION_1) | (1u64 << VIRTIO_RING_F_EVENT_IDX); + // default_block is non-read-only, so VIRTIO_BLK_F_DISCARD is advertised. 
+ let features: u64 = (1u64 << VIRTIO_F_VERSION_1) + | (1u64 << VIRTIO_RING_F_EVENT_IDX) + | (1u64 << VIRTIO_BLK_F_DISCARD); assert_eq!( block.avail_features_by_page(0), @@ -873,11 +876,12 @@ mod tests { let mut actual_config_space = ConfigSpace::default(); block.read_config(0, actual_config_space.as_mut_slice()); - // This will read the number of sectors. // The block's backing file size is 0x1000, so there are 8 (4096/512) sectors. - // The config space is little endian. + // default_block is non-read-only, so discard fields are populated. let expected_config_space = ConfigSpace { capacity: 8, + max_discard_sectors: 8, + max_discard_seg: 1, ..Default::default() }; assert_eq!(actual_config_space, expected_config_space); @@ -1931,6 +1935,151 @@ mod tests { } } + #[test] + fn test_discard_feature_and_config() { + // Non-read-only block: VIRTIO_BLK_F_DISCARD set, discard config fields populated. + for engine in [FileEngineType::Sync, FileEngineType::Async] { + let block = default_block(engine); + assert_ne!(block.avail_features & (1u64 << VIRTIO_BLK_F_DISCARD), 0); + assert_eq!(block.avail_features & (1u64 << VIRTIO_BLK_F_RO), 0); + // default_block has 0x1000-byte backing file → 8 sectors. + assert_eq!(block.config_space.max_discard_sectors, 8); + assert_eq!(block.config_space.max_discard_seg, 1); + } + + // Read-only block: VIRTIO_BLK_F_RO set, no discard. 
+ let f = TempFile::new().unwrap(); + f.as_file().set_len(0x1000).unwrap(); + let config = VirtioBlockConfig { + drive_id: "test_ro".to_string(), + path_on_host: f.as_path().to_str().unwrap().to_string(), + is_root_device: false, + partuuid: None, + is_read_only: true, + cache_type: CacheType::Unsafe, + rate_limiter: None, + file_engine_type: FileEngineType::default(), + }; + let ro_block = VirtioBlock::new(config).unwrap(); + assert_eq!(ro_block.avail_features & (1u64 << VIRTIO_BLK_F_DISCARD), 0); + assert_ne!(ro_block.avail_features & (1u64 << VIRTIO_BLK_F_RO), 0); + assert_eq!(ro_block.config_space.max_discard_sectors, 0); + assert_eq!(ro_block.config_space.max_discard_seg, 0); + } + + #[test] + fn test_discard() { + for engine in [FileEngineType::Sync, FileEngineType::Async] { + let mut block = default_block(engine); + let mem = default_mem(); + let interrupt = default_interrupt(); + let vq = VirtQueue::new(GuestAddress(0), &mem, 16); + set_queue(&mut block, 0, vq.create_queue()); + block.activate(mem.clone(), interrupt).unwrap(); + read_blk_req_descriptors(&vq); + + let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); + let data_addr = GuestAddress(vq.dtable[1].addr.get()); + let status_addr = GuestAddress(vq.dtable[2].addr.get()); + + // Discard data descriptor is read-only (guest sends the segment list). + vq.dtable[1].flags.set(VIRTQ_DESC_F_NEXT); + vq.dtable[1].len.set(DISCARD_SEGMENT_SIZE); + + // Valid discard: sector 0, 4 sectors, flags=0. 
+        {
+            mem.write_obj::<u32>(VIRTIO_BLK_T_DISCARD, request_type_addr)
+                .unwrap();
+            mem.write_obj(
+                DiscardWriteZeroes {
+                    sector: 0,
+                    num_sectors: 4,
+                    flags: 0,
+                },
+                data_addr,
+            )
+            .unwrap();
+
+            check_metric_after_block!(
+                &block.metrics.discard_count,
+                1,
+                simulate_queue_and_async_completion_events(&mut block, true)
+            );
+
+            assert_eq!(vq.used.idx.get(), 1);
+            assert_eq!(vq.used.ring[0].get().id, 0);
+            assert_eq!(vq.used.ring[0].get().len, 1); // status byte only
+            assert_eq!(mem.read_obj::<u32>(status_addr).unwrap(), VIRTIO_BLK_S_OK);
+        }
+
+        // Invalid: non-zero flags rejected.
+        {
+            vq.used.idx.set(0);
+            set_queue(&mut block, 0, vq.create_queue());
+            mem.write_obj::<u32>(VIRTIO_BLK_T_DISCARD, request_type_addr)
+                .unwrap();
+            mem.write_obj(
+                DiscardWriteZeroes {
+                    sector: 0,
+                    num_sectors: 4,
+                    flags: 1,
+                },
+                data_addr,
+            )
+            .unwrap();
+
+            simulate_queue_event(&mut block, Some(true));
+
+            assert_eq!(vq.used.idx.get(), 1);
+            assert_eq!(
+                u32::from(mem.read_obj::<u8>(status_addr).unwrap()),
+                VIRTIO_BLK_S_IOERR
+            );
+        }
+
+        // Invalid: sector range exceeds disk bounds.
+        {
+            vq.used.idx.set(0);
+            set_queue(&mut block, 0, vq.create_queue());
+            mem.write_obj::<u32>(VIRTIO_BLK_T_DISCARD, request_type_addr)
+                .unwrap();
+            // 8-sector disk; sector 7 + 4 = 11 > 8.
+            mem.write_obj(
+                DiscardWriteZeroes {
+                    sector: 7,
+                    num_sectors: 4,
+                    flags: 0,
+                },
+                data_addr,
+            )
+            .unwrap();
+
+            simulate_queue_event(&mut block, Some(true));
+
+            assert_eq!(vq.used.idx.get(), 1);
+            assert_eq!(
+                u32::from(mem.read_obj::<u8>(status_addr).unwrap()),
+                VIRTIO_BLK_S_IOERR
+            );
+        }
+
+        // Invalid data length: not a multiple of DISCARD_SEGMENT_SIZE → parse error.
+        {
+            vq.used.idx.set(0);
+            set_queue(&mut block, 0, vq.create_queue());
+            vq.dtable[1].len.set(DISCARD_SEGMENT_SIZE - 1);
+            mem.write_obj::<u32>(VIRTIO_BLK_T_DISCARD, request_type_addr)
+                .unwrap();
+
+            simulate_queue_event(&mut block, Some(true));
+
+            // Parse error → descriptor chain discarded (used len = 0).
+ assert_eq!(vq.used.idx.get(), 1); + assert_eq!(vq.used.ring[0].get().len, 0); + } + } + } + #[test] fn test_discard_unsupported_cached() { for engine in [FileEngineType::Sync, FileEngineType::Async] { From 58127c05253187f688dfa833c149226e6a0228c0 Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Fri, 24 Apr 2026 12:16:57 +0100 Subject: [PATCH 11/19] test(block): add pytest integration tests for VIRTIO_BLK_F_DISCARD Two tests: - test_discard: boot VM with writable ext4 disk, write+delete data, run fstrim, verify rc=0 and discard_count metric increments. - test_discard_not_advertised_for_read_only: verify read-only devices report discard_max_bytes=0 in sysfs (feature not advertised). Both parametrized over Sync and Async IO engines. Also add discard_count to the fcmetrics block_metrics list. Signed-off-by: Nikita Kalyazin --- tests/host_tools/fcmetrics.py | 1 + .../functional/test_drive_virtio.py | 64 +++++++++++++++++++ 2 files changed, 65 insertions(+) diff --git a/tests/host_tools/fcmetrics.py b/tests/host_tools/fcmetrics.py index e4f6f7e9813..ad6db45d522 100644 --- a/tests/host_tools/fcmetrics.py +++ b/tests/host_tools/fcmetrics.py @@ -95,6 +95,7 @@ def validate_fc_metrics(metrics): "rate_limiter_throttled_events", "io_engine_throttled_events", "remaining_reqs_count", + "discard_count", {"read_agg": latency_agg_metrics_fields}, {"write_agg": latency_agg_metrics_fields}, ] diff --git a/tests/integration_tests/functional/test_drive_virtio.py b/tests/integration_tests/functional/test_drive_virtio.py index 9c61ead56a9..a36e4f251d0 100644 --- a/tests/integration_tests/functional/test_drive_virtio.py +++ b/tests/integration_tests/functional/test_drive_virtio.py @@ -383,3 +383,67 @@ def _check_mount(ssh_connection, dev_path): assert stderr == "" _, _, stderr = ssh_connection.run("umount /tmp", timeout=30.0) assert stderr == "" + + +def _mount_disk(ssh): + ssh.check_output("mkdir -p /tmp/mnt") + ssh.check_output("mount /dev/vdb /tmp/mnt") + + +def 
_fill_and_trim(ssh): + ssh.check_output("dd if=/dev/zero of=/tmp/mnt/fill bs=1M count=64 conv=fsync") + ssh.check_output("rm /tmp/mnt/fill && sync") + _, stdout, _ = ssh.check_output("fstrim -v /tmp/mnt") + assert "0 B" not in stdout, f"fstrim reported no bytes trimmed: {stdout}" + + +def test_discard(uvm_plain_any, microvm_factory, io_engine): + """ + Verify VIRTIO_BLK_F_DISCARD on a fresh boot and after snapshot/restore. + """ + vm = uvm_plain_any + vm.spawn() + vm.basic_config() + vm.add_net_iface() + fs = drive_tools.FilesystemFile(os.path.join(vm.fsfiles, "discard_test"), size=256) + vm.add_drive("discard_disk", fs.path, is_read_only=False, io_engine=io_engine) + vm.start() + + _mount_disk(vm.ssh) + _fill_and_trim(vm.ssh) + st = os.stat(fs.path) + assert st.st_blocks * 512 < st.st_size, "backing file has no holes after trim" + + snapshot = vm.snapshot_full() + vm = microvm_factory.build_from_snapshot(snapshot) + # Disk is still mounted in the restored guest; write+trim again. + _fill_and_trim(vm.ssh) + st = os.stat(fs.path) + assert st.st_blocks * 512 < st.st_size, "backing file has no holes after trim post-restore" + + metrics = vm.flush_metrics() + assert metrics["block"]["discard_count"] > 0 + + +def test_discard_not_advertised_for_read_only(uvm_plain_any, io_engine): + """ + Verify VIRTIO_BLK_F_DISCARD is NOT advertised for read-only block devices. + + The kernel exposes discard support via /sys/block//queue/discard_max_bytes. + A value of 0 means the device does not support discard. 
+ """ + vm = uvm_plain_any + vm.spawn() + vm.basic_config() + vm.add_net_iface() + + fs = drive_tools.FilesystemFile(os.path.join(vm.fsfiles, "ro_disk"), size=64) + vm.add_drive("ro_disk", fs.path, is_read_only=True, io_engine=io_engine) + vm.start() + + _, stdout, _ = vm.ssh.check_output("cat /sys/block/vdb/queue/discard_max_bytes") + assert ( + stdout.strip() == "0" + ), f"Expected discard_max_bytes=0 for read-only device, got: {stdout.strip()}" + + From 4a78790c9a91db4af7a5fdf2926af3470a332188 Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Wed, 6 May 2026 14:54:48 +0100 Subject: [PATCH 12/19] refactor(block): extend ConfigSpace with write-zeroes fields Replace the 4-byte _unused1 padding at offset 52 with the virtio-blk write-zeroes fields: - max_write_zeroes_sectors (u32, offset 52) - max_write_zeroes_seg (u32, offset 56) - write_zeroes_may_unmap (u8, offset 60) - _unused1 (u8;3, offset 61, padding to 64) Bump the compile-time size assertion to 64. Fields default to zero; no behavioural change. They will be populated when VIRTIO_BLK_F_WRITE_ZEROES is wired up. 
Signed-off-by: Nikita Kalyazin

---
 src/vmm/src/devices/virtio/block/virtio/device.rs | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs
index c6b65d0f8c6..b064f7b024d 100644
--- a/src/vmm/src/devices/virtio/block/virtio/device.rs
+++ b/src/vmm/src/devices/virtio/block/virtio/device.rs
@@ -183,9 +183,12 @@ pub struct ConfigSpace {
     pub max_discard_sectors: u32,      // offset 40
     pub max_discard_seg: u32,          // offset 44
     pub discard_sector_alignment: u32, // offset 48
-    pub(crate) _unused1: [u8; 4],      // offset 52 (alignment padding to 56)
+    pub max_write_zeroes_sectors: u32, // offset 52
+    pub max_write_zeroes_seg: u32,     // offset 56
+    pub write_zeroes_may_unmap: u8,    // offset 60
+    pub(crate) _unused1: [u8; 3],      // offset 61 (padding to 64)
 }
-const _: () = assert!(std::mem::size_of::<ConfigSpace>() == 56);
+const _: () = assert!(std::mem::size_of::<ConfigSpace>() == 64);
 // SAFETY: repr(C), all POD fields, explicit padding — no implicit padding bytes.
 unsafe impl ByteValued for ConfigSpace {}

From 997e58cd0b0008479e9895395572dbfa8b6550f5 Mon Sep 17 00:00:00 2001
From: Nikita Kalyazin
Date: Wed, 6 May 2026 14:57:32 +0100
Subject: [PATCH 13/19] refactor(block): add WriteZeroes request type and
 supporting variants

Add the RequestType::WriteZeroes variant and From<u32> mapping for
VIRTIO_BLK_T_WRITE_ZEROES.

Add IoErr::WriteZeroesUnsupported and Status::WriteZeroesUnsupported
(parallel to the discard equivalents) that return VIRTIO_BLK_S_UNSUPP
silently when the host filesystem doesn't support the operation.

Add a write_zeroes_unsupported EOPNOTSUPP cache on DiskProperties (not
persisted).

Add placeholder arms in PendingRequest::finish() and Request::process()
so the code compiles. Both currently return WriteZeroesUnsupported; they
will be replaced when the full feature is wired up.

Update the test-only From<RequestType> for u32 and request_type_flags()
helpers to handle the new variant.
Signed-off-by: Nikita Kalyazin --- .../src/devices/virtio/block/virtio/device.rs | 4 ++++ .../devices/virtio/block/virtio/request.rs | 24 ++++++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index b064f7b024d..449f1f34d48 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -61,6 +61,9 @@ pub struct DiskProperties { pub image_id: [u8; VIRTIO_BLK_ID_BYTES as usize], /// Set on first EOPNOTSUPP from fallocate; subsequent discards are no-ops. Not persisted. pub discard_unsupported: bool, + /// Set on first EOPNOTSUPP from fallocate during write-zeroes; subsequent + /// write-zeroes requests are no-ops. Not persisted. + pub write_zeroes_unsupported: bool, } impl DiskProperties { @@ -109,6 +112,7 @@ impl DiskProperties { nsectors: disk_size >> SECTOR_SHIFT, image_id, discard_unsupported: false, + write_zeroes_unsupported: false, }) } diff --git a/src/vmm/src/devices/virtio/block/virtio/request.rs b/src/vmm/src/devices/virtio/block/virtio/request.rs index 70960ba6269..cb3645879a6 100644 --- a/src/vmm/src/devices/virtio/block/virtio/request.rs +++ b/src/vmm/src/devices/virtio/block/virtio/request.rs @@ -15,7 +15,7 @@ use crate::devices::virtio::block::virtio::metrics::BlockDeviceMetrics; pub use crate::devices::virtio::generated::virtio_blk::{ VIRTIO_BLK_ID_BYTES, VIRTIO_BLK_S_IOERR, VIRTIO_BLK_S_OK, VIRTIO_BLK_S_UNSUPP, VIRTIO_BLK_T_DISCARD, VIRTIO_BLK_T_FLUSH, VIRTIO_BLK_T_GET_ID, VIRTIO_BLK_T_IN, - VIRTIO_BLK_T_OUT, + VIRTIO_BLK_T_OUT, VIRTIO_BLK_T_WRITE_ZEROES, }; use crate::devices::virtio::queue::DescriptorChain; use crate::logger::{IncMetric, error, warn}; @@ -48,6 +48,8 @@ pub enum IoErr { InvalidFlags, /// Discard not supported by the host filesystem; cached after first EOPNOTSUPP. 
DiscardUnsupported, + /// Write zeroes not supported by the host filesystem; cached after first EOPNOTSUPP. + WriteZeroesUnsupported, } #[derive(Clone, Copy, Debug, PartialEq, Eq)] @@ -57,6 +59,7 @@ pub enum RequestType { Flush, GetDeviceID, Discard, + WriteZeroes, Unsupported(u32), } @@ -68,6 +71,7 @@ impl From for RequestType { VIRTIO_BLK_T_FLUSH => RequestType::Flush, VIRTIO_BLK_T_GET_ID => RequestType::GetDeviceID, VIRTIO_BLK_T_DISCARD => RequestType::Discard, + VIRTIO_BLK_T_WRITE_ZEROES => RequestType::WriteZeroes, t => RequestType::Unsupported(t), } } @@ -100,6 +104,8 @@ enum Status { }, /// Discard silently unsupported — returns VIRTIO_BLK_S_UNSUPP without logging or metrics. DiscardUnsupported, + /// Write zeroes silently unsupported — returns VIRTIO_BLK_S_UNSUPP without logging or metrics. + WriteZeroesUnsupported, } impl Status { @@ -162,6 +168,7 @@ impl PendingRequest { (0, u8::try_from(VIRTIO_BLK_S_UNSUPP).unwrap()) } Status::DiscardUnsupported => (0, u8::try_from(VIRTIO_BLK_S_UNSUPP).unwrap()), + Status::WriteZeroesUnsupported => (0, u8::try_from(VIRTIO_BLK_S_UNSUPP).unwrap()), }; let num_bytes_to_mem = mem @@ -220,7 +227,12 @@ impl PendingRequest { num_bytes_to_mem: 0, } } + (Ok(_), RequestType::WriteZeroes) => { + // Placeholder: replaced when feature is wired up. + Status::WriteZeroesUnsupported + } (Err(IoErr::DiscardUnsupported), _) => Status::DiscardUnsupported, + (Err(IoErr::WriteZeroesUnsupported), _) => Status::WriteZeroesUnsupported, (_, RequestType::Unsupported(op)) => Status::Unsupported { op }, (Err(err), _) => Status::IoErr { num_bytes_to_mem: 0, @@ -485,6 +497,14 @@ impl Request { Ok((offset, len)) => disk.file_engine.discard(offset, len, pending), } } + RequestType::WriteZeroes => { + // Placeholder: replaced when feature is wired up in a later commit. 
+ return ProcessingResult::Executed(pending.finish( + mem, + Err(IoErr::WriteZeroesUnsupported), + block_metrics, + )); + } RequestType::Unsupported(_) => { return ProcessingResult::Executed(pending.finish(mem, Ok(0), block_metrics)); } @@ -839,6 +859,7 @@ mod tests { RequestType::Flush => VIRTIO_BLK_T_FLUSH, RequestType::GetDeviceID => VIRTIO_BLK_T_GET_ID, RequestType::Discard => VIRTIO_BLK_T_DISCARD, + RequestType::WriteZeroes => VIRTIO_BLK_T_WRITE_ZEROES, RequestType::Unsupported(id) => id, } } @@ -852,6 +873,7 @@ mod tests { RequestType::Flush => VIRTQ_DESC_F_NEXT, RequestType::GetDeviceID => VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, RequestType::Discard => VIRTQ_DESC_F_NEXT, + RequestType::WriteZeroes => VIRTQ_DESC_F_NEXT, RequestType::Unsupported(_) => VIRTQ_DESC_F_NEXT, } } From 87b8fbb57419424cdab68e512483776954629a56 Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Wed, 6 May 2026 14:59:00 +0100 Subject: [PATCH 14/19] feat(block): add write_zeroes method to FileEngine using fallocate Add write_zeroes(offset, len, unmap, req) on FileEngine plus the corresponding SyncFileEngine::write_zeroes() and AsyncFileEngine::push_write_zeroes() implementations. The unmap flag selects the fallocate mode at call time: - unmap=true: FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE (mode 3) - unmap=false: FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE (mode 17) The async path uses io_uring's IORING_OP_FALLOCATE (already in the allowlist from the discard work). The sync path calls libc::fallocate. Add SyncIoError::WriteZeroes and extend BlockIoError::is_eopnotsupp() to also match SyncIoError::WriteZeroes so the EOPNOTSUPP fallback machinery (added later) catches both modes. No seccomp / io_uring restriction changes are required: fallocate is already in the seccomp filter and IORING_OP_FALLOCATE is already allowed via the discard work. 
Signed-off-by: Nikita Kalyazin

---
 .../virtio/block/virtio/io/async_io.rs        | 25 +++++++++++++++
 .../src/devices/virtio/block/virtio/io/mod.rs | 27 +++++++++++++++-
 .../devices/virtio/block/virtio/io/sync_io.rs | 31 +++++++++++++++++++
 3 files changed, 82 insertions(+), 1 deletion(-)

diff --git a/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs b/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs
index 953144ac51d..089cee883a1 100644
--- a/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs
+++ b/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs
@@ -143,6 +143,31 @@ impl AsyncFileEngine {
         })
     }

+    pub fn push_write_zeroes(
+        &mut self,
+        offset: u64,
+        len: u64,
+        unmap: bool,
+        req: PendingRequest,
+    ) -> Result<(), RequestError<AsyncIoError>> {
+        // FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE = 3
+        // FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE = 17
+        const PUNCH_HOLE_MODE: u32 = 3;
+        const ZERO_RANGE_MODE: u32 = 17;
+        let mode = if unmap {
+            PUNCH_HOLE_MODE
+        } else {
+            ZERO_RANGE_MODE
+        };
+        let wrapped = WrappedRequest::new(req);
+        self.ring
+            .push(Operation::fallocate(0, offset, len, mode, wrapped))
+            .map_err(|(io_uring_error, data)| RequestError {
+                req: data.req,
+                error: AsyncIoError::IoUring(io_uring_error),
+            })
+    }
+
     pub fn push_read(
         &mut self,
         offset: u64,
diff --git a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs
index 9c4934d7628..c87ea83cac9 100644
--- a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs
+++ b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs
@@ -44,7 +44,7 @@ impl BlockIoError {
     pub fn is_eopnotsupp(&self) -> bool {
         matches!(
             self,
-            BlockIoError::Sync(SyncIoError::Discard(e))
+            BlockIoError::Sync(SyncIoError::Discard(e) | SyncIoError::WriteZeroes(e))
             if e.raw_os_error() == Some(libc::EOPNOTSUPP)
         )
     }
@@ -189,6 +189,31 @@ impl FileEngine {
         }
     }

+    pub fn write_zeroes(
+        &mut self,
+        offset: u64,
+        len: u64,
+        unmap: bool,
+        req: PendingRequest,
+    ) -> Result<FileEngineOk, RequestError<BlockIoError>> {
+        match self {
+
FileEngine::Async(engine) => match engine.push_write_zeroes(offset, len, unmap, req) { + Ok(()) => Ok(FileEngineOk::Submitted), + Err(err) => Err(RequestError { + req: err.req, + error: BlockIoError::Async(err.error), + }), + }, + FileEngine::Sync(engine) => match engine.write_zeroes(offset, len, unmap) { + Ok(()) => Ok(FileEngineOk::Executed(RequestOk { req, count: 0 })), + Err(err) => Err(RequestError { + req, + error: BlockIoError::Sync(err), + }), + }, + } + } + pub fn drain(&mut self, discard: bool) -> Result<(), BlockIoError> { match self { FileEngine::Async(engine) => engine.drain(discard).map_err(BlockIoError::Async), diff --git a/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs b/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs index 771f17d48c5..929dd4e183d 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs @@ -21,6 +21,8 @@ pub enum SyncIoError { SyncAll(std::io::Error), /// Transfer: {0} Transfer(GuestMemoryError), + /// WriteZeroes: {0} + WriteZeroes(std::io::Error), } #[derive(Debug)] @@ -95,6 +97,35 @@ impl SyncFileEngine { } } + pub fn write_zeroes( + &mut self, + offset: u64, + len: u64, + unmap: bool, + ) -> Result<(), SyncIoError> { + // UNMAP=1 reuses PUNCH_HOLE (the spec lets the device deallocate); + // UNMAP=0 must zero in place without deallocating, so use ZERO_RANGE. + let mode = if unmap { + libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE + } else { + libc::FALLOC_FL_ZERO_RANGE | libc::FALLOC_FL_KEEP_SIZE + }; + // SAFETY: self.file is a valid fd; offset and len are caller-validated. 
+ let ret = unsafe { + libc::fallocate( + self.file.as_raw_fd(), + mode, + i64::try_from(offset).expect("write_zeroes offset fits in i64"), + i64::try_from(len).expect("write_zeroes length fits in i64"), + ) + }; + if ret < 0 { + Err(SyncIoError::WriteZeroes(std::io::Error::last_os_error())) + } else { + Ok(()) + } + } + pub fn flush(&mut self) -> Result<(), SyncIoError> { // flush() first to force any cached data out of rust buffers. self.file.flush().map_err(SyncIoError::Flush)?; From 32302edc2c7bba5019cab8ffca57c841f2107936 Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Wed, 6 May 2026 15:01:39 +0100 Subject: [PATCH 15/19] feat(block): handle VIRTIO_BLK_T_WRITE_ZEROES requests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wire up Request::parse() validation and Request::process() handling for write-zeroes requests, mirroring the discard path: - parse(): reject write-only data descriptors and require data_len to be a non-zero multiple of DISCARD_SEGMENT_SIZE (the DiscardWriteZeroes struct is shared between both ops). - parse_write_zeroes_segment(): validate flags (only bit 0, VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP, is permitted), reject num_sectors=0, check sector range, and return the unmap flag so process() can pick the fallocate mode. - process(): short-circuit to VIRTIO_BLK_S_UNSUPP if a previous EOPNOTSUPP set disk.write_zeroes_unsupported; otherwise dispatch to FileEngine::write_zeroes(). - the (Ok(_), RequestType::WriteZeroes) finish() arm now increments the write_zeroes_count metric and returns VIRTIO_BLK_S_OK. - the EOPNOTSUPP detection in the error path also handles RequestType::WriteZeroes, setting the cache and returning Status::WriteZeroesUnsupported (silent UNSUPP). Add the write_zeroes_count metric and aggregate() line. The feature flag is not yet advertised — that lands in the next commit, after the request handling is fully in place. 
Signed-off-by: Nikita Kalyazin --- .../devices/virtio/block/virtio/metrics.rs | 4 + .../devices/virtio/block/virtio/request.rs | 82 +++++++++++++++++-- 2 files changed, 77 insertions(+), 9 deletions(-) diff --git a/src/vmm/src/devices/virtio/block/virtio/metrics.rs b/src/vmm/src/devices/virtio/block/virtio/metrics.rs index d1328895833..ffd363ba559 100644 --- a/src/vmm/src/devices/virtio/block/virtio/metrics.rs +++ b/src/vmm/src/devices/virtio/block/virtio/metrics.rs @@ -186,6 +186,8 @@ pub struct BlockDeviceMetrics { pub remaining_reqs_count: SharedIncMetric, /// Number of successful discard (TRIM) operations. pub discard_count: SharedIncMetric, + /// Number of successful write-zeroes operations. + pub write_zeroes_count: SharedIncMetric, } impl BlockDeviceMetrics { @@ -233,6 +235,8 @@ impl BlockDeviceMetrics { self.remaining_reqs_count .add(other.remaining_reqs_count.fetch_diff()); self.discard_count.add(other.discard_count.fetch_diff()); + self.write_zeroes_count + .add(other.write_zeroes_count.fetch_diff()); } } diff --git a/src/vmm/src/devices/virtio/block/virtio/request.rs b/src/vmm/src/devices/virtio/block/virtio/request.rs index cb3645879a6..503d6c38d96 100644 --- a/src/vmm/src/devices/virtio/block/virtio/request.rs +++ b/src/vmm/src/devices/virtio/block/virtio/request.rs @@ -15,7 +15,7 @@ use crate::devices::virtio::block::virtio::metrics::BlockDeviceMetrics; pub use crate::devices::virtio::generated::virtio_blk::{ VIRTIO_BLK_ID_BYTES, VIRTIO_BLK_S_IOERR, VIRTIO_BLK_S_OK, VIRTIO_BLK_S_UNSUPP, VIRTIO_BLK_T_DISCARD, VIRTIO_BLK_T_FLUSH, VIRTIO_BLK_T_GET_ID, VIRTIO_BLK_T_IN, - VIRTIO_BLK_T_OUT, VIRTIO_BLK_T_WRITE_ZEROES, + VIRTIO_BLK_T_OUT, VIRTIO_BLK_T_WRITE_ZEROES, VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP, }; use crate::devices::virtio::queue::DescriptorChain; use crate::logger::{IncMetric, error, warn}; @@ -228,8 +228,10 @@ impl PendingRequest { } } (Ok(_), RequestType::WriteZeroes) => { - // Placeholder: replaced when feature is wired up. 
- Status::WriteZeroesUnsupported + block_metrics.write_zeroes_count.inc(); + Status::Ok { + num_bytes_to_mem: 0, + } } (Err(IoErr::DiscardUnsupported), _) => Status::DiscardUnsupported, (Err(IoErr::WriteZeroesUnsupported), _) => Status::WriteZeroesUnsupported, @@ -267,6 +269,32 @@ fn parse_discard_segment( )) } +fn parse_write_zeroes_segment( + data_addr: GuestAddress, + nsectors: u64, + mem: &GuestMemoryMmap, +) -> Result<(u64, u64, bool), IoErr> { + // max_write_zeroes_seg = 1 guarantees exactly one segment per request. + let seg: DiscardWriteZeroes = mem.read_obj(data_addr).map_err(IoErr::GetId)?; + // Only bit 0 (UNMAP) is defined; all other bits MUST be 0 per virtio spec + // §5.2.6.14. + if seg.flags & !VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP != 0 { + return Err(IoErr::InvalidFlags); + } + if seg.num_sectors == 0 { + return Err(IoErr::InvalidOffset); + } + seg.sector + .checked_add(u64::from(seg.num_sectors)) + .filter(|&top| top <= nsectors) + .ok_or(IoErr::InvalidOffset)?; + Ok(( + seg.sector << SECTOR_SHIFT, + u64::from(seg.num_sectors) << SECTOR_SHIFT, + seg.flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP != 0, + )) +} + /// The request header represents the mandatory fields of each block device request. 
/// /// A request header contains the following fields: @@ -364,6 +392,9 @@ impl Request { if data_desc.is_write_only() && req.r#type == RequestType::Discard { return Err(VirtioBlockError::UnexpectedWriteOnlyDescriptor); } + if data_desc.is_write_only() && req.r#type == RequestType::WriteZeroes { + return Err(VirtioBlockError::UnexpectedWriteOnlyDescriptor); + } if !data_desc.is_write_only() && req.r#type == RequestType::In { return Err(VirtioBlockError::UnexpectedReadOnlyDescriptor); } @@ -401,6 +432,11 @@ impl Request { return Err(VirtioBlockError::InvalidDataLength); } } + RequestType::WriteZeroes => { + if req.data_len == 0 || req.data_len % DISCARD_SEGMENT_SIZE != 0 { + return Err(VirtioBlockError::InvalidDataLength); + } + } _ => {} } @@ -498,12 +534,25 @@ impl Request { } } RequestType::WriteZeroes => { - // Placeholder: replaced when feature is wired up in a later commit. - return ProcessingResult::Executed(pending.finish( - mem, - Err(IoErr::WriteZeroesUnsupported), - block_metrics, - )); + if disk.write_zeroes_unsupported { + return ProcessingResult::Executed(pending.finish( + mem, + Err(IoErr::WriteZeroesUnsupported), + block_metrics, + )); + } + match parse_write_zeroes_segment(self.data_addr, disk.nsectors, mem) { + Err(io_err) => { + return ProcessingResult::Executed(pending.finish( + mem, + Err(io_err), + block_metrics, + )); + } + Ok((offset, len, unmap)) => { + disk.file_engine.write_zeroes(offset, len, unmap, pending) + } + } } RequestType::Unsupported(_) => { return ProcessingResult::Executed(pending.finish(mem, Ok(0), block_metrics)); @@ -530,6 +579,21 @@ impl Request { Err(IoErr::DiscardUnsupported), block_metrics, )) + } else if err.error.is_eopnotsupp() + && err.req.request_type() == RequestType::WriteZeroes + { + if !disk.write_zeroes_unsupported { + warn!( + "Block write_zeroes not supported by host filesystem; disabling \ + write_zeroes" + ); + disk.write_zeroes_unsupported = true; + } + ProcessingResult::Executed(err.req.finish( + mem, + 
Err(IoErr::WriteZeroesUnsupported), + block_metrics, + )) } else { ProcessingResult::Executed(err.req.finish( mem, From 5e1701743fe5bf6847851af59364267993ddd0c2 Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Wed, 6 May 2026 15:04:16 +0100 Subject: [PATCH 16/19] feat(block): advertise VIRTIO_BLK_F_WRITE_ZEROES for non-read-only devices Set VIRTIO_BLK_F_WRITE_ZEROES in avail_features for any non-read-only block device, alongside VIRTIO_BLK_F_DISCARD. Populate the max_write_zeroes_sectors, max_write_zeroes_seg, and write_zeroes_may_unmap config fields in both VirtioBlock::new() and the persist::restore() path so the values match what the guest sees on a fresh boot vs after a snapshot restore. write_zeroes_may_unmap=1 lets the guest set the UNMAP flag on individual segments, which we then translate to fallocate's PUNCH_HOLE mode (UNMAP=0 uses ZERO_RANGE). Update the test_virtio_features and test_virtio_read_config expectations to account for the new feature bit and config fields. Signed-off-by: Nikita Kalyazin --- .../src/devices/virtio/block/virtio/device.rs | 22 +++++++++++++++---- .../devices/virtio/block/virtio/persist.rs | 3 +++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index 449f1f34d48..d9eee849204 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -27,7 +27,8 @@ use crate::devices::virtio::block::CacheType; use crate::devices::virtio::block::virtio::metrics::{BlockDeviceMetrics, BlockMetricsPerDevice}; use crate::devices::virtio::device::{ActiveState, DeviceState, VirtioDevice}; use crate::devices::virtio::generated::virtio_blk::{ - VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO, VIRTIO_BLK_ID_BYTES, + VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_WRITE_ZEROES, + VIRTIO_BLK_ID_BYTES, }; use 
crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::generated::virtio_ids::VIRTIO_ID_BLOCK; @@ -335,6 +336,7 @@ impl VirtioBlock { avail_features |= 1u64 << VIRTIO_BLK_F_RO; } else { avail_features |= 1u64 << VIRTIO_BLK_F_DISCARD; + avail_features |= 1u64 << VIRTIO_BLK_F_WRITE_ZEROES; } let queue_evts = [EventFd::new(libc::EFD_NONBLOCK).map_err(VirtioBlockError::EventFd)?]; @@ -350,6 +352,13 @@ impl VirtioBlock { 0 }, max_discard_seg: if !config.is_read_only { 1 } else { 0 }, + max_write_zeroes_sectors: if !config.is_read_only { + discard_sectors + } else { + 0 + }, + max_write_zeroes_seg: if !config.is_read_only { 1 } else { 0 }, + write_zeroes_may_unmap: if !config.is_read_only { 1 } else { 0 }, ..Default::default() }; @@ -854,10 +863,12 @@ mod tests { assert_eq!(block.device_type(), VIRTIO_ID_BLOCK); - // default_block is non-read-only, so VIRTIO_BLK_F_DISCARD is advertised. + // default_block is non-read-only, so VIRTIO_BLK_F_DISCARD and + // VIRTIO_BLK_F_WRITE_ZEROES are advertised. let features: u64 = (1u64 << VIRTIO_F_VERSION_1) | (1u64 << VIRTIO_RING_F_EVENT_IDX) - | (1u64 << VIRTIO_BLK_F_DISCARD); + | (1u64 << VIRTIO_BLK_F_DISCARD) + | (1u64 << VIRTIO_BLK_F_WRITE_ZEROES); assert_eq!( block.avail_features_by_page(0), @@ -884,11 +895,14 @@ mod tests { let mut actual_config_space = ConfigSpace::default(); block.read_config(0, actual_config_space.as_mut_slice()); // The block's backing file size is 0x1000, so there are 8 (4096/512) sectors. - // default_block is non-read-only, so discard fields are populated. + // default_block is non-read-only, so discard and write-zeroes fields are populated. 
let expected_config_space = ConfigSpace { capacity: 8, max_discard_sectors: 8, max_discard_seg: 1, + max_write_zeroes_sectors: 8, + max_write_zeroes_seg: 1, + write_zeroes_may_unmap: 1, ..Default::default() }; assert_eq!(actual_config_space, expected_config_space); diff --git a/src/vmm/src/devices/virtio/block/virtio/persist.rs b/src/vmm/src/devices/virtio/block/virtio/persist.rs index 3326caeb9c3..f93f146c52e 100644 --- a/src/vmm/src/devices/virtio/block/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/block/virtio/persist.rs @@ -116,6 +116,9 @@ impl Persist<'_> for VirtioBlock { capacity: disk_properties.nsectors.to_le(), max_discard_sectors: if !is_read_only { discard_sectors } else { 0 }, max_discard_seg: if !is_read_only { 1 } else { 0 }, + max_write_zeroes_sectors: if !is_read_only { discard_sectors } else { 0 }, + max_write_zeroes_seg: if !is_read_only { 1 } else { 0 }, + write_zeroes_may_unmap: if !is_read_only { 1 } else { 0 }, ..Default::default() }; From b0e9d4a7f63c9c178284f4119edae44052dd80fe Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Wed, 6 May 2026 15:08:46 +0100 Subject: [PATCH 17/19] test(block): add unit tests for VIRTIO_BLK_F_WRITE_ZEROES MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add three tests parametrised over Sync and Async file engines: - test_write_zeroes_feature_and_config: VIRTIO_BLK_F_WRITE_ZEROES advertised on writable devices; max_write_zeroes_sectors, max_write_zeroes_seg, and write_zeroes_may_unmap populated as expected; read-only devices advertise none of these. - test_write_zeroes: end-to-end happy path (UNMAP=1 → PUNCH_HOLE, metric incremented, VIRTIO_BLK_S_OK) plus four error cases (reserved flag bit set, sector range out of bounds, num_sectors=0, invalid data length). - test_write_zeroes_unsupported_cached: pre-set the cache flag and verify subsequent requests return VIRTIO_BLK_S_UNSUPP without incrementing the success or invalid-request counters. 
The UNMAP=0 (ZERO_RANGE) happy path is not asserted in the unit tests because ZERO_RANGE support is filesystem-dependent (the host's tmpfs in some kernels returns EOPNOTSUPP); end-to-end UNMAP=0 correctness is left to the integration tests running on ext4. The unit tests cover wire-flag handling for UNMAP=0 via the invalid-flags and zero-sectors cases. Also extend test_request_type_from to assert the new VIRTIO_BLK_T_WRITE_ZEROES → RequestType::WriteZeroes mapping. Signed-off-by: Nikita Kalyazin --- .../src/devices/virtio/block/virtio/device.rs | 233 ++++++++++++++++++ .../devices/virtio/block/virtio/request.rs | 4 + 2 files changed, 237 insertions(+) diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index d9eee849204..452b53b55ca 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -2149,4 +2149,237 @@ mod tests { ); } } + + #[test] + fn test_write_zeroes_feature_and_config() { + // Non-read-only block: VIRTIO_BLK_F_WRITE_ZEROES set, write-zeroes config + // fields populated. + for engine in [FileEngineType::Sync, FileEngineType::Async] { + let block = default_block(engine); + assert_ne!( + block.avail_features & (1u64 << VIRTIO_BLK_F_WRITE_ZEROES), + 0 + ); + assert_eq!(block.avail_features & (1u64 << VIRTIO_BLK_F_RO), 0); + // default_block has 0x1000-byte backing file → 8 sectors. + assert_eq!(block.config_space.max_write_zeroes_sectors, 8); + assert_eq!(block.config_space.max_write_zeroes_seg, 1); + assert_eq!(block.config_space.write_zeroes_may_unmap, 1); + } + + // Read-only block: VIRTIO_BLK_F_RO set, no write-zeroes. 
+ let f = TempFile::new().unwrap(); + f.as_file().set_len(0x1000).unwrap(); + let config = VirtioBlockConfig { + drive_id: "test_ro_wz".to_string(), + path_on_host: f.as_path().to_str().unwrap().to_string(), + is_root_device: false, + partuuid: None, + is_read_only: true, + cache_type: CacheType::Unsafe, + rate_limiter: None, + file_engine_type: FileEngineType::default(), + }; + let ro_block = VirtioBlock::new(config).unwrap(); + assert_eq!( + ro_block.avail_features & (1u64 << VIRTIO_BLK_F_WRITE_ZEROES), + 0 + ); + assert_ne!(ro_block.avail_features & (1u64 << VIRTIO_BLK_F_RO), 0); + assert_eq!(ro_block.config_space.max_write_zeroes_sectors, 0); + assert_eq!(ro_block.config_space.max_write_zeroes_seg, 0); + assert_eq!(ro_block.config_space.write_zeroes_may_unmap, 0); + } + + #[test] + fn test_write_zeroes() { + for engine in [FileEngineType::Sync, FileEngineType::Async] { + let mut block = default_block(engine); + let mem = default_mem(); + let interrupt = default_interrupt(); + let vq = VirtQueue::new(GuestAddress(0), &mem, 16); + set_queue(&mut block, 0, vq.create_queue()); + block.activate(mem.clone(), interrupt).unwrap(); + read_blk_req_descriptors(&vq); + + let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); + let data_addr = GuestAddress(vq.dtable[1].addr.get()); + let status_addr = GuestAddress(vq.dtable[2].addr.get()); + + // Write-zeroes data descriptor is read-only (guest sends the segment list). + vq.dtable[1].flags.set(VIRTQ_DESC_F_NEXT); + vq.dtable[1].len.set(DISCARD_SEGMENT_SIZE); + + // Valid write-zeroes with UNMAP=1 (PUNCH_HOLE — supported on tmpfs). + // The UNMAP=0 (ZERO_RANGE) happy path is not asserted here because + // ZERO_RANGE support is filesystem-dependent (e.g. older tmpfs returns + // EOPNOTSUPP); the wire-flag handling for UNMAP=0 is covered by the + // invalid-flag and zero-sectors cases below, and end-to-end correctness + // is left to the integration tests on ext4. 
+ { + mem.write_obj::(VIRTIO_BLK_T_WRITE_ZEROES, request_type_addr) + .unwrap(); + mem.write_obj( + DiscardWriteZeroes { + sector: 0, + num_sectors: 4, + flags: VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP, + }, + data_addr, + ) + .unwrap(); + + check_metric_after_block!( + &block.metrics.write_zeroes_count, + 1, + simulate_queue_and_async_completion_events(&mut block, true) + ); + + assert_eq!(vq.used.idx.get(), 1); + assert_eq!(vq.used.ring[0].get().id, 0); + assert_eq!(vq.used.ring[0].get().len, 1); // status byte only + assert_eq!(mem.read_obj::(status_addr).unwrap(), VIRTIO_BLK_S_OK); + } + + // Invalid: reserved flag bit set (any bit other than UNMAP). + { + vq.used.idx.set(0); + set_queue(&mut block, 0, vq.create_queue()); + mem.write_obj::(VIRTIO_BLK_T_WRITE_ZEROES, request_type_addr) + .unwrap(); + mem.write_obj( + DiscardWriteZeroes { + sector: 0, + num_sectors: 4, + flags: 0x2, // reserved bit + }, + data_addr, + ) + .unwrap(); + + simulate_queue_event(&mut block, Some(true)); + + assert_eq!(vq.used.idx.get(), 1); + assert_eq!( + u32::from(mem.read_obj::(status_addr).unwrap()), + VIRTIO_BLK_S_IOERR + ); + } + + // Invalid: sector range exceeds disk bounds. + { + vq.used.idx.set(0); + set_queue(&mut block, 0, vq.create_queue()); + mem.write_obj::(VIRTIO_BLK_T_WRITE_ZEROES, request_type_addr) + .unwrap(); + // 8-sector disk; sector 7 + 4 = 11 > 8. + mem.write_obj( + DiscardWriteZeroes { + sector: 7, + num_sectors: 4, + flags: 0, + }, + data_addr, + ) + .unwrap(); + + simulate_queue_event(&mut block, Some(true)); + + assert_eq!(vq.used.idx.get(), 1); + assert_eq!( + u32::from(mem.read_obj::(status_addr).unwrap()), + VIRTIO_BLK_S_IOERR + ); + } + + // Invalid: num_sectors=0. 
+ { + vq.used.idx.set(0); + set_queue(&mut block, 0, vq.create_queue()); + mem.write_obj::(VIRTIO_BLK_T_WRITE_ZEROES, request_type_addr) + .unwrap(); + mem.write_obj( + DiscardWriteZeroes { + sector: 0, + num_sectors: 0, + flags: 0, + }, + data_addr, + ) + .unwrap(); + + simulate_queue_event(&mut block, Some(true)); + + assert_eq!(vq.used.idx.get(), 1); + assert_eq!( + u32::from(mem.read_obj::(status_addr).unwrap()), + VIRTIO_BLK_S_IOERR + ); + } + + // Invalid data length: not a multiple of DISCARD_SEGMENT_SIZE → parse error. + { + vq.used.idx.set(0); + set_queue(&mut block, 0, vq.create_queue()); + vq.dtable[1].len.set(DISCARD_SEGMENT_SIZE - 1); + mem.write_obj::(VIRTIO_BLK_T_WRITE_ZEROES, request_type_addr) + .unwrap(); + + simulate_queue_event(&mut block, Some(true)); + + // Parse error → descriptor chain discarded (used len = 0). + assert_eq!(vq.used.idx.get(), 1); + assert_eq!(vq.used.ring[0].get().len, 0); + } + } + } + + #[test] + fn test_write_zeroes_unsupported_cached() { + for engine in [FileEngineType::Sync, FileEngineType::Async] { + let mut block = default_block(engine); + let mem = default_mem(); + let interrupt = default_interrupt(); + let vq = VirtQueue::new(GuestAddress(0), &mem, 16); + set_queue(&mut block, 0, vq.create_queue()); + block.activate(mem.clone(), interrupt).unwrap(); + read_blk_req_descriptors(&vq); + + let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); + let data_addr = GuestAddress(vq.dtable[1].addr.get()); + let status_addr = GuestAddress(vq.dtable[2].addr.get()); + + vq.dtable[1].flags.set(VIRTQ_DESC_F_NEXT); + vq.dtable[1].len.set(DISCARD_SEGMENT_SIZE); + + mem.write_obj::(VIRTIO_BLK_T_WRITE_ZEROES, request_type_addr) + .unwrap(); + mem.write_obj( + DiscardWriteZeroes { + sector: 0, + num_sectors: 4, + flags: 0, + }, + data_addr, + ) + .unwrap(); + + // Pre-set the flag as if a previous EOPNOTSUPP was already detected. 
+ block.disk.write_zeroes_unsupported = true; + + // Neither write_zeroes_count nor invalid_reqs_count should increment. + let wz_before = block.metrics.write_zeroes_count.count(); + let invalid_before = block.metrics.invalid_reqs_count.count(); + simulate_queue_and_async_completion_events(&mut block, true); + assert_eq!(block.metrics.write_zeroes_count.count(), wz_before); + assert_eq!(block.metrics.invalid_reqs_count.count(), invalid_before); + + assert_eq!(vq.used.idx.get(), 1); + assert_eq!(vq.used.ring[0].get().len, 1); // status byte only + assert_eq!( + u32::from(mem.read_obj::(status_addr).unwrap()), + VIRTIO_BLK_S_UNSUPP + ); + } + } } diff --git a/src/vmm/src/devices/virtio/block/virtio/request.rs b/src/vmm/src/devices/virtio/block/virtio/request.rs index 503d6c38d96..ee51ab6de1f 100644 --- a/src/vmm/src/devices/virtio/block/virtio/request.rs +++ b/src/vmm/src/devices/virtio/block/virtio/request.rs @@ -666,6 +666,10 @@ mod tests { RequestType::from(VIRTIO_BLK_T_GET_ID), RequestType::GetDeviceID ); + assert_eq!( + RequestType::from(VIRTIO_BLK_T_WRITE_ZEROES), + RequestType::WriteZeroes + ); assert_eq!(RequestType::from(42), RequestType::Unsupported(42)); } From e028273282aa6298a2778a4c707f910cb3833c20 Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Wed, 6 May 2026 15:56:36 +0100 Subject: [PATCH 18/19] test(block): add pytest integration tests for VIRTIO_BLK_F_WRITE_ZEROES Two tests, parametrised over Sync and Async IO engines: - test_write_zeroes: boot VM with a writable block device, write random data to a 1 MiB region of /dev/vdb, run `blkdiscard -z` (BLKZEROOUT ioctl), then verify: * sysfs /queue/write_zeroes_max_bytes is non-zero (feature negotiated) * the region reads back as zeros after the operation * no requests failed - test_write_zeroes_not_advertised_for_read_only: read-only device reports write_zeroes_max_bytes=0 in sysfs. The test deliberately does not assert that the write_zeroes_count metric incremented. 
The Linux kernel's blkdev_issue_zeroout() may legitimately fall back to issuing plain zero-page writes (REQ_OP_WRITE) instead of REQ_OP_WRITE_ZEROES even when the feature is advertised, depending on internal heuristics. Both paths leave the device reading as zeros, so a metric assertion would be flaky in CI without indicating any actual bug. Direct WRITE_ZEROES request-handling correctness is covered by the unit tests. Use cmp -n /dev/vdb /dev/zero for the zero check (od collapses repeated lines with *, which makes naive byte-comparison fragile). Add write_zeroes_count to host_tools/fcmetrics.py so the metrics schema validation passes. Signed-off-by: Nikita Kalyazin --- tests/host_tools/fcmetrics.py | 1 + .../functional/test_drive_virtio.py | 83 +++++++++++++++++++ 2 files changed, 84 insertions(+) diff --git a/tests/host_tools/fcmetrics.py b/tests/host_tools/fcmetrics.py index ad6db45d522..04a8765e07b 100644 --- a/tests/host_tools/fcmetrics.py +++ b/tests/host_tools/fcmetrics.py @@ -96,6 +96,7 @@ def validate_fc_metrics(metrics): "io_engine_throttled_events", "remaining_reqs_count", "discard_count", + "write_zeroes_count", {"read_agg": latency_agg_metrics_fields}, {"write_agg": latency_agg_metrics_fields}, ] diff --git a/tests/integration_tests/functional/test_drive_virtio.py b/tests/integration_tests/functional/test_drive_virtio.py index a36e4f251d0..25d8f2cd421 100644 --- a/tests/integration_tests/functional/test_drive_virtio.py +++ b/tests/integration_tests/functional/test_drive_virtio.py @@ -447,3 +447,86 @@ def test_discard_not_advertised_for_read_only(uvm_plain_any, io_engine): ), f"Expected discard_max_bytes=0 for read-only device, got: {stdout.strip()}" +def _exercise_write_zeroes(ssh): + """Write random data, issue blkdiscard -z, verify zeros on /dev/vdb.""" + # Sysfs check: the kernel populates write_zeroes_max_bytes from the + # negotiated feature; a non-zero value proves the feature is advertised. 
+ _, stdout, _ = ssh.check_output( + "cat /sys/block/vdb/queue/write_zeroes_max_bytes" + ) + assert int(stdout.strip()) > 0, ( + f"Expected non-zero write_zeroes_max_bytes, got: {stdout.strip()}" + ) + # Write random non-zero data so we can tell zeroing apart from + # "the device was already zero". + ssh.check_output("dd if=/dev/urandom of=/dev/vdb bs=1M count=1 conv=fsync") + ssh.check_output("sync && echo 3 > /proc/sys/vm/drop_caches") + # Issue zero-out via blkdiscard -z (BLKZEROOUT ioctl). + ssh.check_output("blkdiscard -z --offset 0 --length $((1024*1024)) /dev/vdb") + ssh.check_output("sync && echo 3 > /proc/sys/vm/drop_caches") + # Verify the range now reads as zeros. + ssh.check_output("cmp -n 1048576 /dev/vdb /dev/zero") + + +def test_write_zeroes(uvm_plain_any, microvm_factory, io_engine): + """ + Verify VIRTIO_BLK_F_WRITE_ZEROES on a fresh boot and after snapshot/restore. + + Writes random data to a 1 MiB region of /dev/vdb, then issues + `blkdiscard -z` (BLKZEROOUT ioctl), and asserts that: + - sysfs /queue/write_zeroes_max_bytes is non-zero (feature negotiated) + - the region reads back as zeros after the operation + The same workload is then run against a snapshot-restored VM, which + catches `persist::restore()` populating the write-zeroes ConfigSpace + fields wrong (the restored guest would otherwise see a zero + write_zeroes_max_bytes and the workload would silently regress). + + Note: we deliberately do NOT assert that the `write_zeroes_count` metric + increased. The Linux kernel's `blkdev_issue_zeroout()` may issue either + `REQ_OP_WRITE_ZEROES` (counted) or fall back to plain zero-page writes + (counted as `write_count`) depending on internal heuristics; both + result in the device contents reading as zeros. Direct WRITE_ZEROES + request handling is covered by the unit tests. 
+ """ + vm = uvm_plain_any + vm.spawn() + vm.basic_config() + vm.add_net_iface() + fs = drive_tools.FilesystemFile(os.path.join(vm.fsfiles, "wz_test"), size=64) + vm.add_drive("wz_disk", fs.path, is_read_only=False, io_engine=io_engine) + vm.start() + + _exercise_write_zeroes(vm.ssh) + + snapshot = vm.snapshot_full() + vm = microvm_factory.build_from_snapshot(snapshot) + _exercise_write_zeroes(vm.ssh) + + metrics = vm.flush_metrics() + assert metrics["block"]["execute_fails"] == 0 + + +def test_write_zeroes_not_advertised_for_read_only(uvm_plain_any, io_engine): + """ + Verify VIRTIO_BLK_F_WRITE_ZEROES is NOT advertised for read-only devices. + + The kernel exposes the negotiated feature via + /sys/block//queue/write_zeroes_max_bytes; 0 means not supported. + """ + vm = uvm_plain_any + vm.spawn() + vm.basic_config() + vm.add_net_iface() + + fs = drive_tools.FilesystemFile(os.path.join(vm.fsfiles, "ro_wz_disk"), size=64) + vm.add_drive("ro_wz_disk", fs.path, is_read_only=True, io_engine=io_engine) + vm.start() + + _, stdout, _ = vm.ssh.check_output( + "cat /sys/block/vdb/queue/write_zeroes_max_bytes" + ) + assert stdout.strip() == "0", ( + f"Expected write_zeroes_max_bytes=0 for read-only device, got: {stdout.strip()}" + ) + + From d06e14448cefcbd79b18c8917d95acf5987341d5 Mon Sep 17 00:00:00 2001 From: Nikita Kalyazin Date: Wed, 6 May 2026 15:56:46 +0100 Subject: [PATCH 19/19] doc(block): document VIRTIO_BLK_F_WRITE_ZEROES support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add docs/api_requests/block-write-zeroes.md describing: - automatic advertisement on writable devices - UNMAP=0 → FALLOC_FL_ZERO_RANGE (zeros in place) - UNMAP=1 → FALLOC_FL_PUNCH_HOLE (zeros + deallocate) - host filesystem requirements - EOPNOTSUPP fallback (silent VIRTIO_BLK_S_UNSUPP, shared cache) - known limitations Remove the "write_zeroes is not supported" line from block-discard.md now that the feature is implemented. 
Signed-off-by: Nikita Kalyazin --- docs/api_requests/block-discard.md | 2 - docs/api_requests/block-write-zeroes.md | 66 +++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 2 deletions(-) create mode 100644 docs/api_requests/block-write-zeroes.md diff --git a/docs/api_requests/block-discard.md b/docs/api_requests/block-discard.md index 792ca4fb8d1..1d18ff98f7c 100644 --- a/docs/api_requests/block-discard.md +++ b/docs/api_requests/block-discard.md @@ -40,5 +40,3 @@ discard requests with `VIRTIO_BLK_S_UNSUPP` immediately — no additional - At most one discard segment per request is supported (`max_discard_seg = 1`). - The discard segment flags field must be zero; non-zero flags are rejected with an I/O error. -- The `write_zeroes` variant of the feature (`VIRTIO_BLK_T_WRITE_ZEROES`) is not - supported. diff --git a/docs/api_requests/block-write-zeroes.md b/docs/api_requests/block-write-zeroes.md new file mode 100644 index 00000000000..26dc4b5b100 --- /dev/null +++ b/docs/api_requests/block-write-zeroes.md @@ -0,0 +1,66 @@ +# Block device write-zeroes + +Firecracker supports the `VIRTIO_BLK_F_WRITE_ZEROES` feature, which allows the +guest to ask the device to zero a range of sectors without transferring a buffer +of zeros over the virtqueue. Common consumers are `mkfs` (clearing inode tables +and journals), filesystem snapshots, encrypted-volume initial wipe, and +`blkdiscard -z` / `blkzeroout` from userspace. + +## How it works + +For all non-read-only block devices, Firecracker automatically advertises the +`VIRTIO_BLK_F_WRITE_ZEROES` feature to the guest driver. No API configuration +is required — write-zeroes support is always-on for writable drives. + +Each `VIRTIO_BLK_T_WRITE_ZEROES` request carries a 16-byte segment with a +`flags` field. Bit 0 (`VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP`) tells the device +whether it may also deallocate the underlying backing-file blocks. 
Firecracker +advertises `write_zeroes_may_unmap=1`, so guests are free to set this flag. + +Firecracker translates the guest's UNMAP bit into a `fallocate(2)` mode on the +backing file: + +| UNMAP | fallocate mode | Effect | +|-------|---------------------------------------------|---------------------------------------| +| 0 | `FALLOC_FL_ZERO_RANGE \| FALLOC_FL_KEEP_SIZE` | zeros in place, no deallocation | +| 1 | `FALLOC_FL_PUNCH_HOLE \| FALLOC_FL_KEEP_SIZE` | zeros + deallocate (sparse holes) | + +The virtio spec requires that when UNMAP is clear the device MUST NOT +deallocate sectors (so `ZERO_RANGE` is mandatory for that path); when UNMAP +is set, the device MAY deallocate, and `PUNCH_HOLE` reads as zeros on every +filesystem that supports it. + +## Host requirements + +The backing file must reside on a filesystem that supports the corresponding +`fallocate` mode: + +- `FALLOC_FL_PUNCH_HOLE` (UNMAP=1) is widely supported: ext4, xfs, btrfs, tmpfs. +- `FALLOC_FL_ZERO_RANGE` (UNMAP=0) is supported on ext4, xfs, btrfs; on tmpfs + it requires Linux 6.8+. Other filesystems may not support it. + +If `fallocate` returns `EOPNOTSUPP` for either mode, Firecracker logs a one-time +warning and replies with `VIRTIO_BLK_S_UNSUPP`. The Linux virtio-blk driver +propagates that status through the block layer and stops issuing further +write-zeroes requests, so subsequent guest writes fall back to plain +`REQ_OP_WRITE` traffic. Firecracker short-circuits any in-flight write-zeroes +requests with `VIRTIO_BLK_S_UNSUPP` for the rest of the device's lifetime — no +additional `fallocate` calls are made. + +The EOPNOTSUPP cache is shared across UNMAP=0 and UNMAP=1 paths: a single +fallback flag disables both. This is conservative — a filesystem that +supports `PUNCH_HOLE` but not `ZERO_RANGE` will see UNMAP=1 requests rejected +once an UNMAP=0 request fails — but it matches the discard fallback design +and avoids subtle host-side state. 
+ +## Limitations + +- Write-zeroes is only available for non-read-only block devices. +- At most one segment per request is supported (`max_write_zeroes_seg = 1`). +- Only bit 0 (UNMAP) of the segment flags is allowed; non-zero reserved bits + are rejected with an I/O error. +- `EOPNOTSUPP` errors from the io_uring async engine are surfaced as + `VIRTIO_BLK_S_IOERR`, not silent `VIRTIO_BLK_S_UNSUPP` (the cache only + triggers for the synchronous engine, which is where the EOPNOTSUPP + detection currently lives). For async-engine devices, configure the + backing filesystem to support both modes.