diff --git a/Dockerfile b/Dockerfile index fd305791..8b69bbd5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -259,13 +259,37 @@ RUN go install github.com/go-delve/delve/cmd/dlv@latest FROM "${RUST_IMAGE}" AS libkrun-build ARG LIBKRUN_VERSION=v1.17.4 +ARG IMAGO_VERSION=0.2.1 RUN --mount=type=cache,sharing=locked,id=libkrun-aptlib,target=/var/lib/apt \ --mount=type=cache,sharing=locked,id=libkrun-aptcache,target=/var/cache/apt \ - apt-get update && apt-get install -y git libcap-ng-dev libclang-19-dev llvm make + apt-get update && apt-get install -y curl git libcap-ng-dev libclang-19-dev llvm make + +# Patch imago's VMDK driver: get_extent_at uses `disk_range.end < offset` as +# the binary-search comparator's "Less" cutoff. Range is half-open +# ([start, end)), so when `end == offset` the offset belongs to the *next* +# extent and the comparator should return Less. The published version +# returns Greater, breaking the partition assumption that binary_search_by +# requires; partition reads landing exactly on an extent boundary then +# return EOF instead of the partition data. +# +# This is what makes a GPT-partitioned VMDK with many extents fail (every +# partition's StartingLBA coincides with the preceding pad extent's end). +# Drop this patch once it lands in a published imago crate and libkrun +# rolls forward. +RUN curl -fsSL "https://crates.io/api/v1/crates/imago/${IMAGO_VERSION}/download" \ + -o /tmp/imago.tar.gz && \ + mkdir -p /opt/imago-patched && \ + tar -xzf /tmp/imago.tar.gz --strip-components=1 -C /opt/imago-patched && \ + rm /tmp/imago.tar.gz && \ + sed -i 's|extent\.disk_range\.end < offset|extent.disk_range.end <= offset|' \ + /opt/imago-patched/src/vmdk/mod.rs && \ + grep -q 'extent\.disk_range\.end <= offset' /opt/imago-patched/src/vmdk/mod.rs && \ + rm -f /opt/imago-patched/.cargo_vcs_info.json /opt/imago-patched/Cargo.toml.orig RUN git clone --depth 1 --branch ${LIBKRUN_VERSION} https://github.com/containers/libkrun.git && \ cd libkrun && \ + printf '\n[patch.crates-io]\nimago = { path = "/opt/imago-patched" }\n' >> Cargo.toml && \ make -j$(nproc) BLK=1 NET=1 FROM scratch AS libkrun diff --git a/internal/erofs/gpt.go b/internal/erofs/gpt.go new file mode 100644 index 00000000..1bad4beb --- /dev/null +++ b/internal/erofs/gpt.go @@ -0,0 +1,352 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package erofs + +import ( + "crypto/sha256" + "encoding/binary" + "errors" + "fmt" + "hash/crc32" + "io" + "os" + "unicode/utf16" +) + +// Constants describing the GPT layout used for stacked erofs layer disks. +const ( + // gptSectorSize is the LBA size used by the synthetic VMDK-backed disk. + // VMDK descriptors express extents and partition LBAs in 512-byte + // sectors, so we always stick with this size regardless of the host or + // the in-VM erofs filesystem block size. + gptSectorSize = 512 + + // gptAlignSectors aligns each partition to a 1 MiB boundary, which is + // also a multiple of the 4 KiB erofs filesystem block size. + gptAlignSectors = 2048 + + // gptPartitionEntries is the standard partition entry count. + gptPartitionEntries = 128 + + // gptPartitionEntrySize is the size of each partition entry, in bytes. + gptPartitionEntrySize = 128 + + // gptPartitionArraySectors is the number of sectors covered by the + // 128-entry partition array (128 * 128 = 16384 bytes = 32 sectors). + gptPartitionArraySectors = (gptPartitionEntries * gptPartitionEntrySize) / gptSectorSize + + // gptHeaderSize is the size of the on-disk GPT header structure. + gptHeaderSize = 92 + + // gptHeaderRevision is the GPT 1.0 revision marker (major=1, minor=0). + gptHeaderRevision uint32 = 0x00010000 + + // gptReservedSectors is the number of sectors reserved at the head of + // the disk before the first usable LBA: LBA 0 (protective MBR) + LBA 1 + // (primary header) + LBA 2..33 (primary partition entry array). + gptReservedSectors = 1 + 1 + gptPartitionArraySectors + + // gptTailSectors is the number of sectors reserved at the tail of the + // disk: secondary partition entry array (32 sectors) + secondary + // header (1 sector). + gptTailSectors = gptPartitionArraySectors + 1 + + // erofsBlockSize is the filesystem block size required by the in-VM + // kernel. Layer files must be a multiple of this size. + erofsBlockSize = 4096 +) + +var ( + // gptSignature is the "EFI PART" magic at the start of the GPT header. + gptSignature = [8]byte{'E', 'F', 'I', ' ', 'P', 'A', 'R', 'T'} + + // gptTypeLinuxFS is the type GUID for a generic Linux filesystem data + // partition (canonical 0FC63DAF-8483-4772-8E79-3D69D8477DE4) encoded + // in the mixed-endian on-disk form used by GPT. + gptTypeLinuxFS = [16]byte{ + 0xAF, 0x3D, 0xC6, 0x0F, + 0x83, 0x84, + 0x72, 0x47, + 0x8E, 0x79, + 0x3D, 0x69, 0xD8, 0x47, 0x7D, 0xE4, + } + + // errGPTPartitionLimit is returned when more partitions are requested + // than fit in a standard 128-entry GPT partition array. + errGPTPartitionLimit = errors.New("erofs: too many layers for a GPT partition table") + + // errGPTUnaligned is returned when a layer file's size is not a + // multiple of the erofs filesystem block size. + errGPTUnaligned = errors.New("erofs: layer file size is not 4 KiB aligned") +) + +// Partition describes a single GPT partition slot for an erofs layer. +type Partition struct { + GUID [16]byte + TypeGUID [16]byte + FirstLBA uint64 + LastLBA uint64 + Name string + Source string + SizeBytes uint64 +} + +// SectorCount returns the number of 512-byte sectors covered by the partition. +func (p Partition) SectorCount() uint64 { + return p.LastLBA - p.FirstLBA + 1 +} + +// Layout describes the GPT layout for a list of erofs layer files. +type Layout struct { + SectorSize uint64 + AlignSectors uint64 + TotalSectors uint64 + DiskGUID [16]byte + Partitions []Partition +} + +// FirstUsableLBA returns the first LBA available for partition data. +func (l Layout) FirstUsableLBA() uint64 { + return uint64(gptReservedSectors) +} + +// LastUsableLBA returns the last LBA available for partition data. +func (l Layout) LastUsableLBA() uint64 { + return l.TotalSectors - 1 - uint64(gptTailSectors) +} + +// ComputeLayout builds a GPT Layout for the supplied erofs layer files. +// The first path becomes partition 1, the second becomes partition 2, and +// so on. Layer file sizes are read via os.Stat. +func ComputeLayout(layerPaths []string) (Layout, error) { + if len(layerPaths) > gptPartitionEntries { + return Layout{}, fmt.Errorf("%w: %d > %d", errGPTPartitionLimit, len(layerPaths), gptPartitionEntries) + } + + sizes := make([]uint64, len(layerPaths)) + for i, p := range layerPaths { + fi, err := os.Stat(p) + if err != nil { + return Layout{}, err + } + size := uint64(fi.Size()) + if size == 0 || size%erofsBlockSize != 0 { + return Layout{}, fmt.Errorf("%w: %s size %d", errGPTUnaligned, p, size) + } + sizes[i] = size + } + + parts := make([]Partition, len(layerPaths)) + cursor := uint64(gptAlignSectors) // partition 1 starts at LBA 2048 (1 MiB) + for i, p := range layerPaths { + sectors := sizes[i] / gptSectorSize + first := cursor + last := first + sectors - 1 + parts[i] = Partition{ + GUID: derivePartGUID(p, sizes[i], i), + TypeGUID: gptTypeLinuxFS, + FirstLBA: first, + LastLBA: last, + Name: fmt.Sprintf("erofs-layer-%d", i), + Source: p, + SizeBytes: sizes[i], + } + // Next partition slot starts at the next 1 MiB boundary. + cursor = alignUp(last+1, gptAlignSectors) + } + + // The secondary GPT structures live at the end of the disk; their + // first LBA is just past the end of the last partition slot. + total := cursor + uint64(gptTailSectors) + + return Layout{ + SectorSize: gptSectorSize, + AlignSectors: gptAlignSectors, + TotalSectors: total, + DiskGUID: deriveDiskGUID(layerPaths, sizes), + Partitions: parts, + }, nil +} + +// alignUp returns v rounded up to the nearest multiple of align. +// align must be non-zero. +func alignUp(v, align uint64) uint64 { + return ((v + align - 1) / align) * align +} + +// WriteHeader writes the protective MBR + primary GPT header + primary +// partition entry array, totalling gptReservedSectors (34) sectors. +func (l Layout) WriteHeader(w io.Writer) error { + // LBA 0: protective MBR. + mbr := make([]byte, gptSectorSize) + // Single MBR partition entry at offset 446 covering the full disk and + // marking the OS type as 0xEE (GPT protective). + mbr[446+0] = 0x00 // boot indicator + mbr[446+1] = 0x00 // starting CHS head + mbr[446+2] = 0x02 // starting CHS sector (sector 2 = LBA 1) + mbr[446+3] = 0x00 // starting CHS cylinder + mbr[446+4] = 0xEE // OS type: GPT protective + mbr[446+5] = 0xFF // ending CHS head + mbr[446+6] = 0xFF // ending CHS sector + mbr[446+7] = 0xFF // ending CHS cylinder + binary.LittleEndian.PutUint32(mbr[446+8:], 1) + sizeLBA := l.TotalSectors - 1 + if sizeLBA > 0xFFFFFFFF { + sizeLBA = 0xFFFFFFFF + } + binary.LittleEndian.PutUint32(mbr[446+12:], uint32(sizeLBA)) + mbr[510] = 0x55 + mbr[511] = 0xAA + if _, err := w.Write(mbr); err != nil { + return err + } + + // Build the partition array first; its CRC feeds into the header. + arr := l.partitionArrayBytes() + + // LBA 1: primary GPT header. + hdr := l.headerBytes(true, arr) + if _, err := w.Write(hdr); err != nil { + return err + } + + // LBA 2..33: primary partition entry array. + if _, err := w.Write(arr); err != nil { + return err + } + return nil +} + +// WriteTail writes the secondary partition entry array followed by the +// secondary GPT header, totalling gptTailSectors (33) sectors. +func (l Layout) WriteTail(w io.Writer) error { + arr := l.partitionArrayBytes() + if _, err := w.Write(arr); err != nil { + return err + } + hdr := l.headerBytes(false, arr) + if _, err := w.Write(hdr); err != nil { + return err + } + return nil +} + +// partitionArrayBytes returns the serialized 32-sector partition entry +// array containing all partitions in the layout, with unused slots zeroed. +func (l Layout) partitionArrayBytes() []byte { + const arrSize = gptPartitionEntries * gptPartitionEntrySize + arr := make([]byte, arrSize) + for i, p := range l.Partitions { + off := i * gptPartitionEntrySize + copy(arr[off+0:off+16], p.TypeGUID[:]) + copy(arr[off+16:off+32], p.GUID[:]) + binary.LittleEndian.PutUint64(arr[off+32:], p.FirstLBA) + binary.LittleEndian.PutUint64(arr[off+40:], p.LastLBA) + // Attribute flags at bytes 48..55 left at zero. + writePartName(arr[off+56:off+128], p.Name) + } + return arr +} + +// headerBytes returns one sector containing a GPT header. When primary is +// true the header is the primary header (MyLBA=1); otherwise it is the +// secondary header (MyLBA=last). The supplied arr is the partition entry +// array used to compute the entry array CRC. +func (l Layout) headerBytes(primary bool, arr []byte) []byte { + hdr := make([]byte, gptSectorSize) + copy(hdr[0:8], gptSignature[:]) + binary.LittleEndian.PutUint32(hdr[8:], gptHeaderRevision) + binary.LittleEndian.PutUint32(hdr[12:], gptHeaderSize) + // hdr[16:20] is the header CRC32, filled in below. + // hdr[20:24] is reserved (must be zero). + + var myLBA, altLBA, partEntryLBA uint64 + if primary { + myLBA = 1 + altLBA = l.TotalSectors - 1 + partEntryLBA = 2 + } else { + myLBA = l.TotalSectors - 1 + altLBA = 1 + partEntryLBA = l.TotalSectors - 1 - uint64(gptPartitionArraySectors) + } + binary.LittleEndian.PutUint64(hdr[24:], myLBA) + binary.LittleEndian.PutUint64(hdr[32:], altLBA) + binary.LittleEndian.PutUint64(hdr[40:], l.FirstUsableLBA()) + binary.LittleEndian.PutUint64(hdr[48:], l.LastUsableLBA()) + copy(hdr[56:72], l.DiskGUID[:]) + binary.LittleEndian.PutUint64(hdr[72:], partEntryLBA) + binary.LittleEndian.PutUint32(hdr[80:], uint32(gptPartitionEntries)) + binary.LittleEndian.PutUint32(hdr[84:], uint32(gptPartitionEntrySize)) + binary.LittleEndian.PutUint32(hdr[88:], crc32.ChecksumIEEE(arr)) + + // Header CRC is computed over the first gptHeaderSize bytes with the + // CRC field zeroed (it currently is). + binary.LittleEndian.PutUint32(hdr[16:], crc32.ChecksumIEEE(hdr[:gptHeaderSize])) + return hdr +} + +// writePartName UTF-16LE encodes name into buf, NUL-padding the remainder. +// Truncation is silent if the encoded form does not fit. +func writePartName(buf []byte, name string) { + enc := utf16.Encode([]rune(name)) + maxUnits := len(buf) / 2 + if len(enc) > maxUnits { + enc = enc[:maxUnits] + } + for i, r := range enc { + binary.LittleEndian.PutUint16(buf[i*2:i*2+2], r) + } +} + +// deriveDiskGUID returns a deterministic disk GUID derived from the layer +// paths and sizes. The same set of layers produces the same GUID, keeping +// the synthetic VMDK reproducible across shim invocations. +func deriveDiskGUID(paths []string, sizes []uint64) [16]byte { + h := sha256.New() + io.WriteString(h, "nerdbox/erofs/disk") + for i, p := range paths { + fmt.Fprintf(h, "\n%s:%d", p, sizes[i]) + } + var guid [16]byte + copy(guid[:], h.Sum(nil)) + setRFC4122Bits(&guid) + return guid +} + +// derivePartGUID returns a deterministic partition GUID for a layer. +func derivePartGUID(path string, size uint64, index int) [16]byte { + h := sha256.New() + io.WriteString(h, "nerdbox/erofs/part") + fmt.Fprintf(h, "\n%d:%s:%d", index, path, size) + var guid [16]byte + copy(guid[:], h.Sum(nil)) + setRFC4122Bits(&guid) + return guid +} + +// setRFC4122Bits sets the version (4) and RFC 4122 variant bits in the +// supplied 16 bytes. Because GPT stores GUIDs in mixed-endian form, +// canonical UUID byte 6 (the version nibble) maps to on-disk byte 7, and +// canonical byte 8 (the variant) maps to on-disk byte 8. +// +// The kernel's GPT parser does not enforce these bits, so this is purely +// for niceness when the GUID is read back as a canonical UUID string. +func setRFC4122Bits(g *[16]byte) { + g[7] = (g[7] & 0x0F) | 0x40 // version 4 + g[8] = (g[8] & 0x3F) | 0x80 // RFC 4122 variant +} diff --git a/internal/erofs/gpt_test.go b/internal/erofs/gpt_test.go new file mode 100644 index 00000000..702f8caf --- /dev/null +++ b/internal/erofs/gpt_test.go @@ -0,0 +1,282 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package erofs + +import ( + "bytes" + "encoding/binary" + "errors" + "fmt" + "hash/crc32" + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// makeLayerFile creates a sparse zero-filled file of the requested size. +func makeLayerFile(t *testing.T, dir, name string, size int64) string { + t.Helper() + p := filepath.Join(dir, name) + f, err := os.Create(p) + require.NoError(t, err) + require.NoError(t, f.Truncate(size)) + require.NoError(t, f.Close()) + return p +} + +func TestComputeLayoutBasic(t *testing.T) { + tmp := t.TempDir() + a := makeLayerFile(t, tmp, "a.img", 64*1024*1024) // 64 MiB, aligned + b := makeLayerFile(t, tmp, "b.img", 4*1024*1024) // 4 MiB + c := makeLayerFile(t, tmp, "c.img", 8*1024) // 8 KiB (smallest 4 KiB aligned) + + l, err := ComputeLayout([]string{a, b, c}) + require.NoError(t, err) + + assert.Equal(t, uint64(gptSectorSize), l.SectorSize) + assert.Equal(t, uint64(gptAlignSectors), l.AlignSectors) + assert.Len(t, l.Partitions, 3) + + // First partition starts at LBA 2048 (1 MiB). + assert.Equal(t, uint64(2048), l.Partitions[0].FirstLBA) + // 64 MiB = 131072 sectors. + assert.Equal(t, uint64(131072), l.Partitions[0].SectorCount()) + assert.Equal(t, uint64(2048+131072-1), l.Partitions[0].LastLBA) + + // Partition 2 starts at the next 1 MiB boundary (133120 = 65 * 2048). + assert.Equal(t, uint64(2048+131072), l.Partitions[1].FirstLBA) + // 4 MiB = 8192 sectors. + assert.Equal(t, uint64(8192), l.Partitions[1].SectorCount()) + + // Partition 3 starts at the next 1 MiB boundary after partition 2 ends. + assert.Equal(t, l.Partitions[1].LastLBA+1, l.Partitions[2].FirstLBA) + // 8 KiB = 16 sectors. + assert.Equal(t, uint64(16), l.Partitions[2].SectorCount()) + + // Total disk: end of last partition (rounded to 1 MiB) + 33-sector tail. + expectedTail := alignUp(l.Partitions[2].LastLBA+1, uint64(gptAlignSectors)) + assert.Equal(t, expectedTail+uint64(gptTailSectors), l.TotalSectors) + + // Each partition gets the Linux fs data type GUID and a unique partition GUID. + for i, p := range l.Partitions { + assert.Equal(t, gptTypeLinuxFS, p.TypeGUID, "partition %d", i) + assert.NotEqual(t, [16]byte{}, p.GUID, "partition %d", i) + assert.Equal(t, fmt.Sprintf("erofs-layer-%d", i), p.Name) + } + + // Disk GUID is non-zero and partition GUIDs differ from each other. + assert.NotEqual(t, [16]byte{}, l.DiskGUID) + assert.NotEqual(t, l.Partitions[0].GUID, l.Partitions[1].GUID) + assert.NotEqual(t, l.Partitions[1].GUID, l.Partitions[2].GUID) +} + +func TestComputeLayoutDeterministic(t *testing.T) { + tmp := t.TempDir() + a := makeLayerFile(t, tmp, "a.img", 4096) + b := makeLayerFile(t, tmp, "b.img", 8192) + + l1, err := ComputeLayout([]string{a, b}) + require.NoError(t, err) + l2, err := ComputeLayout([]string{a, b}) + require.NoError(t, err) + + assert.Equal(t, l1.DiskGUID, l2.DiskGUID) + assert.Equal(t, l1.Partitions[0].GUID, l2.Partitions[0].GUID) + assert.Equal(t, l1.Partitions[1].GUID, l2.Partitions[1].GUID) +} + +func TestComputeLayoutUnaligned(t *testing.T) { + tmp := t.TempDir() + bad := makeLayerFile(t, tmp, "bad.img", 1234) // not 4 KiB aligned + + _, err := ComputeLayout([]string{bad}) + require.Error(t, err) + assert.True(t, errors.Is(err, errGPTUnaligned), "want errGPTUnaligned, got %v", err) +} + +func TestComputeLayoutEmptyFile(t *testing.T) { + tmp := t.TempDir() + zero := makeLayerFile(t, tmp, "zero.img", 0) + + _, err := ComputeLayout([]string{zero}) + require.Error(t, err) + assert.True(t, errors.Is(err, errGPTUnaligned), "want errGPTUnaligned, got %v", err) +} + +func TestComputeLayoutTooMany(t *testing.T) { + tmp := t.TempDir() + paths := make([]string, gptPartitionEntries+1) + for i := range paths { + paths[i] = makeLayerFile(t, tmp, fmt.Sprintf("l%d.img", i), 4096) + } + + _, err := ComputeLayout(paths) + require.Error(t, err) + assert.True(t, errors.Is(err, errGPTPartitionLimit), "want errGPTPartitionLimit, got %v", err) +} + +func TestWriteHeaderStructure(t *testing.T) { + tmp := t.TempDir() + a := makeLayerFile(t, tmp, "a.img", 4*1024*1024) + b := makeLayerFile(t, tmp, "b.img", 4*1024*1024) + + l, err := ComputeLayout([]string{a, b}) + require.NoError(t, err) + + var buf bytes.Buffer + require.NoError(t, l.WriteHeader(&buf)) + require.Equal(t, gptReservedSectors*gptSectorSize, buf.Len(), + "header blob must cover %d sectors", gptReservedSectors) + + data := buf.Bytes() + + // LBA 0: protective MBR. + mbr := data[:gptSectorSize] + assert.Equal(t, byte(0xEE), mbr[446+4], "MBR partition type must be 0xEE") + assert.Equal(t, uint32(1), binary.LittleEndian.Uint32(mbr[446+8:]), + "MBR starting LBA must be 1") + expectedSize := uint32(l.TotalSectors - 1) + if l.TotalSectors-1 > 0xFFFFFFFF { + expectedSize = 0xFFFFFFFF + } + assert.Equal(t, expectedSize, binary.LittleEndian.Uint32(mbr[446+12:]), + "MBR size in LBA must be totalSectors - 1 (capped at 0xFFFFFFFF)") + assert.Equal(t, byte(0x55), mbr[510]) + assert.Equal(t, byte(0xAA), mbr[511]) + + // LBA 1: primary GPT header. + hdr := data[gptSectorSize : 2*gptSectorSize] + assert.Equal(t, gptSignature[:], hdr[0:8]) + assert.Equal(t, gptHeaderRevision, binary.LittleEndian.Uint32(hdr[8:])) + assert.Equal(t, uint32(gptHeaderSize), binary.LittleEndian.Uint32(hdr[12:])) + assert.Equal(t, uint64(1), binary.LittleEndian.Uint64(hdr[24:]), "MyLBA") + assert.Equal(t, l.TotalSectors-1, binary.LittleEndian.Uint64(hdr[32:]), "AlternateLBA") + assert.Equal(t, uint64(gptReservedSectors), binary.LittleEndian.Uint64(hdr[40:]), "FirstUsableLBA") + assert.Equal(t, l.TotalSectors-1-uint64(gptTailSectors), + binary.LittleEndian.Uint64(hdr[48:]), "LastUsableLBA") + assert.Equal(t, l.DiskGUID[:], hdr[56:72]) + assert.Equal(t, uint64(2), binary.LittleEndian.Uint64(hdr[72:]), "PartitionEntryLBA") + assert.Equal(t, uint32(gptPartitionEntries), binary.LittleEndian.Uint32(hdr[80:])) + assert.Equal(t, uint32(gptPartitionEntrySize), binary.LittleEndian.Uint32(hdr[84:])) + + // Header CRC validates: zero out the CRC field and recompute. + hdrCheck := append([]byte(nil), hdr[:gptHeaderSize]...) + storedHeaderCRC := binary.LittleEndian.Uint32(hdrCheck[16:]) + binary.LittleEndian.PutUint32(hdrCheck[16:], 0) + assert.Equal(t, crc32.ChecksumIEEE(hdrCheck), storedHeaderCRC, "header CRC must validate") + + // LBA 2..33: partition entry array. + arr := data[2*gptSectorSize:] + require.Equal(t, gptPartitionArraySectors*gptSectorSize, len(arr)) + storedArrCRC := binary.LittleEndian.Uint32(hdr[88:]) + assert.Equal(t, crc32.ChecksumIEEE(arr), storedArrCRC, "partition array CRC must validate") + + // Partition entries: type GUID, partition GUID, first/last LBA. + for i, p := range l.Partitions { + off := i * gptPartitionEntrySize + assert.Equal(t, p.TypeGUID[:], arr[off:off+16], "part %d type GUID", i) + assert.Equal(t, p.GUID[:], arr[off+16:off+32], "part %d GUID", i) + assert.Equal(t, p.FirstLBA, binary.LittleEndian.Uint64(arr[off+32:]), + "part %d first LBA", i) + assert.Equal(t, p.LastLBA, binary.LittleEndian.Uint64(arr[off+40:]), + "part %d last LBA", i) + } + + // Unused entries (after the last partition) must be all zero. + for i := len(l.Partitions); i < gptPartitionEntries; i++ { + off := i * gptPartitionEntrySize + entry := arr[off : off+gptPartitionEntrySize] + var zero [gptPartitionEntrySize]byte + assert.Equal(t, zero[:], entry, "unused partition entry %d must be zeroed", i) + } +} + +func TestWriteTailStructure(t *testing.T) { + tmp := t.TempDir() + a := makeLayerFile(t, tmp, "a.img", 4*1024*1024) + + l, err := ComputeLayout([]string{a}) + require.NoError(t, err) + + var buf bytes.Buffer + require.NoError(t, l.WriteTail(&buf)) + require.Equal(t, gptTailSectors*gptSectorSize, buf.Len(), + "tail blob must cover %d sectors", gptTailSectors) + + data := buf.Bytes() + + // First 32 sectors: secondary partition entry array (must match primary). + secArr := data[:gptPartitionArraySectors*gptSectorSize] + + // Last sector: secondary GPT header. + hdr := data[gptPartitionArraySectors*gptSectorSize:] + assert.Equal(t, gptSignature[:], hdr[0:8]) + assert.Equal(t, l.TotalSectors-1, binary.LittleEndian.Uint64(hdr[24:]), "MyLBA = last LBA") + assert.Equal(t, uint64(1), binary.LittleEndian.Uint64(hdr[32:]), "AlternateLBA = 1") + expectedPartEntryLBA := l.TotalSectors - 1 - uint64(gptPartitionArraySectors) + assert.Equal(t, expectedPartEntryLBA, binary.LittleEndian.Uint64(hdr[72:]), + "secondary PartitionEntryLBA") + + // Header CRC validates. + hdrCheck := append([]byte(nil), hdr[:gptHeaderSize]...) + storedHeaderCRC := binary.LittleEndian.Uint32(hdrCheck[16:]) + binary.LittleEndian.PutUint32(hdrCheck[16:], 0) + assert.Equal(t, crc32.ChecksumIEEE(hdrCheck), storedHeaderCRC, + "secondary header CRC must validate") + + // Partition array CRC matches the array we just wrote. + storedArrCRC := binary.LittleEndian.Uint32(hdr[88:]) + assert.Equal(t, crc32.ChecksumIEEE(secArr), storedArrCRC, + "secondary partition array CRC must validate") +} + +func TestPrimaryAndSecondaryArrayCRCsMatch(t *testing.T) { + tmp := t.TempDir() + paths := []string{ + makeLayerFile(t, tmp, "a.img", 4096), + makeLayerFile(t, tmp, "b.img", 8192), + makeLayerFile(t, tmp, "c.img", 4096), + } + l, err := ComputeLayout(paths) + require.NoError(t, err) + + var primaryBuf, tailBuf bytes.Buffer + require.NoError(t, l.WriteHeader(&primaryBuf)) + require.NoError(t, l.WriteTail(&tailBuf)) + + // Primary array is the third sector onwards. + primaryArr := primaryBuf.Bytes()[2*gptSectorSize:] + // Secondary array is the first 32 sectors of the tail. + secondaryArr := tailBuf.Bytes()[:gptPartitionArraySectors*gptSectorSize] + + assert.Equal(t, primaryArr, secondaryArr, "primary and secondary arrays must match byte-for-byte") +} + +func TestLinuxFSGUIDEncoding(t *testing.T) { + // Canonical 0FC63DAF-8483-4772-8E79-3D69D8477DE4 in mixed-endian on-disk form. + want := [16]byte{ + 0xAF, 0x3D, 0xC6, 0x0F, + 0x83, 0x84, + 0x72, 0x47, + 0x8E, 0x79, + 0x3D, 0x69, 0xD8, 0x47, 0x7D, 0xE4, + } + assert.Equal(t, want, gptTypeLinuxFS) +} diff --git a/internal/erofs/no_copy_check_test.go b/internal/erofs/no_copy_check_test.go new file mode 100644 index 00000000..716ff35a --- /dev/null +++ b/internal/erofs/no_copy_check_test.go @@ -0,0 +1,108 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package erofs + +import ( + "fmt" + "os" + "path/filepath" + "strings" + "syscall" + "testing" +) + +// TestNoLayerDataCopy is a manual / informational test: it builds a GPT +// VMDK from 30 sparse 64-MiB layer files and reports the actual disk usage +// of every file produced. It demonstrates that the auxiliary files are +// tiny (a few KiB) and independent of layer count; layer data is referenced +// in place rather than copied. +// +// Run with: go test -run TestNoLayerDataCopy -v ./internal/erofs/ +func TestNoLayerDataCopy(t *testing.T) { + const numLayers = 30 + const layerSize = 64 * 1024 * 1024 // 64 MiB sparse per layer + + tmp := t.TempDir() + var devices []string + for i := 0; i < numLayers; i++ { + p := filepath.Join(tmp, fmt.Sprintf("layer-%03d.erofs", i)) + f, err := os.Create(p) + if err != nil { + t.Fatal(err) + } + // Sparse: Truncate sets the file size without allocating blocks. + if err := f.Truncate(layerSize); err != nil { + t.Fatal(err) + } + f.Close() + devices = append(devices, p) + } + + vmdkPath := filepath.Join(tmp, "merged_fs_gpt.vmdk") + if err := DumpGPTVMDKDescriptorToFile(vmdkPath, 0xfffffffe, devices); err != nil { + t.Fatal(err) + } + + entries, err := os.ReadDir(tmp) + if err != nil { + t.Fatal(err) + } + + var totalLogical, totalActual, auxLogical, auxActual int64 + t.Logf("%-32s %14s %14s", "file", "size (B)", "actual (B)") + for _, e := range entries { + fi, err := e.Info() + if err != nil { + t.Fatal(err) + } + stat, ok := fi.Sys().(*syscall.Stat_t) + if !ok { + t.Fatalf("Stat_t unavailable") + } + actual := stat.Blocks * 512 + isAux := strings.HasPrefix(e.Name(), "merged_fs_gpt") || + !strings.HasSuffix(e.Name(), ".erofs") + marker := " " + if isAux { + marker = " (aux)" + } + t.Logf("%-32s %14d %14d%s", e.Name(), fi.Size(), actual, marker) + totalLogical += fi.Size() + totalActual += actual + if isAux { + auxLogical += fi.Size() + auxActual += actual + } + } + + t.Logf("--") + t.Logf("layers: %d", numLayers) + t.Logf("logical sum: %d B (%d MiB)", totalLogical, totalLogical/1024/1024) + t.Logf("actual sum: %d B (%d KiB)", totalActual, totalActual/1024) + t.Logf("aux logical: %d B", auxLogical) + t.Logf("aux actual: %d B (%d KiB)", auxActual, auxActual/1024) + + // Sanity check: the auxiliary files combined must not store anything + // close to a layer's worth of data. For 64 MiB layers a copy would + // blow this up by ~64 MiB per layer; a no-copy implementation keeps + // auxiliary disk usage in the tens-of-KiB range. + const maxAuxKiB = 1024 // 1 MiB; in practice we expect ~33 KiB + if auxActual/1024 > maxAuxKiB { + t.Fatalf("auxiliary files use %d KiB of disk; expected < %d KiB (no-copy invariant violated)", + auxActual/1024, maxAuxKiB) + } +} diff --git a/internal/erofs/vmdk.go b/internal/erofs/vmdk.go index ac959d7d..df71999a 100644 --- a/internal/erofs/vmdk.go +++ b/internal/erofs/vmdk.go @@ -20,6 +20,8 @@ import ( "fmt" "io" "os" + "path/filepath" + "strings" ) const ( @@ -47,6 +49,22 @@ func vmdkDescAddExtent(w io.Writer, sectors uint64, filename string, offset uint return nil } +// vmdkDescAddZeroExtent writes "RW ZERO" extent lines to the writer, +// splitting at the 2 GiB extent boundary. ZERO extents represent zero-filled +// regions of the virtual disk without referencing a backing file. +func vmdkDescAddZeroExtent(w io.Writer, sectors uint64) error { + for sectors > 0 { + count := min(sectors, max2GbExtentSectors) + + _, err := fmt.Fprintf(w, "RW %d ZERO\n", count) + if err != nil { + return err + } + sectors -= count + } + return nil +} + func DumpVMDKDescriptor(w io.Writer, cid uint32, devices []string) error { parentCID := uint32(0xffffffff) @@ -104,3 +122,171 @@ func DumpVMDKDescriptorToFile(vmdkdesc string, cid uint32, devices []string) err f.Close() return err } + +// gptUseZeroExtents controls whether DumpGPTVMDKDescriptorToFile emits +// "RW ZERO" extents (preferred) or generates a shared zero-filled +// padding file referenced via FLAT extents. +// +// libkrun's twoGbMaxExtentFlat parser is expected to support ZERO extents; +// flip this default if a particular libkrun build does not. +var gptUseZeroExtents = true + +// DumpGPTVMDKDescriptor writes a VMDK descriptor whose virtual disk is a +// GPT-partitioned image with one partition per layout.Partitions entry. +// +// headerPath should reference a 34-sector blob produced by Layout.WriteHeader +// covering LBAs 0..33 (protective MBR, primary GPT header, primary entry array). +// tailPath should reference a 33-sector blob produced by Layout.WriteTail +// covering the final 33 sectors of the disk. +// +// When padFile is empty, padding regions are emitted as "RW ZERO" +// extents. Otherwise padFile must be a zero-filled file of at least max +// padding-region size; padding regions are emitted as FLAT extents reading +// from offset 0 of padFile. +func DumpGPTVMDKDescriptor(w io.Writer, cid uint32, layout Layout, headerPath, tailPath, padFile string) error { + parentCID := uint32(0xffffffff) + + if _, err := fmt.Fprintf(w, `# Disk DescriptorFile +version=1 +CID=%08x +parentCID=%08x +createType="%s" + +# Extent description +`, cid, parentCID, subformat); err != nil { + return err + } + + emitPadding := func(sectors uint64) error { + if sectors == 0 { + return nil + } + if padFile == "" { + return vmdkDescAddZeroExtent(w, sectors) + } + // FLAT extent reading zeros from offset 0 of the shared pad file. + // vmdkDescAddExtent splits at max2GbExtentSectors automatically; + // each split line still reads from offset 0 of the same file, which + // is correct because the entire pad file is zero-filled. + return vmdkDescAddExtent(w, sectors, padFile, 0) + } + + // LBA 0..33: protective MBR + primary GPT header + primary entry array. + if err := vmdkDescAddExtent(w, uint64(gptReservedSectors), headerPath, 0); err != nil { + return err + } + + prevEnd := uint64(gptReservedSectors) + for _, p := range layout.Partitions { + if p.FirstLBA > prevEnd { + if err := emitPadding(p.FirstLBA - prevEnd); err != nil { + return err + } + } + if err := vmdkDescAddExtent(w, p.SectorCount(), p.Source, 0); err != nil { + return err + } + prevEnd = p.LastLBA + 1 + } + + // Padding between the end of the last partition and the secondary + // partition entry array. + tailStart := layout.TotalSectors - uint64(gptTailSectors) + if tailStart > prevEnd { + if err := emitPadding(tailStart - prevEnd); err != nil { + return err + } + } + + // Last 33 sectors: secondary partition entry array + secondary header. + if err := vmdkDescAddExtent(w, uint64(gptTailSectors), tailPath, 0); err != nil { + return err + } + + cylinders := (layout.TotalSectors + sectorsPerTrack*numberHeads - 1) / (sectorsPerTrack * numberHeads) + _, err := fmt.Fprintf(w, ` + +# The Disk Data Base +#DDB + +ddb.virtualHWVersion = "%s" +ddb.geometry.cylinders = "%d" +ddb.geometry.heads = "%d" +ddb.geometry.sectors = "63" +ddb.adapterType = "%s" +`, hwVersion, cylinders, numberHeads, adapterType) + return err +} + +// DumpGPTVMDKDescriptorToFile generates a GPT-partitioned VMDK at vmdkPath +// that references the supplied erofs layer files as separate partitions on +// a single virtual disk. The header blob, tail blob, and (when ZERO extents +// are not in use) shared padding file are written next to vmdkPath. +// +// The vmdk filename's extension is replaced with "_header.bin", "_tail.bin", +// and "_pad.bin" suffixes to derive the auxiliary filenames; e.g. for +// "/x/merged_fs_gpt.vmdk" the auxiliary files are "/x/merged_fs_gpt_header.bin", +// "/x/merged_fs_gpt_tail.bin", and (if applicable) "/x/merged_fs_gpt_pad.bin". +func DumpGPTVMDKDescriptorToFile(vmdkPath string, cid uint32, devices []string) error { + layout, err := ComputeLayout(devices) + if err != nil { + return err + } + + dir := filepath.Dir(vmdkPath) + base := strings.TrimSuffix(filepath.Base(vmdkPath), filepath.Ext(vmdkPath)) + headerPath := filepath.Join(dir, base+"_header.bin") + tailPath := filepath.Join(dir, base+"_tail.bin") + + if err := writeBlob(headerPath, layout.WriteHeader); err != nil { + return err + } + if err := writeBlob(tailPath, layout.WriteTail); err != nil { + return err + } + + var padFile string + if !gptUseZeroExtents { + // The maximum single padding region is bounded by gptAlignSectors + // (1 MiB minus a sector); a 1 MiB pad file always suffices. + padFile = filepath.Join(dir, base+"_pad.bin") + if err := writePadFile(padFile, gptAlignSectors*gptSectorSize); err != nil { + return err + } + } + + f, err := os.Create(vmdkPath) + if err != nil { + return err + } + if err := DumpGPTVMDKDescriptor(f, cid, layout, headerPath, tailPath, padFile); err != nil { + f.Close() + return err + } + return f.Close() +} + +func writeBlob(path string, fn func(io.Writer) error) error { + f, err := os.Create(path) + if err != nil { + return err + } + if err := fn(f); err != nil { + f.Close() + return err + } + return f.Close() +} + +// writePadFile creates a sparse zero-filled file of the requested size. +func writePadFile(path string, size uint64) error { + f, err := os.Create(path) + if err != nil { + return err + } + if err := f.Truncate(int64(size)); err != nil { + f.Close() + return err + } + return f.Close() +} diff --git a/internal/erofs/vmdk_gpt_test.go b/internal/erofs/vmdk_gpt_test.go new file mode 100644 index 00000000..bae994de --- /dev/null +++ b/internal/erofs/vmdk_gpt_test.go @@ -0,0 +1,286 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package erofs + +import ( + "bytes" + "fmt" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// extentLine is a parsed `RW [FLAT "" | ZERO]` line. +type extentLine struct { + count uint64 + kind string // "FLAT" or "ZERO" + filename string + offset uint64 +} + +// parseDescriptor extracts extent lines and computes the cumulative sector total. +func parseDescriptor(t *testing.T, raw string) []extentLine { + t.Helper() + var extents []extentLine + for _, line := range strings.Split(raw, "\n") { + line = strings.TrimSpace(line) + if !strings.HasPrefix(line, "RW ") { + continue + } + var ( + count uint64 + kind string + fname string + offset uint64 + ) + // Two formats: `RW N FLAT "file" off` or `RW N ZERO`. + flatN, _ := fmt.Sscanf(line, "RW %d FLAT %q %d", &count, &fname, &offset) + if flatN == 3 { + kind = "FLAT" + extents = append(extents, extentLine{count: count, kind: kind, filename: fname, offset: offset}) + continue + } + zeroN, _ := fmt.Sscanf(line, "RW %d ZERO", &count) + if zeroN == 1 { + extents = append(extents, extentLine{count: count, kind: "ZERO"}) + continue + } + t.Fatalf("could not parse extent line: %q", line) + } + return extents +} + +func TestDumpGPTVMDKDescriptorZeroExtents(t *testing.T) { + tmp := t.TempDir() + + a := makeLayerFile(t, tmp, "a.img", 4*1024*1024) // 4 MiB + b := makeLayerFile(t, tmp, "b.img", 4*1024*1024) + c := makeLayerFile(t, tmp, "c.img", 4*1024*1024) + + devices := []string{a, b, c} + layout, err := ComputeLayout(devices) + require.NoError(t, err) + + headerPath := filepath.Join(tmp, "h.bin") + tailPath := filepath.Join(tmp, "t.bin") + require.NoError(t, writeBlob(headerPath, layout.WriteHeader)) + require.NoError(t, writeBlob(tailPath, layout.WriteTail)) + + var buf bytes.Buffer + require.NoError(t, DumpGPTVMDKDescriptor(&buf, 0xfffffffe, layout, headerPath, tailPath, "")) + + desc := buf.String() + assert.Contains(t, desc, `createType="twoGbMaxExtentFlat"`) + assert.Contains(t, desc, "ddb.adapterType") + + extents := parseDescriptor(t, desc) + require.NotEmpty(t, extents) + + // First extent: gptReservedSectors of header file. + assert.Equal(t, "FLAT", extents[0].kind) + assert.Equal(t, headerPath, extents[0].filename) + assert.Equal(t, uint64(gptReservedSectors), extents[0].count) + assert.Equal(t, uint64(0), extents[0].offset) + + // Sum of sector counts equals layout.TotalSectors. + var total uint64 + for _, e := range extents { + total += e.count + } + assert.Equal(t, layout.TotalSectors, total, + "sum of extent counts must equal total sectors") + + // Last extent: gptTailSectors of tail file. + last := extents[len(extents)-1] + assert.Equal(t, "FLAT", last.kind) + assert.Equal(t, tailPath, last.filename) + assert.Equal(t, uint64(gptTailSectors), last.count) + + // At least one ZERO extent exists for padding (alignment skipped sectors). + var sawZero bool + for _, e := range extents { + if e.kind == "ZERO" { + sawZero = true + break + } + } + assert.True(t, sawZero, "expected at least one ZERO extent for alignment padding") + + // Each layer must appear as a FLAT extent of the right size at offset 0. + for i, dev := range devices { + var found bool + want := layout.Partitions[i].SectorCount() + for _, e := range extents { + if e.kind == "FLAT" && e.filename == dev && e.count == want && e.offset == 0 { + found = true + break + } + } + assert.True(t, found, "expected FLAT extent for device %s of %d sectors", dev, want) + } +} + +func TestDumpGPTVMDKDescriptorPaddingFile(t *testing.T) { + tmp := t.TempDir() + + a := makeLayerFile(t, tmp, "a.img", 4*1024*1024) + b := makeLayerFile(t, tmp, "b.img", 4*1024*1024) + + layout, err := ComputeLayout([]string{a, b}) + require.NoError(t, err) + + headerPath := filepath.Join(tmp, "h.bin") + tailPath := filepath.Join(tmp, "t.bin") + padPath := filepath.Join(tmp, "p.bin") + require.NoError(t, writeBlob(headerPath, layout.WriteHeader)) + require.NoError(t, writeBlob(tailPath, layout.WriteTail)) + require.NoError(t, writePadFile(padPath, gptAlignSectors*gptSectorSize)) + + var buf bytes.Buffer + require.NoError(t, DumpGPTVMDKDescriptor(&buf, 0xfffffffe, layout, headerPath, tailPath, padPath)) + + extents := parseDescriptor(t, buf.String()) + require.NotEmpty(t, extents) + + // No ZERO extents when a padFile is provided. + for _, e := range extents { + assert.NotEqual(t, "ZERO", e.kind, "no ZERO extents allowed when padFile is set") + } + + // At least one extent references the pad file. + var sawPad bool + for _, e := range extents { + if e.kind == "FLAT" && e.filename == padPath { + sawPad = true + assert.Equal(t, uint64(0), e.offset, "pad extent must read from offset 0") + } + } + assert.True(t, sawPad, "expected at least one FLAT extent referencing the pad file") + + // Sum of sectors still equals the total disk size. + var total uint64 + for _, e := range extents { + total += e.count + } + assert.Equal(t, layout.TotalSectors, total) +} + +func TestDumpGPTVMDKDescriptorToFile(t *testing.T) { + tmp := t.TempDir() + + devices := []string{ + makeLayerFile(t, tmp, "a.img", 4*1024*1024), + makeLayerFile(t, tmp, "b.img", 4*1024*1024), + } + + vmdkPath := filepath.Join(tmp, "merged_fs_gpt.vmdk") + require.NoError(t, DumpGPTVMDKDescriptorToFile(vmdkPath, 0xfffffffe, devices)) + + // Descriptor exists. + _, err := os.Stat(vmdkPath) + require.NoError(t, err) + + // Auxiliary files must also exist. + _, err = os.Stat(filepath.Join(tmp, "merged_fs_gpt_header.bin")) + require.NoError(t, err) + _, err = os.Stat(filepath.Join(tmp, "merged_fs_gpt_tail.bin")) + require.NoError(t, err) + + // Header file is exactly gptReservedSectors * 512 bytes. + hf, err := os.Stat(filepath.Join(tmp, "merged_fs_gpt_header.bin")) + require.NoError(t, err) + assert.Equal(t, int64(gptReservedSectors*gptSectorSize), hf.Size()) + + tf, err := os.Stat(filepath.Join(tmp, "merged_fs_gpt_tail.bin")) + require.NoError(t, err) + assert.Equal(t, int64(gptTailSectors*gptSectorSize), tf.Size()) + + // With ZERO extents enabled (the default), no pad file is written. + _, err = os.Stat(filepath.Join(tmp, "merged_fs_gpt_pad.bin")) + assert.True(t, os.IsNotExist(err), "pad file must not be created when ZERO extents are used") +} + +func TestDumpGPTVMDKDescriptorToFilePaddingFallback(t *testing.T) { + t.Cleanup(func() { + gptUseZeroExtents = true + }) + gptUseZeroExtents = false + + tmp := t.TempDir() + devices := []string{ + makeLayerFile(t, tmp, "a.img", 4*1024*1024), + makeLayerFile(t, tmp, "b.img", 4*1024*1024), + } + + vmdkPath := filepath.Join(tmp, "gpt.vmdk") + require.NoError(t, DumpGPTVMDKDescriptorToFile(vmdkPath, 0xfffffffe, devices)) + + pf, err := os.Stat(filepath.Join(tmp, "gpt_pad.bin")) + require.NoError(t, err) + assert.Equal(t, int64(gptAlignSectors*gptSectorSize), pf.Size(), + "pad file must be 1 MiB to cover the largest possible alignment region") +} + +func TestDumpGPTVMDKDescriptorExtentOrdering(t *testing.T) { + // Verify that the descriptor lays out extents strictly in LBA order: + // [header, padding?, layer1, padding?, layer2, ..., padding?, tail]. + tmp := t.TempDir() + + devices := []string{ + makeLayerFile(t, tmp, "a.img", 4096), // 1 sector aligned to 1 MiB → big trailing pad + makeLayerFile(t, tmp, "b.img", 8192), // 2 sectors aligned to 1 MiB → big trailing pad + makeLayerFile(t, tmp, "c.img", 12288), // 3 sectors aligned to 1 MiB → big trailing pad + } + + layout, err := ComputeLayout(devices) + require.NoError(t, err) + + hdrPath := filepath.Join(tmp, "h.bin") + tailPath := filepath.Join(tmp, "t.bin") + require.NoError(t, writeBlob(hdrPath, layout.WriteHeader)) + require.NoError(t, writeBlob(tailPath, layout.WriteTail)) + + var buf bytes.Buffer + require.NoError(t, DumpGPTVMDKDescriptor(&buf, 0xfffffffe, layout, hdrPath, tailPath, "")) + + extents := parseDescriptor(t, buf.String()) + + // Walk extents accumulating LBAs; assert every layer extent appears at + // the LBA the layout assigns to its partition. + var lba uint64 + deviceLBA := map[string]uint64{} + for _, e := range extents { + if e.kind == "FLAT" { + deviceLBA[e.filename] = lba + } + lba += e.count + } + assert.Equal(t, layout.TotalSectors, lba) + + for i, dev := range devices { + assert.Equal(t, layout.Partitions[i].FirstLBA, deviceLBA[dev], + "layer %d %s must start at partition LBA %d", i, dev, layout.Partitions[i].FirstLBA) + } + + // Header extent at LBA 0; tail extent at LBA total - tailSectors. + assert.Equal(t, uint64(0), deviceLBA[hdrPath]) + assert.Equal(t, layout.TotalSectors-uint64(gptTailSectors), deviceLBA[tailPath]) +} diff --git a/internal/shim/task/mount.go b/internal/shim/task/mount.go index 16bf42a3..ef1ba526 100644 --- a/internal/shim/task/mount.go +++ b/internal/shim/task/mount.go @@ -35,6 +35,17 @@ import ( "github.com/containerd/nerdbox/internal/shim/task/bundle" ) +// gptLayerThreshold is the number of plain (non-multi-device) erofs mounts +// above which the shim packs them into a single GPT-partitioned VMDK rather +// than allocating one virtio-block device per mount. Multi-device erofs +// mounts (those carrying device= options) are not affected: they continue +// to use the existing flat-concat VMDK path inline. +// +// Packing into a GPT VMDK reduces virtio-block consumption (which is capped +// at 25 letters per VM) and lets the shim handle deep stacks of independent +// erofs mounts without coordinating layer offsets in the snapshotter. +const gptLayerThreshold = 8 + // diskAllocator assigns sequential virtio disk letters (vda, vdb, …). // A single instance is shared across rootfs and volume disk allocation so // that all disks within a container get unique, collision-free letters. @@ -54,29 +65,39 @@ type diskOptions struct { vmdk bool } +// erofsCandidate describes a plain erofs mount that has been deferred from +// the first pass of transformMounts. After all mounts are classified, the +// number of candidates determines whether they are each turned into raw +// virtio-block disks or packed into a single GPT-partitioned VMDK. +type erofsCandidate struct { + outIdx int // index in `am` reserved for this mount + source string // host path of the erofs file + target string // VM target path + options []string // remaining options (no device= entries) +} + // transformMounts does not perform any local mounts but transforms -// the mounts to be used inside the VM via virtio +// the mounts to be used inside the VM via virtio. +// +// erofs mounts that carry device= options are processed inline using the +// existing flat-concat VMDK path. Plain erofs mounts (no device= options) +// are deferred and resolved after the first pass: when more than +// gptLayerThreshold are present, they are packed into a single GPT- +// partitioned VMDK with one partition per mount; otherwise each becomes +// its own raw virtio-block device. func transformMounts(ctx context.Context, id string, ms []*types.Mount, da *diskAllocator) ([]*types.Mount, []sandbox.Opt, error) { var ( - addDisks []diskOptions - am []*types.Mount - sbOpts []sandbox.Opt - err error + addDisks []diskOptions + am []*types.Mount + sbOpts []sandbox.Opt + erofsList []erofsCandidate ) log.G(ctx).Trace("transformMounts", ms) for _, m := range ms { switch m.Type { case "erofs": - letter := da.Next() - disk := fmt.Sprintf("disk-%d-%s", letter, id) - // virtiofs implementation has a limit of 36 characters for the tag - if len(disk) > 36 { - disk = disk[:36] - } - var Options []string - devices := []string{m.Source} for _, o := range m.Options { if d, f := strings.CutPrefix(o, "device="); f { @@ -87,15 +108,20 @@ func transformMounts(ctx context.Context, id string, ms []*types.Mount, da *disk } if len(devices) > 1 { - // generate VMDK desc for the EROFS flattened fs if it does not exist + // Multi-device erofs: existing flat-concat VMDK path, + // applied inline so it is independent of GPT packing. + letter := da.Next() + disk := fmt.Sprintf("disk-%d-%s", letter, id) + if len(disk) > 36 { + disk = disk[:36] + } mergedfsPath := filepath.Dir(m.Source) + "/merged_fs.vmdk" if _, err := os.Stat(mergedfsPath); err != nil { if !os.IsNotExist(err) { log.G(ctx).Warnf("failed to stat %v: %v", mergedfsPath, err) return nil, nil, errdefs.ErrNotImplemented } - err = erofs.DumpVMDKDescriptorToFile(mergedfsPath, 0xfffffffe, devices) - if err != nil { + if err := erofs.DumpVMDKDescriptorToFile(mergedfsPath, 0xfffffffe, devices); err != nil { log.G(ctx).Warnf("failed to generate %v: %v", mergedfsPath, err) return nil, nil, errdefs.ErrNotImplemented } @@ -106,19 +132,23 @@ func transformMounts(ctx context.Context, id string, ms []*types.Mount, da *disk readOnly: true, vmdk: true, }) - } else { - addDisks = append(addDisks, diskOptions{ - name: disk, - source: m.Source, - readOnly: true, - vmdk: false, + am = append(am, &types.Mount{ + Type: "erofs", + Source: fmt.Sprintf("/dev/vd%c", letter), + Target: m.Target, + Options: filterOptions(Options), }) + continue } - am = append(am, &types.Mount{ - Type: "erofs", - Source: fmt.Sprintf("/dev/vd%c", letter), - Target: m.Target, - Options: filterOptions(Options), + + // Plain erofs: defer and reserve a slot in `am` so the output + // preserves the input mount ordering. + am = append(am, nil) + erofsList = append(erofsList, erofsCandidate{ + outIdx: len(am) - 1, + source: m.Source, + target: m.Target, + options: Options, }) case "ext4": @@ -176,6 +206,10 @@ func transformMounts(ctx context.Context, id string, ms []*types.Mount, da *disk } } + if err := finalizeErofsCandidates(ctx, id, da, erofsList, am, &addDisks); err != nil { + return nil, nil, err + } + for _, do := range addDisks { var flags sandbox.DiskFlags if do.readOnly { @@ -188,7 +222,86 @@ func transformMounts(ctx context.Context, id string, ms []*types.Mount, da *disk sbOpts = append(sbOpts, sandbox.WithDisk(do.name, do.source, flags)) } - return am, sbOpts, err + return am, sbOpts, nil +} + +// finalizeErofsCandidates resolves the deferred plain erofs mounts. When +// the candidate count exceeds gptLayerThreshold, all candidates are packed +// into a single GPT-partitioned VMDK consuming one virtio-block letter. +// Below the threshold, each candidate becomes its own raw virtio-block +// device, matching the long-standing single-mount-per-disk behavior. +// +// The reserved slots in `am` (set during the first pass of transformMounts) +// are filled in with the resulting mounts; new disks are appended to +// addDisks. The order of da.Next() calls inside this function is contiguous +// with whatever the first pass already consumed, preserving the invariant +// that the Nth virtio-block letter corresponds to the Nth disk in addDisks. +func finalizeErofsCandidates(ctx context.Context, id string, da *diskAllocator, candidates []erofsCandidate, am []*types.Mount, addDisks *[]diskOptions) error { + if len(candidates) == 0 { + return nil + } + + if len(candidates) > gptLayerThreshold { + sources := make([]string, len(candidates)) + for i, c := range candidates { + sources[i] = c.source + } + // The merged GPT VMDK lives next to the first source; the auxiliary + // header/tail blobs are written in the same directory. + gptPath := filepath.Join(filepath.Dir(sources[0]), "merged_fs_gpt.vmdk") + if _, err := os.Stat(gptPath); err != nil { + if !os.IsNotExist(err) { + log.G(ctx).Warnf("failed to stat %v: %v", gptPath, err) + return errdefs.ErrNotImplemented + } + if err := erofs.DumpGPTVMDKDescriptorToFile(gptPath, 0xfffffffe, sources); err != nil { + log.G(ctx).Warnf("failed to generate %v: %v", gptPath, err) + return errdefs.ErrNotImplemented + } + } + letter := da.Next() + disk := fmt.Sprintf("disk-%d-%s", letter, id) + if len(disk) > 36 { + disk = disk[:36] + } + *addDisks = append(*addDisks, diskOptions{ + name: disk, + source: gptPath, + readOnly: true, + vmdk: true, + }) + for i, c := range candidates { + am[c.outIdx] = &types.Mount{ + Type: "erofs", + Source: fmt.Sprintf("/dev/vd%c%d", letter, i+1), + Target: c.target, + Options: filterOptions(c.options), + } + } + return nil + } + + // Below the threshold: one raw virtio-block device per candidate. + for _, c := range candidates { + letter := da.Next() + disk := fmt.Sprintf("disk-%d-%s", letter, id) + if len(disk) > 36 { + disk = disk[:36] + } + *addDisks = append(*addDisks, diskOptions{ + name: disk, + source: c.source, + readOnly: true, + vmdk: false, + }) + am[c.outIdx] = &types.Mount{ + Type: "erofs", + Source: fmt.Sprintf("/dev/vd%c", letter), + Target: c.target, + Options: filterOptions(c.options), + } + } + return nil } func filterOptions(options []string) []string { diff --git a/internal/shim/task/mount_test.go b/internal/shim/task/mount_test.go index a64828a9..cb2d1110 100644 --- a/internal/shim/task/mount_test.go +++ b/internal/shim/task/mount_test.go @@ -18,12 +18,15 @@ package task import ( "context" + "fmt" "os" "path/filepath" "testing" + "github.com/containerd/containerd/api/types" "github.com/opencontainers/runtime-spec/specs-go" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "github.com/containerd/containerd/v2/core/mount" "github.com/containerd/nerdbox/internal/shim/sandbox" @@ -305,3 +308,297 @@ func TestBindMountsProvider(t *testing.T) { }) } } + +// makeErofsLayer creates a sparse zero-filled file of the given size to act +// as a stand-in for an erofs layer image. transformMounts only stat()s these +// files, so their contents do not need to be a valid erofs filesystem. +func makeErofsLayer(t *testing.T, dir, name string, size int64) string { + t.Helper() + p := filepath.Join(dir, name) + f, err := os.Create(p) + require.NoError(t, err) + require.NoError(t, f.Truncate(size)) + require.NoError(t, f.Close()) + return p +} + +// TestTransformMountsErofs covers the erofs branches of transformMounts: +// +// - a single plain erofs mount becomes a raw virtio-block disk; +// - a single multi-device erofs mount (with device= options) goes through +// the existing flat-concat VMDK path inline; +// - 2..gptLayerThreshold plain erofs mounts each get a raw disk; +// - more than gptLayerThreshold plain erofs mounts are packed into one +// GPT-partitioned VMDK consuming a single virtio-block letter; +// - multi-device and plain erofs mounts coexist: the former are inline, +// the latter contribute to the threshold-driven GPT decision. +func TestTransformMountsErofs(t *testing.T) { + const id = "cid" + + t.Run("single plain erofs is a raw disk", func(t *testing.T) { + tmp := t.TempDir() + layer := makeErofsLayer(t, tmp, "layer0.img", 4*1024*1024) + + ms := []*types.Mount{{ + Type: "erofs", + Source: layer, + Target: "/rootfs", + Options: []string{"ro"}, + }} + + da := newDiskAllocator() + out, opts, err := transformMounts(context.Background(), id, ms, &da) + require.NoError(t, err) + + require.Len(t, out, 1) + assert.Equal(t, "erofs", out[0].Type) + assert.Equal(t, "/dev/vda", out[0].Source) + assert.Equal(t, "/rootfs", out[0].Target) + assert.Equal(t, []string{"ro"}, out[0].Options) + + o := applyOpts(opts) + require.Len(t, o.Disks, 1) + assert.Equal(t, layer, o.Disks[0].MountPath) + assert.Equal(t, sandbox.DiskFlagReadonly, o.Disks[0].Flags, + "single plain erofs uses raw image, no VMDK flag") + + // No VMDK is generated for the single plain path. + _, statErr := os.Stat(filepath.Join(tmp, "merged_fs.vmdk")) + assert.True(t, os.IsNotExist(statErr)) + _, statErr = os.Stat(filepath.Join(tmp, "merged_fs_gpt.vmdk")) + assert.True(t, os.IsNotExist(statErr)) + }) + + t.Run("multi-device erofs uses flat-concat VMDK inline", func(t *testing.T) { + tmp := t.TempDir() + + var deviceOpts []string + top := makeErofsLayer(t, tmp, "layer-top.img", 4*1024*1024) + for i := 0; i < 2; i++ { + lp := makeErofsLayer(t, tmp, fmt.Sprintf("layer-%d.img", i), 4*1024*1024) + deviceOpts = append(deviceOpts, "device="+lp) + } + + ms := []*types.Mount{{ + Type: "erofs", + Source: top, + Target: "/rootfs", + Options: append([]string{"ro"}, deviceOpts...), + }} + + da := newDiskAllocator() + out, opts, err := transformMounts(context.Background(), id, ms, &da) + require.NoError(t, err) + + require.Len(t, out, 1) + assert.Equal(t, "/dev/vda", out[0].Source, + "flat-concat VMDK exposes the layers as a single concatenated device") + assert.Equal(t, []string{"ro"}, out[0].Options, + "flat-concat path drops device= options because the VMDK already concatenates layers") + + o := applyOpts(opts) + require.Len(t, o.Disks, 1) + assert.Equal(t, filepath.Join(tmp, "merged_fs.vmdk"), o.Disks[0].MountPath) + assert.Equal(t, sandbox.DiskFlagReadonly|sandbox.DiskFlagVMDK, o.Disks[0].Flags) + + // The flat-concat VMDK was generated; the GPT VMDK was not. + _, statErr := os.Stat(filepath.Join(tmp, "merged_fs.vmdk")) + assert.NoError(t, statErr) + _, statErr = os.Stat(filepath.Join(tmp, "merged_fs_gpt.vmdk")) + assert.True(t, os.IsNotExist(statErr)) + }) + + t.Run("plain erofs at or below threshold use raw disks", func(t *testing.T) { + tmp := t.TempDir() + + // gptLayerThreshold separate plain erofs mounts (no device=). + // At exactly the threshold the GPT path is NOT taken (the trigger + // is strictly greater-than). + var ms []*types.Mount + for i := 0; i < gptLayerThreshold; i++ { + lp := makeErofsLayer(t, tmp, fmt.Sprintf("l%d.img", i), 4*1024*1024) + ms = append(ms, &types.Mount{ + Type: "erofs", + Source: lp, + Target: fmt.Sprintf("/m/%d", i), + Options: []string{"ro"}, + }) + } + + da := newDiskAllocator() + out, opts, err := transformMounts(context.Background(), id, ms, &da) + require.NoError(t, err) + + require.Len(t, out, gptLayerThreshold) + o := applyOpts(opts) + require.Len(t, o.Disks, gptLayerThreshold, + "each plain erofs at or below threshold gets its own virtio-block disk") + + // Disk letters are allocated in deferred order, but candidates are + // captured in input order, so output mounts get vda..vd. + for i := 0; i < gptLayerThreshold; i++ { + assert.Equal(t, fmt.Sprintf("/dev/vd%c", 'a'+byte(i)), out[i].Source) + assert.Equal(t, fmt.Sprintf("/m/%d", i), out[i].Target) + // Raw disk: VMDK flag must NOT be set. + assert.Equal(t, sandbox.DiskFlagReadonly, o.Disks[i].Flags) + } + + // No GPT VMDK at the threshold. + _, statErr := os.Stat(filepath.Join(tmp, "merged_fs_gpt.vmdk")) + assert.True(t, os.IsNotExist(statErr)) + }) + + t.Run("plain erofs above threshold packed into GPT VMDK", func(t *testing.T) { + tmp := t.TempDir() + + mountCount := gptLayerThreshold + 2 + var ms []*types.Mount + var sources []string + for i := 0; i < mountCount; i++ { + lp := makeErofsLayer(t, tmp, fmt.Sprintf("l%d.img", i), 4*1024*1024) + sources = append(sources, lp) + ms = append(ms, &types.Mount{ + Type: "erofs", + Source: lp, + Target: fmt.Sprintf("/m/%d", i), + Options: []string{"ro"}, + }) + } + + da := newDiskAllocator() + out, opts, err := transformMounts(context.Background(), id, ms, &da) + require.NoError(t, err) + + require.Len(t, out, mountCount) + // Each plain erofs becomes a partition on the single GPT VMDK + // (which itself is the first - and only - virtio-block disk: + // /dev/vda). + for i := 0; i < mountCount; i++ { + assert.Equal(t, "erofs", out[i].Type) + assert.Equal(t, fmt.Sprintf("/dev/vda%d", i+1), out[i].Source, + "plain erofs mount %d must point at GPT partition %d", i, i+1) + assert.Equal(t, fmt.Sprintf("/m/%d", i), out[i].Target) + assert.Equal(t, []string{"ro"}, out[i].Options, + "per-mount options preserved; each partition gets a separate mount call") + } + + o := applyOpts(opts) + require.Len(t, o.Disks, 1, "all plain erofs mounts share a single GPT VMDK disk") + // VMDK lives next to the first source. + expectedVmdk := filepath.Join(filepath.Dir(sources[0]), "merged_fs_gpt.vmdk") + assert.Equal(t, expectedVmdk, o.Disks[0].MountPath) + assert.Equal(t, sandbox.DiskFlagReadonly|sandbox.DiskFlagVMDK, o.Disks[0].Flags) + + // GPT VMDK and its auxiliary blobs were generated; the flat-concat + // VMDK was not. + _, statErr := os.Stat(expectedVmdk) + assert.NoError(t, statErr) + _, statErr = os.Stat(filepath.Join(tmp, "merged_fs_gpt_header.bin")) + assert.NoError(t, statErr) + _, statErr = os.Stat(filepath.Join(tmp, "merged_fs_gpt_tail.bin")) + assert.NoError(t, statErr) + _, statErr = os.Stat(filepath.Join(tmp, "merged_fs.vmdk")) + assert.True(t, os.IsNotExist(statErr)) + }) + + t.Run("multi-device alongside many plain erofs mounts", func(t *testing.T) { + tmp := t.TempDir() + + // One multi-device erofs (handled inline) plus enough plain erofs + // mounts to trigger the GPT path. + mdTop := makeErofsLayer(t, tmp, "md-top.img", 4*1024*1024) + mdLow := makeErofsLayer(t, tmp, "md-low.img", 4*1024*1024) + multiDevice := &types.Mount{ + Type: "erofs", + Source: mdTop, + Target: "/multi", + Options: []string{"ro", "device=" + mdLow}, + } + + mountCount := gptLayerThreshold + 1 + var plainMounts []*types.Mount + var plainSources []string + for i := 0; i < mountCount; i++ { + lp := makeErofsLayer(t, tmp, fmt.Sprintf("p%d.img", i), 4*1024*1024) + plainSources = append(plainSources, lp) + plainMounts = append(plainMounts, &types.Mount{ + Type: "erofs", + Source: lp, + Target: fmt.Sprintf("/p/%d", i), + Options: []string{"ro"}, + }) + } + + // Interleave: a couple of plain mounts, the multi-device mount, + // then the rest of the plain mounts. The multi-device mount's + // disk letter must come from inline allocation; plain mounts' + // letter comes from the deferred GPT pass. + ms := []*types.Mount{} + ms = append(ms, plainMounts[0:2]...) + ms = append(ms, multiDevice) + ms = append(ms, plainMounts[2:]...) + + da := newDiskAllocator() + out, opts, err := transformMounts(context.Background(), id, ms, &da) + require.NoError(t, err) + + require.Len(t, out, len(ms)) + o := applyOpts(opts) + require.Len(t, o.Disks, 2, "one multi-device VMDK plus one GPT VMDK = 2 disks") + + // addDisks order: multi-device (allocated inline in pass 1) gets + // letter 'a'; GPT VMDK (allocated in pass 2) gets letter 'b'. + assert.Equal(t, filepath.Join(tmp, "merged_fs.vmdk"), o.Disks[0].MountPath) + assert.Equal(t, sandbox.DiskFlagReadonly|sandbox.DiskFlagVMDK, o.Disks[0].Flags) + expectedGptVmdk := filepath.Join(filepath.Dir(plainSources[0]), "merged_fs_gpt.vmdk") + assert.Equal(t, expectedGptVmdk, o.Disks[1].MountPath) + assert.Equal(t, sandbox.DiskFlagReadonly|sandbox.DiskFlagVMDK, o.Disks[1].Flags) + + // Output mount order matches input order: + // ms[0] = plain mounts[0] -> /dev/vdb1 + // ms[1] = plain mounts[1] -> /dev/vdb2 + // ms[2] = multi-device -> /dev/vda + // ms[3] = plain mounts[2] -> /dev/vdb3 + // ... + assert.Equal(t, "/dev/vdb1", out[0].Source) + assert.Equal(t, "/dev/vdb2", out[1].Source) + assert.Equal(t, "/dev/vda", out[2].Source) + for i := 0; i < mountCount-2; i++ { + assert.Equal(t, fmt.Sprintf("/dev/vdb%d", i+3), out[3+i].Source, + "plain mount %d after multi-device must keep its sequential partition", i) + } + }) + + t.Run("GPT VMDK is cached on subsequent calls", func(t *testing.T) { + tmp := t.TempDir() + + mountCount := gptLayerThreshold + 1 + var ms []*types.Mount + for i := 0; i < mountCount; i++ { + lp := makeErofsLayer(t, tmp, fmt.Sprintf("l%d.img", i), 4*1024*1024) + ms = append(ms, &types.Mount{ + Type: "erofs", + Source: lp, + Target: fmt.Sprintf("/m/%d", i), + }) + } + + da := newDiskAllocator() + _, _, err := transformMounts(context.Background(), id, ms, &da) + require.NoError(t, err) + + // VMDK must not be regenerated on a second call with the same inputs. + gptPath := filepath.Join(tmp, "merged_fs_gpt.vmdk") + fi1, err := os.Stat(gptPath) + require.NoError(t, err) + + da2 := newDiskAllocator() + _, _, err = transformMounts(context.Background(), id, ms, &da2) + require.NoError(t, err) + + fi2, err := os.Stat(gptPath) + require.NoError(t, err) + assert.Equal(t, fi1.ModTime(), fi2.ModTime(), + "VMDK descriptor must not be regenerated when it already exists") + }) +}