Skip to main content

bootc_initramfs_setup/
lib.rs

1//! Mount helpers for bootc-initramfs
2
3use std::{
4    ffi::OsString,
5    fmt::Debug,
6    io::ErrorKind,
7    os::fd::{AsFd, AsRawFd, OwnedFd},
8    path::{Path, PathBuf},
9};
10
11use anyhow::{Context, Result};
12use clap::Parser;
13use rustix::{
14    fs::{CWD, Mode, OFlags, major, minor, mkdirat, openat, stat, symlink},
15    io::Errno,
16    mount::{
17        FsMountFlags, MountAttrFlags, OpenTreeFlags, UnmountFlags, fsconfig_create,
18        fsconfig_set_string, fsmount, open_tree, unmount,
19    },
20    path,
21};
22
23use serde::Deserialize;
24
25use composefs::{
26    fsverity::{FsVerityHashValue, Sha512HashValue},
27    mount::FsHandle,
28    mountcompat::{overlayfs_set_fd, overlayfs_set_lower_and_data_fds, prepare_mount},
29    repository::Repository,
30};
31use composefs_boot::cmdline::get_cmdline_composefs;
32use composefs_ctl::composefs;
33use composefs_ctl::composefs_boot;
34
35use fn_error_context::context;
36
37use bootc_kernel_cmdline::utf8::Cmdline;
38
/// Attribute bit passed in `MountAttr::attr_set` to the `mount_setattr`
/// syscall in order to make a mount read-only.
const MOUNT_ATTR_RDONLY: u64 = 0x0000_0001;
41
/// Argument block handed (by pointer, together with its size) to the
/// `mount_setattr` syscall.
///
/// `#[repr(C)]` fixes the layout to four consecutive `u64` fields in
/// declaration order.
#[repr(C)]
struct MountAttr {
    /// Attribute bits to set on the mount.
    attr_set: u64,
    /// Attribute bits to clear on the mount.
    attr_clr: u64,
    /// Propagation setting; this file always passes 0.
    propagation: u64,
    /// User-namespace fd slot; this file always passes 0.
    userns_fd: u64,
}
49
50/// Set mount attributes using mount_setattr syscall
51#[context("Setting mount attributes")]
52#[allow(unsafe_code)]
53fn mount_setattr(fd: impl AsFd, flags: libc::c_int, attr: &MountAttr) -> Result<()> {
54    let ret = unsafe {
55        libc::syscall(
56            libc::SYS_mount_setattr,
57            fd.as_fd().as_raw_fd(),
58            c"".as_ptr(),
59            flags,
60            attr as *const MountAttr,
61            std::mem::size_of::<MountAttr>(),
62        )
63    };
64    if ret == -1 {
65        Err(std::io::Error::last_os_error())?;
66    }
67    Ok(())
68}
69
70/// Set mount to readonly
71#[context("Setting mount readonly")]
72fn set_mount_readonly(fd: impl AsFd) -> Result<()> {
73    let attr = MountAttr {
74        attr_set: MOUNT_ATTR_RDONLY,
75        attr_clr: 0,
76        propagation: 0,
77        userns_fd: 0,
78    };
79    mount_setattr(fd, libc::AT_EMPTY_PATH, &attr)
80}
81
82/// Types of mounts supported by the configuration
83#[derive(Clone, Copy, Debug, Deserialize, PartialEq)]
84#[serde(rename_all = "lowercase")]
85pub enum MountType {
86    /// No mount; "root" is an alias meaning this dir is part of the root mount
87    #[serde(alias = "root")]
88    None,
89    /// Bind mount
90    Bind,
91    /// Overlay mount
92    Overlay,
93    /// Transient mount; "volatile" is an alias (Unix convention for tmpfs)
94    #[serde(alias = "volatile")]
95    Transient,
96}
97
98#[derive(Debug, Default, Deserialize, PartialEq)]
99struct RootConfig {
100    #[serde(default)]
101    transient: bool,
102}
103
104/// Configuration for mount operations
105#[derive(Debug, Default, Deserialize, PartialEq)]
106pub struct MountConfig {
107    /// The type of mount to use
108    pub mount: Option<MountType>,
109    #[serde(default)]
110    /// Whether this mount should be transient (temporary)
111    pub transient: bool,
112}
113
114#[derive(Debug, Deserialize, Default, PartialEq)]
115struct Config {
116    #[serde(default)]
117    etc: MountConfig,
118    #[serde(default)]
119    var: MountConfig,
120    #[serde(default)]
121    root: RootConfig,
122}
123
/// Default location of the setup-root configuration file, as seen from the
/// booted root. The same path is the default of the `--config` argument.
pub const SETUP_ROOT_CONF_PATH: &str = "/usr/lib/composefs/setup-root-conf.toml";
126
127/// Returns `true` if the configuration at `path` requests a transient `/etc`
128/// overlay.  Used by the systemd generator to decide whether to emit the
129/// SELinux relabel unit *before* those mounts exist (the generator runs before
130/// `local-fs.target`).
131///
132/// Returns `false` if the file is absent or unreadable (safe default: no unit
133/// emitted for non-transient systems).
134pub fn config_has_transient_submounts(path: &std::path::Path) -> bool {
135    let text = match std::fs::read_to_string(path) {
136        Ok(t) => t,
137        Err(e) => {
138            tracing::debug!("Could not read {}: {e:#}", path.display());
139            return false;
140        }
141    };
142    let config: Config = match toml::from_str(&text) {
143        Ok(c) => c,
144        Err(e) => {
145            tracing::debug!("Could not parse {}: {e:#}", path.display());
146            return false;
147        }
148    };
149    // Only /etc overlay triggers the relabel unit.
150    let is_transient = |mc: &MountConfig| match mc.mount {
151        Some(mt) => mt == MountType::Transient,
152        None => mc.transient,
153    };
154    is_transient(&config.etc)
155}
156
157/// Command-line arguments
158#[derive(Parser, Debug)]
159pub struct Args {
160    #[arg(help = "Execute this command (for testing)")]
161    /// Execute this command (for testing)
162    pub cmd: Vec<OsString>,
163
164    #[arg(
165        long,
166        default_value = "/sysroot",
167        help = "sysroot directory in initramfs"
168    )]
169    /// sysroot directory in initramfs
170    pub sysroot: PathBuf,
171
172    #[arg(
173        long,
174        default_value = "/usr/lib/composefs/setup-root-conf.toml",
175        help = "Config path (for testing)"
176    )]
177    /// Config path (for testing)
178    pub config: PathBuf,
179
180    // we want to test in a userns, but can't mount erofs there
181    #[arg(long, help = "Bind mount root-fs from (for testing)")]
182    /// Bind mount root-fs from (for testing)
183    pub root_fs: Option<PathBuf>,
184
185    #[arg(long, help = "Kernel commandline args (for testing)")]
186    /// Kernel commandline args (for testing)
187    pub cmdline: Option<Cmdline<'static>>,
188
189    #[arg(long, help = "Mountpoint (don't replace sysroot, for testing)")]
190    /// Mountpoint (don't replace sysroot, for testing)
191    pub target: Option<PathBuf>,
192}
193
194/// Wrapper around [`composefs::mount::mount_at`]
195pub fn mount_at_wrapper(
196    fs_fd: impl AsFd,
197    dirfd: impl AsFd,
198    path: impl path::Arg + Debug + Clone,
199) -> Result<()> {
200    composefs::mount::mount_at(fs_fd, dirfd, path.clone())
201        .with_context(|| format!("Mounting at path {path:?}"))
202}
203
204/// Wrapper around [`rustix::fs::openat`]
205#[context("Opening dir {name:?}")]
206pub fn open_dir(dirfd: impl AsFd, name: impl AsRef<Path> + Debug) -> Result<OwnedFd> {
207    let res = openat(
208        dirfd,
209        name.as_ref(),
210        OFlags::PATH | OFlags::DIRECTORY | OFlags::CLOEXEC,
211        Mode::empty(),
212    );
213
214    Ok(res?)
215}
216
217#[context("Ensure dir")]
218fn ensure_dir(dirfd: impl AsFd, name: &str, mode: Option<rustix::fs::Mode>) -> Result<OwnedFd> {
219    match mkdirat(dirfd.as_fd(), name, mode.unwrap_or(0o700.into())) {
220        Ok(()) | Err(Errno::EXIST) => {}
221        Err(err) => Err(err).with_context(|| format!("Creating dir {name}"))?,
222    }
223
224    open_dir(dirfd, name)
225}
226
227#[context("Bind mounting to path {path}")]
228fn bind_mount(fd: impl AsFd, path: &str) -> Result<OwnedFd> {
229    let res = open_tree(
230        fd.as_fd(),
231        path,
232        OpenTreeFlags::OPEN_TREE_CLONE
233            | OpenTreeFlags::OPEN_TREE_CLOEXEC
234            | OpenTreeFlags::AT_EMPTY_PATH,
235    );
236
237    Ok(res?)
238}
239
240/// Mount a tmpfs to use as the upper layer for an overlay.
241///
242/// TODO: sync these options with systemd's root mounting, there's some tweaks there for default tmpfs
243/// and we may want to make this configurable anyways i nthe future
244///
245/// See <https://github.com/containers/bootc/issues/1992>.
246#[context("Mounting tmpfs for overlay")]
247fn mount_tmpfs_for_overlay() -> Result<OwnedFd> {
248    let tmpfs = FsHandle::open("tmpfs")?;
249    fsconfig_create(tmpfs.as_fd())?;
250    Ok(fsmount(
251        tmpfs.as_fd(),
252        FsMountFlags::FSMOUNT_CLOEXEC,
253        MountAttrFlags::empty(),
254    )?)
255}
256
257/// Build an overlayfs fsmount fd from an existing state dir (upper+work).
258///
259/// upper is 0755: the merged view inherits permissions from upperdir, so 0700
260/// would make the mountpoint inaccessible to non-root processes.  work is
261/// kernel-internal and never visible; 0700 is fine.
262/// See: <https://github.com/composefs/composefs-rs/issues/287>
263fn build_overlay_fd(
264    base: impl AsFd,
265    state: impl AsFd,
266    source: &str,
267    mount_attr_flags: Option<MountAttrFlags>,
268) -> Result<OwnedFd> {
269    let upper = ensure_dir(state.as_fd(), "upper", Some(0o755.into()))?;
270    let work = ensure_dir(state.as_fd(), "work", Some(0o700.into()))?;
271
272    let overlayfs = FsHandle::open("overlay")?;
273    fsconfig_set_string(overlayfs.as_fd(), "source", source)?;
274    overlayfs_set_fd(overlayfs.as_fd(), "workdir", work.as_fd())?;
275    overlayfs_set_fd(overlayfs.as_fd(), "upperdir", upper.as_fd())?;
276    overlayfs_set_lower_and_data_fds(&overlayfs, base.as_fd(), None::<OwnedFd>)?;
277    fsconfig_create(overlayfs.as_fd())?;
278    Ok(fsmount(
279        overlayfs.as_fd(),
280        FsMountFlags::FSMOUNT_CLOEXEC,
281        mount_attr_flags.unwrap_or(MountAttrFlags::empty()),
282    )?)
283}
284
285/// Mount a persistent state directory as an overlay on top of `base`,
286/// attaching the result immediately at `.` relative to `base`.
287#[context("Mounting state as overlay")]
288fn overlay_state(
289    base: impl AsFd,
290    state: impl AsFd,
291    source: &str,
292    mount_attr_flags: Option<MountAttrFlags>,
293) -> Result<()> {
294    let fs = build_overlay_fd(&base, state, source, mount_attr_flags)?;
295    mount_at_wrapper(fs, base, ".").context("Moving mount")
296}
297
298/// Creates a transient overlayfs with the passed-in fd as the lowerdir.
299///
300/// Returns a detached (not yet attached) `OwnedFd` for the overlay mount.
301/// The caller is responsible for attaching it to the filesystem tree.
302///
303/// `source` is used verbatim as the overlay's `source` fsconfig option and
304/// appears in `/proc/self/mountinfo`.  For the composefs root, pass
305/// `"transient:composefs=<digest_hex>"` so that `composefs_booted()` can
306/// recover the verity digest from the mount source after switch-root.  For
307/// non-root transient mounts (e.g. `/usr`, `/var`) pass `"transient"`.
308///
309/// The SELinux label on `/` is fixed after boot by
310/// `bootc-early-overlay-relabel.service`; no initramfs-side xattr write is
311/// needed (kernel `fs_use_trans tmpfs` relabeling at policy-load time would
312/// overwrite anything written here).
313#[context("Creating transient overlayfs")]
314pub fn overlay_transient(
315    base: impl AsFd,
316    source: &str,
317    mount_attr_flags: Option<MountAttrFlags>,
318) -> Result<OwnedFd> {
319    let tmpfs = mount_tmpfs_for_overlay()?;
320    let state = prepare_mount(tmpfs)?;
321    build_overlay_fd(base, state, source, mount_attr_flags)
322}
323
324#[context("Opening rootfs")]
325fn open_root_fs(path: &Path) -> Result<OwnedFd> {
326    let rootfs = open_tree(
327        CWD,
328        path,
329        OpenTreeFlags::OPEN_TREE_CLONE | OpenTreeFlags::OPEN_TREE_CLOEXEC,
330    )?;
331
332    set_mount_readonly(&rootfs)?;
333
334    Ok(rootfs)
335}
336
337/// Prepares a floating mount for composefs and returns the fd
338///
339/// # Arguments
340/// * sysroot                - fd for /sysroot
341/// * name                   - Name of the EROFS image to be mounted
342/// * allow_missing_fsverity - Whether to allow mount without fsverity support
343#[context("Mounting composefs image")]
344pub fn mount_composefs_image(
345    sysroot: &OwnedFd,
346    name: &str,
347    allow_missing_fsverity: bool,
348) -> Result<OwnedFd> {
349    // Use open_upgrade to handle upgrades from older composefs-rs versions
350    // that lack meta.json: it infers the algorithm and verity mode from
351    // existing objects, writes meta.json, and opens normally.
352    let (mut repo, _upgraded) = Repository::<Sha512HashValue>::open_upgrade(sysroot, "composefs")?;
353    if allow_missing_fsverity {
354        repo.set_insecure();
355    }
356    let rootfs = repo
357        .mount(name)
358        .context("Failed to mount composefs image")?;
359
360    set_mount_readonly(&rootfs)?;
361
362    Ok(rootfs)
363}
364
365/// Mounts a subdirectory with the specified configuration
366#[context("Mounting subdirectory")]
367pub fn mount_subdir(
368    new_root: impl AsFd,
369    state: impl AsFd,
370    subdir: &str,
371    config: MountConfig,
372    default: MountType,
373) -> Result<()> {
374    let mount_type = match config.mount {
375        Some(mt) => mt,
376        None => match config.transient {
377            true => MountType::Transient,
378            false => default,
379        },
380    };
381
382    match mount_type {
383        MountType::None => Ok(()),
384        MountType::Bind => Ok(mount_at_wrapper(
385            bind_mount(&state, subdir)?,
386            &new_root,
387            subdir,
388        )?),
389        MountType::Overlay => overlay_state(
390            open_dir(&new_root, subdir)?,
391            open_dir(&state, subdir)?,
392            "overlay",
393            None,
394        ),
395        MountType::Transient => {
396            // For subdirectory transient mounts, create the overlay and immediately
397            // attach it at the subdirectory path in new_root.
398            let subdir_fd = open_dir(&new_root, subdir)?;
399            let overlay_fd = overlay_transient(subdir_fd.as_fd(), "transient", None)?;
400            mount_at_wrapper(overlay_fd, &new_root, subdir)
401        }
402    }
403}
404
405#[context("GPT workaround")]
406/// Workaround for /dev/gpt-auto-root
407pub fn gpt_workaround() -> Result<()> {
408    // https://github.com/systemd/systemd/issues/35017
409    let rootdev = stat("/dev/gpt-auto-root");
410
411    let rootdev = match rootdev {
412        Ok(r) => r,
413        Err(e) if e.kind() == ErrorKind::NotFound => return Ok(()),
414        Err(e) => Err(e)?,
415    };
416
417    let target = format!(
418        "/dev/block/{}:{}",
419        major(rootdev.st_rdev),
420        minor(rootdev.st_rdev)
421    );
422    symlink(target, "/run/systemd/volatile-root")?;
423    Ok(())
424}
425
426/// Sets up /sysroot for switch-root
427#[context("Setting up /sysroot")]
428pub fn setup_root(args: Args) -> Result<()> {
429    let config = match std::fs::read_to_string(args.config) {
430        Ok(text) => toml::from_str(&text)?,
431        Err(err) if err.kind() == ErrorKind::NotFound => Config::default(),
432        Err(err) => Err(err)?,
433    };
434
435    let sysroot = open_dir(CWD, &args.sysroot)
436        .with_context(|| format!("Failed to open sysroot {:?}", args.sysroot))?;
437
438    let cmdline = args
439        .cmdline
440        .unwrap_or(Cmdline::from_proc().context("Failed to read cmdline")?);
441
442    // Auto-detect systemd.volatile=state: if the kernel cmdline requests a
443    // volatile /var via the systemd fstab-generator, skip our initramfs
444    // bind-mount of /var from the deployment state directory.  This leaves
445    // /var as an empty directory from the composefs image so that
446    // systemd-fstab-generator can mount a fresh tmpfs there at local-fs.target.
447    // An explicit `[var] mount = "none"` in setup-root-conf.toml has the same
448    // effect; the cmdline check is a convenience so users only need the kargs.d
449    // entry without also editing setup-root-conf.toml.
450    let config = {
451        let mut config = config;
452        // value_of returns None for a missing key, Some("") for a bare flag,
453        // or Some("state") / Some("overlay") / Some("yes") for key=value form.
454        let volatile_val = cmdline.value_of("systemd.volatile");
455        let var_volatile = matches!(volatile_val, Some("state") | Some("overlay"));
456        if var_volatile && config.var.mount.is_none() && !config.var.transient {
457            tracing::debug!(
458                "systemd.volatile={} detected; skipping /var state bind-mount",
459                volatile_val.unwrap_or("")
460            );
461            config.var.mount = Some(MountType::None);
462        }
463        config
464    };
465
466    let (image, insecure) = get_cmdline_composefs::<Sha512HashValue>(&cmdline)?;
467
468    let new_root = match &args.root_fs {
469        Some(path) => open_root_fs(path).context("Failed to clone specified root fs")?,
470        None => mount_composefs_image(&sysroot, &image.to_hex(), insecure)?,
471    };
472
473    // we need to clone this before the next step to make sure we get the old one
474    let sysroot_clone = bind_mount(&sysroot, "")?;
475
476    set_mount_readonly(&sysroot_clone)?;
477
478    let mount_target = args.target.unwrap_or(args.sysroot.clone());
479
480    // Ideally we build the new root filesystem together before we mount it, but that only works on
481    // 6.15 and later.  Before 6.15 we can't mount into a floating tree, so mount it first.  This
482    // will leave an abandoned clone of the sysroot mounted under it, but that's OK for now.
483    if cfg!(feature = "pre-6.15") {
484        mount_at_wrapper(&new_root, CWD, &mount_target)?;
485    }
486
487    // When transient root is enabled, place an overlay on top of the composefs.
488    // On pre-6.15, since the composefs is already attached at `mount_target`,
489    // the overlay is also immediately attached there.  We then open the overlay
490    // via its path so that subsequent mounts target the visible merged tree.
491    //
492    // On 6.15+, the whole tree is assembled in floating mode; `overlay_transient`
493    // returns a detached overlay fd that we can directly mount into.
494    //
495    // `new_root` always refers to the composefs fd; mounting via it after the
496    // overlay is in place would land in the hidden lower layer.
497    let transient_overlay_fd: Option<OwnedFd> = if config.root.transient {
498        let overlay_fd = overlay_transient(
499            &new_root,
500            &format!("transient:composefs={}", image.to_hex()),
501            None,
502        )?;
503
504        if cfg!(feature = "pre-6.15") {
505            // In pre-6.15, the composefs is already attached at `mount_target`.
506            // Attach the overlay on top of it, then reopen the path to get a
507            // dirfd that resolves through the overlay (not the hidden composefs).
508            mount_at_wrapper(&overlay_fd, CWD, &mount_target)
509                .context("Moving transient overlay onto sysroot")?;
510            Some(open_dir(CWD, &mount_target).context("Opening attached overlay root")?)
511        } else {
512            // On 6.15+ we assemble in floating mode; use the detached overlay fd
513            // directly for subsequent mounts into the tree.
514            Some(overlay_fd)
515        }
516    } else {
517        None
518    };
519
520    // When transient root is active the overlay sits on top of the composefs.
521    // Mounts placed via `new_root` would land in the composefs lower layer and
522    // be invisible from the running system.  Use the overlay fd for all
523    // post-overlay mounts (sysroot, etc, var) so they appear in the merged view.
524    let visible_root: &dyn AsFd = transient_overlay_fd
525        .as_ref()
526        .map_or(&new_root as &dyn AsFd, |fd| fd as &dyn AsFd);
527
528    // Mount the physical sysroot (with the composefs repo) into the new root
529    // so that `bootc status` and other tools can find it after switch-root.
530    match composefs::mount::mount_at(&sysroot_clone, visible_root, "sysroot") {
531        Ok(()) | Err(Errno::NOENT) => {}
532        Err(err) => Err(err)?,
533    }
534
535    // etc + var
536    let state = open_dir(open_dir(&sysroot, "state/deploy")?, image.to_hex())?;
537    mount_subdir(visible_root, &state, "etc", config.etc, MountType::Bind)?;
538    // /var is bind-mounted from the deployment state directory by default.
539    // The systemd.volatile=state cmdline detection above (or an explicit
540    // [var] mount = "none" in setup-root-conf.toml) can change this to
541    // MountType::None, which skips the bind-mount entirely and leaves /var
542    // as an empty directory from the composefs image for systemd to fill.
543    mount_subdir(visible_root, &state, "var", config.var, MountType::Bind)?;
544
545    if cfg!(not(feature = "pre-6.15")) {
546        // Replace the /sysroot with the new composed root filesystem.
547        // When a transient overlay is active, mount it rather than the bare
548        // composefs so the running system sees the writable merged view.
549        unmount(&args.sysroot, UnmountFlags::DETACH)?;
550        mount_at_wrapper(visible_root, CWD, &mount_target)?;
551    }
552
553    Ok(())
554}
555
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses a TOML snippet into a [`Config`], panicking on malformed input.
    fn parse(text: &str) -> Config {
        toml::from_str(text).expect("TOML parse failed")
    }

    /// An empty document yields the all-defaults configuration.
    #[test]
    fn test_config_defaults() {
        let expected = Config {
            etc: MountConfig {
                mount: None,
                transient: false,
            },
            var: MountConfig {
                mount: None,
                transient: false,
            },
            root: RootConfig { transient: false },
        };
        assert_eq!(parse(""), expected);
    }

    /// `"none"` maps to `MountType::None`.
    #[test]
    fn test_mounttype_none() {
        let etc = parse("[etc]\nmount = \"none\"").etc;
        assert_eq!(etc.mount, Some(MountType::None));
    }

    /// `"root"` is accepted as an alias for `MountType::None`.
    #[test]
    fn test_mounttype_root_alias() {
        let etc = parse("[etc]\nmount = \"root\"").etc;
        assert_eq!(etc.mount, Some(MountType::None));
    }

    /// The `transient` flag is parsed independently of `mount`.
    #[test]
    fn test_etc_transient_flag() {
        let etc = parse("[etc]\ntransient = true").etc;
        assert!(etc.transient);
        assert_eq!(etc.mount, None);
    }

    #[test]
    fn test_var_none() {
        // mount = "none" skips the state bind-mount; combine with
        // systemd.volatile=state karg to get a fresh tmpfs on every boot.
        let var = parse("[var]\nmount = \"none\"").var;
        assert_eq!(var.mount, Some(MountType::None));
    }

    /// `[root] transient = true` enables the transient root flag.
    #[test]
    fn test_root_transient() {
        let root = parse("[root]\ntransient = true").root;
        assert!(root.transient);
    }

    /// Several sections can be combined in one document.
    #[test]
    fn test_combined_config() {
        let config = parse("[root]\ntransient = true\n[etc]\nmount = \"root\"");
        assert!(config.root.transient);
        assert_eq!(config.etc.mount, Some(MountType::None));
    }
}