Skip to main content

bootc_lib/
podstorage.rs

1//! # bootc-managed instance of containers-storage:
2//!
//! The backend for podman and other tools is known as `containers-storage:`,
4//! with a canonical instance that lives in `/var/lib/containers`.
5//!
6//! This is a `containers-storage:` instance which is owned by bootc.
7//! On ostree systems it lives at `/sysroot/ostree/bootc/storage`;
8//! on composefs systems the physical location is
9//! `/sysroot/composefs/bootc/storage` with a compatibility symlink
10//! at `ostree/bootc -> ../composefs/bootc`.
11//!
12//! At the current time, this is only used for Logically Bound Images.
13
14use std::collections::HashSet;
15use std::io::{Seek, Write};
16use std::os::unix::process::CommandExt;
17use std::process::{Command, Stdio};
18use std::sync::Arc;
19
20use anyhow::{Context, Result};
21use bootc_utils::{AsyncCommandRunExt, CommandRunExt, ExitStatusExt};
22use camino::{Utf8Path, Utf8PathBuf};
23use cap_std_ext::cap_std::fs::Dir;
24use cap_std_ext::cap_tempfile::TempDir;
25use cap_std_ext::cmdext::{CapStdExtCommandExt, CmdFds};
26use cap_std_ext::dirext::CapStdExtDirExt;
27use cap_std_ext::{cap_std, cap_tempfile};
28use fn_error_context::context;
29use ostree_ext::ostree::{self};
30use std::os::fd::OwnedFd;
31use tokio::process::Command as AsyncCommand;
32
// Maximum number of image IDs passed to a single `podman image rm`
// invocation when pruning.
// Pass only 100 args at a time just to avoid potentially overflowing argument
// vectors; not that this should happen in reality, but just in case.
const SUBCMD_ARGV_CHUNKING: usize = 100;
36
/// Global directory path which we use for podman to point
/// it at our storage. Unfortunately we can't yet use the
/// /proc/self/fd/N trick because it currently breaks due
/// to how the untar process is forked in the child.
///
/// The storage root is bind-mounted onto this path from inside the
/// child's private mount namespace (see `bind_storage_roots`).
pub(crate) const STORAGE_ALIAS_DIR: &str = "/run/bootc/storage";
/// The file descriptor number at which the runroot directory is made
/// available to the child process; the child refers to it through
/// `/proc/self/fd/<N>`.
pub(crate) const STORAGE_RUN_FD: i32 = 3;

/// Name of the stamp file created in the storage root once labeling
/// has been performed (see `CStorage::ensure_labeled`).
const LABELED: &str = ".bootc_labeled";

/// The system path to the canonical containers-storage instance,
/// used as the SELinux label reference path.
const SYS_CSTOR_PATH: &str = "/var/lib/containers/storage";

/// The path to the image storage, relative to the bootc root directory.
pub(crate) const SUBPATH: &str = "storage";
/// The path to the "runroot" with transient runtime state; this is
/// relative to the /run directory
const RUNROOT: &str = "bootc/storage";
56
/// A bootc-owned instance of `containers-storage:`.
///
/// Instances are obtained through `CStorage::create` or `CStorage::open`.
/// Operations are carried out by spawning `podman` with its storage
/// root and runroot redirected to the directories held here.
pub(crate) struct CStorage {
    /// The root directory; the storage location is resolved relative to
    /// it, and it is also handed to the auth setup for spawned commands.
    sysroot: Dir,
    /// The location of container storage
    storage_root: Dir,
    /// Our runtime state (the `RUNROOT` directory, used as podman's runroot)
    run: Dir,
    /// The SELinux policy used for labeling the storage.
    /// When `None`, `ensure_labeled` does nothing.
    sepolicy: Option<ostree::SePolicy>,
    /// Disallow using this across multiple threads concurrently; while we
    /// have internal locking in podman, in the future we may change how
    /// things work here. And we don't have a use case right now for
    /// concurrent operations.
    ///
    /// `Cell` is not `Sync`, so this zero-sized marker makes the whole
    /// struct not `Sync` either.
    _unsync: std::cell::Cell<()>,
}
73
/// Policy deciding whether `CStorage::pull` contacts the registry when
/// the image may already be in the storage.
#[derive(Debug, PartialEq, Eq)]
pub(crate) enum PullMode {
    /// Pull only if the image is not present
    IfNotExists,
    /// Always check for an update
    // Nothing in this file constructs this variant, hence the lint
    // suppression.
    #[allow(dead_code)]
    Always,
}
82
83#[allow(unsafe_code)]
84#[context("Binding storage roots")]
85pub(crate) fn bind_storage_roots(
86    cmd: &mut Command,
87    fds: &mut CmdFds,
88    storage_root: &Dir,
89    run_root: &Dir,
90) -> Result<()> {
91    // podman requires an absolute path, for two reasons right now:
92    // - It writes the file paths into `db.sql`, a sqlite database for unknown reasons
93    // - It forks helper binaries, so just giving it /proc/self/fd won't work as
94    //   those helpers may not get the fd passed. (which is also true of skopeo)
95    // We create a new mount namespace, which also has the helpful side effect
96    // of automatically cleaning up the global bind mount that the storage stack
97    // creates.
98
99    let storage_root = Arc::new(storage_root.try_clone().context("Cloning storage root")?);
100    let run_root: Arc<OwnedFd> = Arc::new(run_root.try_clone().context("Cloning runroot")?.into());
101    // SAFETY: All the APIs we call here are safe to invoke between fork and exec.
102    unsafe {
103        cmd.pre_exec(move || {
104            use rustix::fs::{Mode, OFlags};
105            // For reasons I don't understand, we can't just `mount("/proc/self/fd/N", "/path/to/target")`
106            // but it *does* work to fchdir(fd) + mount(".", "/path/to/target").
107            // I think it may be that mount doesn't like operating on the magic links?
108            // This trick only works if we set our working directory to the target *before*
109            // creating the new namespace too.
110            //
111            // I think we may be hitting this:
112            //
113            // "       EINVAL A bind operation (MS_BIND) was requested where source referred a mount namespace magic link (i.e., a /proc/pid/ns/mnt magic link or a bind mount to such a link) and the propagation type of the parent mount of target was
114            // MS_SHARED, but propagation of the requested bind mount could lead to a circular dependency that might prevent the mount namespace from ever being freed."
115            //
116            // But...how did we avoid that circular dependency by using the process cwd?
117            //
118            // I tried making the mounts recursively private, but that didn't help.
119            let oldwd = rustix::fs::open(
120                ".",
121                OFlags::DIRECTORY | OFlags::CLOEXEC | OFlags::RDONLY,
122                Mode::empty(),
123            )?;
124            rustix::process::fchdir(&storage_root)?;
125            rustix::thread::unshare_unsafe(rustix::thread::UnshareFlags::NEWNS)?;
126            rustix::mount::mount_bind(".", STORAGE_ALIAS_DIR)?;
127            rustix::process::fchdir(&oldwd)?;
128            Ok(())
129        })
130    };
131    fds.take_fd_n(run_root, STORAGE_RUN_FD);
132    Ok(())
133}
134
135/// Set up `REGISTRY_AUTH_FILE` on a command, passing the bootc/ostree
136/// auth file via an anonymous tmpfile fd.
137///
138/// If no bootc-owned auth is configured, an empty `{}` is passed to
139/// prevent podman from falling back to user-owned auth paths.
140pub(crate) fn setup_auth(cmd: &mut Command, fds: &mut CmdFds, sysroot: &Dir) -> Result<()> {
141    let tmpd = &cap_std::fs::Dir::open_ambient_dir("/tmp", cap_std::ambient_authority())?;
142    let mut tempfile = cap_tempfile::TempFile::new_anonymous(tmpd).map(std::io::BufWriter::new)?;
143
144    // Keep this in sync with https://github.com/bootc-dev/containers-image-proxy-rs/blob/b5e0861ad5065f47eaf9cda0d48da3529cc1bc43/src/imageproxy.rs#L310
145    // We always override the auth to match the bootc setup.
146    let authfile_fd = ostree_ext::globals::get_global_authfile(sysroot)?.map(|v| v.1);
147    if let Some(mut fd) = authfile_fd {
148        std::io::copy(&mut fd, &mut tempfile)?;
149    } else {
150        // Note that if there's no bootc-owned auth, then we force an empty authfile to ensure
151        // that podman doesn't fall back to searching the user-owned paths.
152        tempfile.write_all(b"{}")?;
153    }
154
155    let tempfile = tempfile
156        .into_inner()
157        .map_err(|e| e.into_error())?
158        .into_std();
159    let fd: Arc<OwnedFd> = std::sync::Arc::new(tempfile.into());
160    let target_fd = fds.take_fd(fd);
161    cmd.env("REGISTRY_AUTH_FILE", format!("/proc/self/fd/{target_fd}"));
162
163    Ok(())
164}
165
166// Initialize a `podman` subprocess with:
167// - storage overridden to point to to storage_root
168// - Authentication (auth.json) using the bootc/ostree owned auth
169fn new_podman_cmd_in(sysroot: &Dir, storage_root: &Dir, run_root: &Dir) -> Result<Command> {
170    let mut cmd = Command::new(bootc_utils::podman_bin());
171    let mut fds = CmdFds::new();
172    bind_storage_roots(&mut cmd, &mut fds, storage_root, run_root)?;
173    let run_root = format!("/proc/self/fd/{STORAGE_RUN_FD}");
174    cmd.args(["--root", STORAGE_ALIAS_DIR, "--runroot", run_root.as_str()]);
175    setup_auth(&mut cmd, &mut fds, sysroot)?;
176    cmd.take_fds(fds);
177    Ok(cmd)
178}
179
180/// Adjust the provided command (skopeo or podman e.g.) to reference
181/// the provided path as an additional image store.
182pub fn set_additional_image_store<'c>(
183    cmd: &'c mut Command,
184    ais: impl AsRef<Utf8Path>,
185) -> &'c mut Command {
186    let ais = ais.as_ref();
187    let storage_opt = format!("additionalimagestore={ais}");
188    cmd.env("STORAGE_OPTS", storage_opt)
189}
190
191/// Ensure that "podman" is the first thing to touch the global storage
192/// instance. This is a workaround for <https://github.com/bootc-dev/bootc/pull/1101#issuecomment-2653862974>
193/// Basically podman has special upgrade logic for when it is the first thing
194/// to initialize the c/storage instance it sets the networking to netavark.
195/// If it's not the first thing, then it assumes an upgrade scenario and we
196/// may be using CNI.
197///
198/// But this legacy path is triggered through us using skopeo, turning off netavark
199/// by default. Work around this by ensuring that /usr/bin/podman is
200/// always the first thing to touch c/storage (at least, when invoked by us).
201///
202/// Call this function any time we're going to write to containers-storage.
203pub(crate) fn ensure_floating_c_storage_initialized() {
204    if let Err(e) = Command::new(bootc_utils::podman_bin())
205        .args(["system", "info"])
206        .stdout(Stdio::null())
207        .run_capture_stderr()
208    {
209        // Out of conservatism we don't make this operation fatal right now.
210        // If something went wrong, then we'll probably fail on a later operation
211        // anyways.
212        tracing::warn!("Failed to query podman system info: {e}");
213    }
214}
215
216impl CStorage {
217    /// Create a `podman image` Command instance prepared to operate on our alternative
218    /// root.
219    pub(crate) fn new_image_cmd(&self) -> Result<Command> {
220        let mut r = new_podman_cmd_in(&self.sysroot, &self.storage_root, &self.run)?;
221        // We want to limit things to only manipulating images by default.
222        r.arg("image");
223        Ok(r)
224    }
225
226    fn init_globals() -> Result<()> {
227        // Ensure our global storage alias dir exists
228        std::fs::create_dir_all(STORAGE_ALIAS_DIR)
229            .with_context(|| format!("Creating {STORAGE_ALIAS_DIR}"))?;
230        Ok(())
231    }
232
233    /// Ensure that the LSM (SELinux) labels are set on the bootc-owned
234    /// containers-storage: instance. We use a `LABELED` stamp file for
235    /// idempotence.
236    #[context("Labeling imgstorage dirs")]
237    pub(crate) fn ensure_labeled(&self) -> Result<()> {
238        if self.storage_root.try_exists(LABELED)? {
239            return Ok(());
240        }
241        let Some(sepolicy) = self.sepolicy.as_ref() else {
242            return Ok(());
243        };
244
245        // recursively set the labels because they were previously set to usr_t,
246        // and there is no policy defined to set them to the c/storage labels
247        crate::lsm::relabel_recurse(
248            &self.storage_root,
249            ".",
250            Some(Utf8Path::new(SYS_CSTOR_PATH)),
251            sepolicy,
252        )
253        .context("labeling storage root")?;
254
255        // fsync so relabel writes are durable before creating the stamp file
256        rustix::fs::fsync(
257            self.storage_root
258                .reopen_as_ownedfd()
259                .context("Reopening as owned fd")?,
260        )
261        .context("fsync")?;
262
263        self.storage_root.create(LABELED)?;
264
265        // Label the stamp file itself to match the storage directory context
266        crate::lsm::relabel(
267            &self.storage_root,
268            &self.storage_root.symlink_metadata(LABELED)?,
269            LABELED.into(),
270            Some(&Utf8Path::new(SYS_CSTOR_PATH).join(LABELED)),
271            sepolicy,
272        )
273        .context("labeling stamp file")?;
274
275        // fsync to persist the stamp file entry
276        rustix::fs::fsync(
277            self.storage_root
278                .reopen_as_ownedfd()
279                .context("Reopening as owned fd")?,
280        )
281        .context("fsync")?;
282
283        Ok(())
284    }
285
286    #[context("Creating imgstorage")]
287    pub(crate) fn create(
288        sysroot: &Dir,
289        run: &Dir,
290        sepolicy: Option<&ostree::SePolicy>,
291    ) -> Result<Self> {
292        Self::init_globals()?;
293        let subpath = &Self::subpath();
294
295        // SAFETY: We know there's a parent
296        let parent = subpath.parent().unwrap();
297        let tmp = format!("{subpath}.tmp");
298        let existed = sysroot
299            .try_exists(subpath)
300            .with_context(|| format!("Querying {subpath}"))?;
301        if !existed {
302            sysroot.remove_all_optional(&tmp).context("Removing tmp")?;
303            sysroot
304                .create_dir_all(parent)
305                .with_context(|| format!("Creating {parent}"))?;
306            sysroot.create_dir_all(&tmp).context("Creating tmpdir")?;
307            let storage_root = sysroot.open_dir(&tmp).context("Open tmp")?;
308
309            // There's no explicit API to initialize a containers-storage:
310            // root, simply passing a path will attempt to auto-create it.
311            // We run "podman images" in the new root.
312            new_podman_cmd_in(&sysroot, &storage_root, &run)?
313                .stdout(Stdio::null())
314                .arg("images")
315                .run_capture_stderr()
316                .context("Initializing images")?;
317            drop(storage_root);
318            sysroot
319                .rename(&tmp, sysroot, subpath)
320                .context("Renaming tmpdir")?;
321            tracing::debug!("Created image store");
322        }
323
324        let s = Self::open(sysroot, run, sepolicy.cloned())?;
325        if existed {
326            // For pre-existing storage (e.g. on a booted system), ensure
327            // labels are correct now. For freshly created storage (e.g.
328            // during install), labeling is deferred until after all image
329            // pulls are complete via an explicit ensure_labeled() call.
330            s.ensure_labeled()?;
331        }
332        Ok(s)
333    }
334
335    #[context("Opening imgstorage")]
336    pub(crate) fn open(
337        sysroot: &Dir,
338        run: &Dir,
339        sepolicy: Option<ostree::SePolicy>,
340    ) -> Result<Self> {
341        tracing::trace!("Opening container image store");
342        Self::init_globals()?;
343        let subpath = &Self::subpath();
344        let storage_root = sysroot
345            .open_dir(subpath)
346            .with_context(|| format!("Opening {subpath}"))?;
347        // Always auto-create this if missing
348        run.create_dir_all(RUNROOT)
349            .with_context(|| format!("Creating {RUNROOT}"))?;
350        let run = run.open_dir(RUNROOT)?;
351        Ok(Self {
352            sysroot: sysroot.try_clone()?,
353            storage_root,
354            run,
355            sepolicy,
356            _unsync: Default::default(),
357        })
358    }
359
360    #[context("Listing images")]
361    pub(crate) async fn list_images(&self) -> Result<Vec<crate::podman::ImageListEntry>> {
362        let mut cmd = self.new_image_cmd()?;
363        cmd.args(["list", "--format=json"]);
364        cmd.stdin(Stdio::null());
365        // It's maximally convenient for us to just pipe the whole output to a tempfile
366        let mut stdout = tempfile::tempfile()?;
367        cmd.stdout(stdout.try_clone()?);
368        // Allocate stderr, which is passed to the status checker
369        let stderr = tempfile::tempfile()?;
370        cmd.stderr(stderr.try_clone()?);
371
372        // Spawn the child and wait
373        AsyncCommand::from(cmd)
374            .status()
375            .await?
376            .check_status_with_stderr(stderr)?;
377        // Spawn a helper thread to avoid blocking the main thread
378        // parsing JSON.
379        tokio::task::spawn_blocking(move || -> Result<_> {
380            stdout.seek(std::io::SeekFrom::Start(0))?;
381            let stdout = std::io::BufReader::new(stdout);
382            let r = serde_json::from_reader(stdout)?;
383            Ok(r)
384        })
385        .await?
386    }
387
388    #[context("Pruning")]
389    pub(crate) async fn prune_except_roots(&self, roots: &HashSet<&str>) -> Result<Vec<String>> {
390        let all_images = self.list_images().await?;
391        tracing::debug!("Images total: {}", all_images.len(),);
392        let mut garbage = Vec::new();
393        for image in all_images {
394            if image
395                .names
396                .iter()
397                .flatten()
398                .all(|name| !roots.contains(name.as_str()))
399            {
400                garbage.push(image.id);
401            }
402        }
403        tracing::debug!("Images to prune: {}", garbage.len());
404        for garbage in garbage.chunks(SUBCMD_ARGV_CHUNKING) {
405            let mut cmd = self.new_image_cmd()?;
406            cmd.stdin(Stdio::null());
407            cmd.stdout(Stdio::null());
408            cmd.arg("rm");
409            cmd.args(garbage);
410            AsyncCommand::from(cmd).run().await?;
411        }
412        Ok(garbage)
413    }
414
415    /// Return true if the image exists in the storage.
416    pub(crate) async fn exists(&self, image: &str) -> Result<bool> {
417        // Sadly https://docs.rs/containers-image-proxy/latest/containers_image_proxy/struct.ImageProxy.html#method.open_image_optional
418        // doesn't work with containers-storage yet
419        let mut cmd = AsyncCommand::from(self.new_image_cmd()?);
420        cmd.args(["exists", image]);
421        Ok(cmd.status().await?.success())
422    }
423
424    /// Fetch the image if it is not already present; return whether
425    /// or not the image was fetched.
426    pub(crate) async fn pull(&self, image: &str, mode: PullMode) -> Result<bool> {
427        match mode {
428            PullMode::IfNotExists => {
429                if self.exists(image).await? {
430                    tracing::debug!("Image is already present: {image}");
431                    return Ok(false);
432                }
433            }
434            PullMode::Always => {}
435        };
436        let mut cmd = self.new_image_cmd()?;
437        cmd.stdin(Stdio::null());
438        cmd.stdout(Stdio::null());
439        cmd.args(["pull", image]);
440        tracing::debug!("Pulling image: {image}");
441        let mut cmd = AsyncCommand::from(cmd);
442        cmd.run().await.context("Failed to pull image")?;
443        Ok(true)
444    }
445
446    /// Copy an image from the default container storage (/var/lib/containers/)
447    /// to this storage.
448    #[context("Pulling from host storage: {image}")]
449    pub(crate) async fn pull_from_host_storage(&self, image: &str) -> Result<()> {
450        let mut cmd = Command::new(bootc_utils::podman_bin());
451        cmd.stdin(Stdio::null());
452        cmd.stdout(Stdio::null());
453        // An ephemeral place for the transient state;
454        let temp_runroot = TempDir::new(cap_std::ambient_authority())?;
455        let mut fds = CmdFds::new();
456        bind_storage_roots(&mut cmd, &mut fds, &self.storage_root, &temp_runroot)?;
457        cmd.take_fds(fds);
458
459        // The destination (target stateroot) + container storage dest
460        let storage_dest = &format!(
461            "containers-storage:[overlay@{STORAGE_ALIAS_DIR}+/proc/self/fd/{STORAGE_RUN_FD}]"
462        );
463        cmd.args(["image", "push", "--remove-signatures", image])
464            .arg(format!("{storage_dest}{image}"));
465        let mut cmd = AsyncCommand::from(cmd);
466        cmd.run().await?;
467        temp_runroot.close()?;
468        Ok(())
469    }
470
471    /// Pull an image with streaming progress display.
472    ///
473    /// Uses the podman native libpod HTTP API instead of shelling out,
474    /// enabling per-blob download progress. Registry auth is handled
475    /// via `REGISTRY_AUTH_FILE` on the podman service process.
476    ///
477    /// Always pulls (policy=always) so updated digests are fetched
478    /// even if an image with the same tag exists locally.
479    pub(crate) async fn pull_with_progress(&self, image: &str) -> Result<()> {
480        let client = crate::podman_client::PodmanClient::connect(
481            &self.sysroot,
482            &self.storage_root,
483            &self.run,
484        )
485        .await?;
486        client.pull_with_progress(image).await
487    }
488
489    pub(crate) fn subpath() -> Utf8PathBuf {
490        Utf8Path::new(crate::store::BOOTC_ROOT).join(SUBPATH)
491    }
492}
493
#[cfg(test)]
mod tests {
    use super::*;
    // Compile-time assertion that `CStorage` does not implement `Sync`,
    // guarding the intent of its `_unsync` marker field.
    static_assertions::assert_not_impl_any!(CStorage: Sync);
}