From 5f832b69a0e9983b8c8f32ae7dc19ad08c03c3e5 Mon Sep 17 00:00:00 2001 From: Zeeshan Lakhani Date: Wed, 6 May 2026 02:50:13 +0000 Subject: [PATCH] [opte] better support for unpublished xde via tools/opte_version_override OPTE has both a userland (oxide-vpc, opte-ioctl) pinned in Cargo.toml and a kernel module (xde) installed via the helios pkg repo. When the two need to be tested in lockstep before xde lands in a helios build, the existing override logic did not capture all requirements and swapping in an unpublished build was not seamless, particularly on CI. This builds out the override mechanism around the existing `tools/opte_version_override` stub. Includes: - install_opte.sh, deploy.sh, and releng image builds source `tools/opte_version_override` and, when OPTE_COMMIT is set, install from the buildomat-published p5p (or curl + rem_drv/add_drv on the deploy ramdisk, which has no pkg(5)) instead of the helios pkg repo. - package.sh tarballs `tools/opte_version_override` so that downstream jobs read the same value. - check-opte-ver.yml adds a check-opte-override job that fails any PR targeting main with a non-empty OPTE_COMMIT, so the override can't accidentally leak on main. - ci_check_opte_ver.sh now enforces OPTE_COMMIT == Cargo dependency rev so kernel/userland ABI drift surfaces at PR time. The default is OPTE_COMMIT="", which doesn't apply this override logic. We also fix a stale "expected lab-opte-0.$API_VER" error message in ci_check_opte_ver.sh. The check itself uses "lab-2.0-opte-0.$API_VER". This work was extracted from multicast work that (at times) has needed an unpublished xde for kernel-side multicast integration(s). --- .github/buildomat/jobs/deploy.sh | 36 +++++--- .github/buildomat/jobs/package.sh | 2 + .github/workflows/check-opte-ver.yml | 23 +++++- dev-tools/releng/src/main.rs | 118 ++++++++++++++++++++++++++- tools/ci_check_opte_ver.sh | 18 +++- tools/install_opte.sh | 64 ++++++++++----- tools/opte_version_override | 23 +++++- 7 files changed, 236 insertions(+), 48 deletions(-) diff --git a/.github/buildomat/jobs/deploy.sh b/.github/buildomat/jobs/deploy.sh index 7a3ac17b054..34a6f066efa 100755 --- a/.github/buildomat/jobs/deploy.sh +++ b/.github/buildomat/jobs/deploy.sh @@ -33,6 +33,10 @@ _exit_trap() { local status=$? set +o errexit + # Restore the override opteadm from /tmp before debug-evidence + # collection runs, in case anything earlier in this trap or any + # downstream tooling has overwritten the installed binary. The + # /tmp copy is saved by the OPTE_COMMIT install block below. if [[ "x$OPTE_COMMIT" != "x" ]]; then pfexec cp /tmp/opteadm /opt/oxide/opte/bin/opteadm fi @@ -134,19 +138,6 @@ z_swadm () { pfexec zlogin oxz_switch /opt/oxide/dendrite/bin/swadm $@ } -# only set this if you want to override the version of opte/xde installed by the -# install_opte.sh script -OPTE_COMMIT="" -if [[ "x$OPTE_COMMIT" != "x" ]]; then - curl -sSfOL https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte/module/$OPTE_COMMIT/xde - pfexec rem_drv xde || true - pfexec mv xde /kernel/drv/amd64/xde - pfexec add_drv xde || true - curl -sSfOL https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte/release/$OPTE_COMMIT/opteadm - chmod +x opteadm - cp opteadm /tmp/opteadm - pfexec mv opteadm /opt/oxide/opte/bin/opteadm -fi # # XXX work around 14537 (UFS should not allow directories to be unlinked) which @@ -197,6 +188,25 @@ ptime -m tar xvzf /input/package/work/package.tar.gz # shellcheck source=/dev/null source .github/buildomat/ci-env.sh +# Source the OPTE override (if any) from the canonical location and apply it. +# +# When set, download the xde driver and opteadm directly from buildomat and +# swap them in. The deploy target is a ramdisk image without pkg(5), so we +# use rem_drv/add_drv instead of the p5p approach used by install_opte.sh +# and releng. +# shellcheck source=/dev/null +source tools/opte_version_override +if [[ "x$OPTE_COMMIT" != "x" ]]; then + curl -sSfOL "https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte/module/$OPTE_COMMIT/xde" + pfexec rem_drv xde || true + pfexec mv xde /kernel/drv/amd64/xde + pfexec add_drv xde || true + curl -sSfOL "https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte/release/$OPTE_COMMIT/opteadm" + chmod +x opteadm + cp opteadm /tmp/opteadm + pfexec mv opteadm /opt/oxide/opte/bin/opteadm +fi + # Ask buildomat for the range of extra addresses that we're allowed to use, and # break them up into the ranges we need. diff --git a/.github/buildomat/jobs/package.sh b/.github/buildomat/jobs/package.sh index b43b91e9ec4..78df41dc5f6 100755 --- a/.github/buildomat/jobs/package.sh +++ b/.github/buildomat/jobs/package.sh @@ -60,5 +60,7 @@ files=( target/release/xtask target/debug/bootstrap tests/* + tools/opte_version + tools/opte_version_override ) ptime -m tar cvzf $WORK/package.tar.gz "${files[@]}" "${packages[@]}" diff --git a/.github/workflows/check-opte-ver.yml b/.github/workflows/check-opte-ver.yml index e516eeacbe6..d5de57270d3 100644 --- a/.github/workflows/check-opte-ver.yml +++ b/.github/workflows/check-opte-ver.yml @@ -1,10 +1,10 @@ name: check-opte-ver on: + # Run on every PR targeting main, regardless of which paths changed, as the + # override file may have been set in an earlier commit on the branch. So, + # we must check on every PR rather than only when tracked paths change. pull_request: - paths: - - '.github/workflows/check-opte-ver.yml' - - 'Cargo.toml' - - 'tools/opte_version' + branches: [main] jobs: check-opte-ver: runs-on: ubuntu-22.04 @@ -18,3 +18,18 @@ jobs: run: cargo install toml-cli@0.2.3 - name: Check OPTE version and rev match run: ./tools/ci_check_opte_ver.sh + + check-opte-override: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + ref: ${{ github.event.pull_request.head.sha }} # see omicron#4461 + - name: Reject OPTE override on main + run: | + source tools/opte_version_override + if [[ "x$OPTE_COMMIT" != "x" ]]; then + echo "::error::OPTE_COMMIT is set in tools/opte_version_override." + echo "::error::The OPTE override must be cleared before merging to main." + exit 1 + fi diff --git a/dev-tools/releng/src/main.rs b/dev-tools/releng/src/main.rs index 874f25e3fe7..2177b6c8cda 100644 --- a/dev-tools/releng/src/main.rs +++ b/dev-tools/releng/src/main.rs @@ -270,6 +270,18 @@ async fn main() -> Result<()> { let opte_version = fs::read_to_string(WORKSPACE_DIR.join("tools/opte_version")).await?; + // Parse tools/opte_version_override for OPTE_COMMIT. When set, we + // download the override p5p from buildomat and use it as a package + // source during image build instead of the helios pkg repo version. + let opte_override = parse_opte_version_override( + &WORKSPACE_DIR.join("tools/opte_version_override"), + ) + .await?; + if let Some(ov) = &opte_override { + info!(logger, "OPTE override active: commit={}", ov.commit); + } + let opte_version = opte_version.trim(); + let client = reqwest::ClientBuilder::new() .connect_timeout(Duration::from_secs(15)) .timeout(Duration::from_secs(120)) @@ -617,7 +629,7 @@ async fn main() -> Result<()> { .arg("-o") // output directory for image .arg(args.output_dir.join(format!("os-{}", target))) .arg("-F") // pass extra image builder features - .arg(format!("optever={}", opte_version.trim())) + .arg(format!("optever={opte_version}")) .arg("-P") // include all files from extra proto area .arg(proto_dir.join("root")) .arg("-N") // image name @@ -675,11 +687,33 @@ async fn main() -> Result<()> { .arg(format!("helios-dev={HELIOS_PKGREPO}")) } - // helios-build experiment-image - jobs.push_command(format!("{}-image", target), image_cmd) + // When OPTE_COMMIT is set, download the override p5p from buildomat + // and add it as a package source for the image build. + if let Some(ov) = &opte_override { + let p5p_path = tempdir.path().join(format!("opte-{target}.p5p")); + let commit = ov.commit.clone(); + let dest = p5p_path.clone(); + let cl = client.clone(); + let log = logger.clone(); + jobs.push( + format!("{target}-opte-p5p"), + download_opte_p5p(log, cl, commit, dest), + ); + + image_cmd = image_cmd + .arg("-p") + .arg(format!("helios-dev=file://{p5p_path}")); + } + + let image_job = jobs + .push_command(format!("{target}-image"), image_cmd) .after("helios-setup") .after("helios-incorp") - .after(format!("{}-proto", target)); + .after(format!("{target}-proto")); + + if opte_override.is_some() { + image_job.after(format!("{target}-opte-p5p")); + } } // Build the recovery target after we build the host target. Only one // of these will build at a time since Cargo locks its target directory; @@ -887,6 +921,82 @@ async fn build_proto_area( Ok(()) } +/// Parsed contents of `tools/opte_version_override` when an override is active. +struct OpteOverride { + commit: String, +} + +/// Parse `tools/opte_version_override` for `OPTE_COMMIT`. Returns `None` if +/// `OPTE_COMMIT` is unset or empty. Errors if more than one non-comment +/// `OPTE_COMMIT=` assignment is present, since a stale line above the active +/// one would otherwise win silently. +async fn parse_opte_version_override( + path: &Utf8PathBuf, +) -> Result> { + let contents = fs::read_to_string(path) + .await + .context("failed to read tools/opte_version_override")?; + + let assignments: Vec<&str> = contents + .lines() + .filter_map(|line| line.trim().strip_prefix("OPTE_COMMIT=")) + .map(|val| val.trim_matches(|c| c == '"' || c == '\'')) + .collect(); + + if assignments.len() > 1 { + bail!( + "tools/opte_version_override contains {} OPTE_COMMIT \ + assignments (expected at most 1)", + assignments.len() + ); + } + + Ok(assignments + .into_iter() + .find(|val| !val.is_empty()) + .map(|commit| OpteOverride { commit: commit.to_string() })) +} + +const OPTE_BUILDOMAT_BASE: &str = + "https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte"; + +/// Download the OPTE override p5p archive from buildomat. +async fn download_opte_p5p( + logger: Logger, + client: reqwest::Client, + commit: String, + dest: Utf8PathBuf, +) -> Result<()> { + let url = format!("{OPTE_BUILDOMAT_BASE}/repo/{commit}/opte.p5p"); + info!(logger, "downloading OPTE override p5p from {url}"); + for attempt in 1..=RETRY_ATTEMPTS { + let result = async { + let response = client.get(&url).send().await?.error_for_status()?; + let bytes = response.bytes().await?; + fs::write(&dest, &bytes).await?; + Ok::<_, anyhow::Error>(()) + } + .await; + + match result { + Ok(()) => { + info!(logger, "downloaded OPTE p5p to {dest}"); + return Ok(()); + } + Err(err) => { + if attempt == RETRY_ATTEMPTS { + return Err(err).with_context(|| { + format!("failed to download OPTE p5p from {url}") + }); + } + info!(logger, "retrying OPTE p5p download (attempt {attempt})"); + } + } + } + + bail!("failed to download OPTE p5p after {RETRY_ATTEMPTS} attempts") +} + async fn host_add_root_profile(host_proto_root: Utf8PathBuf) -> Result<()> { fs::create_dir_all(&host_proto_root).await?; fs::write( diff --git a/tools/ci_check_opte_ver.sh b/tools/ci_check_opte_ver.sh index 4f1e099d9ea..a721370123e 100755 --- a/tools/ci_check_opte_ver.sh +++ b/tools/ci_check_opte_ver.sh @@ -2,9 +2,6 @@ set -euo pipefail source tools/opte_version_override -if [[ "x$OPTE_COMMIT" != "x" ]]; then - exit 0 -fi # Grab all the oxidecomputer/opte dependencies' revisions readarray -t opte_deps_revs < <(toml get Cargo.toml workspace.dependencies | jq -r 'to_entries | .[] | select(.value.git? | contains("oxidecomputer/opte")?) | .value.rev') @@ -19,6 +16,19 @@ for rev in "${opte_deps_revs[@]}"; do fi done +# When an OPTE override is active, the kernel binary is built from +# $OPTE_COMMIT while the userland (opte-ioctl, oxide-vpc) is built from +# the Cargo dep revision. They must match, otherwise kernel/userland +# ABI drift surfaces as opaque ioctl failures at runtime. +if [[ "x$OPTE_COMMIT" != "x" ]]; then + if [ "$OPTE_REV" != "$OPTE_COMMIT" ]; then + echo "OPTE override mismatch:" + echo " Cargo.toml deps: $OPTE_REV" + echo " tools/opte_version_override: $OPTE_COMMIT" + exit 1 + fi +fi + # Grab the API version for this revision API_VER=$(curl -s https://raw.githubusercontent.com/oxidecomputer/opte/"$OPTE_REV"/crates/opte-api/src/lib.rs | sed -n 's/pub const API_VERSION: u64 = \([0-9]*\);/\1/p') @@ -71,6 +81,6 @@ BUILDOMAT_DEPLOY_TARGET=$(cat .github/buildomat/jobs/deploy.sh | sed -n 's/#:[ ] if [ "lab-2.0-opte-0.$API_VER" != "$BUILDOMAT_DEPLOY_TARGET" ]; then echo "OPTE version mismatch:" echo "Cargo.toml: $OPTE_REV ($OPTE_VER)" - echo "buildomat deploy job: $BUILDOMAT_DEPLOY_TARGET (expected lab-opte-0.$API_VER)" + echo "buildomat deploy job: $BUILDOMAT_DEPLOY_TARGET (expected lab-2.0-opte-0.$API_VER)" exit 1 fi diff --git a/tools/install_opte.sh b/tools/install_opte.sh index d56523764d9..9209eee15af 100755 --- a/tools/install_opte.sh +++ b/tools/install_opte.sh @@ -51,6 +51,12 @@ fi # Grab the version of the opte package to install OPTE_VERSION="$(cat "$OMICRON_TOP/tools/opte_version")" +# Source the OPTE override (if any). When OPTE_COMMIT is set, the desired +# OPTE version isn't published to the helios pkg repo yet, so we install +# directly from the override p5p built by OPTE CI on buildomat (see the +# branch on $OPTE_COMMIT below). +source "$OMICRON_TOP/tools/opte_version_override" + OMICRON_FROZEN_PKG_COMMENT="OMICRON-PINNED-PACKAGE" # Once we install, we mark the package as frozen at that particular version. @@ -71,22 +77,48 @@ if PKG_FROZEN=$(pkg freeze | grep driver/network/opte); then pfexec pkg unfreeze driver/network/opte fi -# Actually install the xde kernel module and opteadm tool -RC=0 -pfexec pkg install -v pkg://helios-dev/driver/network/opte@"$OPTE_VERSION" || RC=$? -if [[ "$RC" -eq 0 ]]; then - echo "xde driver installed successfully" -elif [[ "$RC" -eq 4 ]]; then - echo "Correct xde driver already installed" +if [[ "x$OPTE_COMMIT" != "x" ]]; then + # Install from the override p5p archive built by OPTE CI. The p5p + # contains exactly one version (built from $OPTE_COMMIT), which + # generally won't match the canonical $OPTE_VERSION, so we let pkg + # pick the version from the p5p rather than pinning here. + P5P_URL="https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte/repo/$OPTE_COMMIT/opte.p5p" + P5P_PATH="/tmp/opte-override.p5p" + echo "Downloading override p5p from $P5P_URL" + curl -fL -o "$P5P_PATH" "$P5P_URL" + + RC=0 + pfexec pkg install -g "$P5P_PATH" driver/network/opte || RC=$? + if [[ "$RC" -eq 0 ]]; then + echo "xde driver installed from override p5p" + elif [[ "$RC" -eq 4 ]]; then + echo "Correct xde driver already installed" + else + echo "Installing xde driver from override p5p failed" + exit "$RC" + fi + rm -f "$P5P_PATH" else - echo "Installing xde driver failed" - exit "$RC" + # Install the published version from the helios pkg repo. + RC=0 + pfexec pkg install -v pkg://helios-dev/driver/network/opte@"$OPTE_VERSION" || RC=$? + if [[ "$RC" -eq 0 ]]; then + echo "xde driver installed successfully" + elif [[ "$RC" -eq 4 ]]; then + echo "Correct xde driver already installed" + else + echo "Installing xde driver failed" + exit "$RC" + fi fi +# Discover the actually-installed version (may differ from $OPTE_VERSION +# when an override is active) and freeze at that. +INSTALLED_VERSION=$(pkg info -l driver/network/opte | awk '/^[[:space:]]*Version:/ {print $2}') RC=0 -pfexec pkg freeze -c "$OMICRON_FROZEN_PKG_COMMENT" driver/network/opte@"$OPTE_VERSION" || RC=$? +pfexec pkg freeze -c "$OMICRON_FROZEN_PKG_COMMENT" "driver/network/opte@$INSTALLED_VERSION" || RC=$? if [[ "$RC" -ne 0 ]]; then - echo "Failed to pin opte package to $OPTE_VERSION" + echo "Failed to pin opte package to $INSTALLED_VERSION" exit $RC fi @@ -97,13 +129,3 @@ if [[ "$RC" -ne 0 ]]; then echo "The \`opteadm\` administration tool is not on your path." echo "You may add \"/opt/oxide/opte/bin\" to your path to access it." fi - -source $OMICRON_TOP/tools/opte_version_override - -if [[ "x$OPTE_COMMIT" != "x" ]]; then - set +x - curl -fOL https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte/module/$OPTE_COMMIT/xde - pfexec rem_drv xde || true - pfexec mv xde /kernel/drv/amd64/xde - pfexec add_drv xde || true -fi diff --git a/tools/opte_version_override b/tools/opte_version_override index 8d57f7ae9f4..288a0febf15 100644 --- a/tools/opte_version_override +++ b/tools/opte_version_override @@ -1,5 +1,24 @@ #!/usr/bin/env bash -# only set this if you want to override the version of opte/xde installed by the -# install_opte.sh script +# Override for using an unpublished OPTE version. When OPTE_COMMIT is set, +# the override p5p package is downloaded from buildomat and used instead of +# the version published in the helios pkg repo. The p5p is built by the +# opte-p5p buildomat job and published at: +# https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte/repo/{commit}/opte.p5p +# +# Consumers: +# - install_opte.sh installs directly from the override p5p +# - releng image builds use extra_packages with the p5p as a pkg source +# - deploy.sh installs from the override p5p on the running system +# - ci_check_opte_ver.sh enforces OPTE_COMMIT == Cargo dep rev +# +# To activate: set OPTE_COMMIT to the git commit hash of the OPTE build. +# OPTE_COMMIT must equal the `rev` pinned for `oxide-vpc` / `opte-ioctl` in +# Cargo.toml. The kernel module is built from OPTE_COMMIT and the userland +# (opte-ioctl, oxide-vpc) is built from the Cargo dep rev. If these hashes +# diverge, kernel/userland ABI drift surfaces as opaque ioctl failures at +# runtime. Even kernel-only fixes (logging, perf, internal refactors) must come +# with a matching Cargo bump. +# +# To deactivate (once the new version is published): set OPTE_COMMIT to "". OPTE_COMMIT=""