diff --git a/.github/buildomat/jobs/deploy.sh b/.github/buildomat/jobs/deploy.sh index 7a3ac17b054..34a6f066efa 100755 --- a/.github/buildomat/jobs/deploy.sh +++ b/.github/buildomat/jobs/deploy.sh @@ -33,6 +33,10 @@ _exit_trap() { local status=$? set +o errexit + # Restore the override opteadm from /tmp before debug-evidence + # collection runs, in case anything earlier in this trap or any + # downstream tooling has overwritten the installed binary. The + # /tmp copy is saved by the OPTE_COMMIT install block below. if [[ "x$OPTE_COMMIT" != "x" ]]; then pfexec cp /tmp/opteadm /opt/oxide/opte/bin/opteadm fi @@ -134,19 +138,6 @@ z_swadm () { pfexec zlogin oxz_switch /opt/oxide/dendrite/bin/swadm $@ } -# only set this if you want to override the version of opte/xde installed by the -# install_opte.sh script -OPTE_COMMIT="" -if [[ "x$OPTE_COMMIT" != "x" ]]; then - curl -sSfOL https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte/module/$OPTE_COMMIT/xde - pfexec rem_drv xde || true - pfexec mv xde /kernel/drv/amd64/xde - pfexec add_drv xde || true - curl -sSfOL https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte/release/$OPTE_COMMIT/opteadm - chmod +x opteadm - cp opteadm /tmp/opteadm - pfexec mv opteadm /opt/oxide/opte/bin/opteadm -fi # # XXX work around 14537 (UFS should not allow directories to be unlinked) which @@ -197,6 +188,25 @@ ptime -m tar xvzf /input/package/work/package.tar.gz # shellcheck source=/dev/null source .github/buildomat/ci-env.sh +# Source the OPTE override (if any) from the canonical location and apply it. +# +# When set, download the xde driver and opteadm directly from buildomat and +# swap them in. The deploy target is a ramdisk image without pkg(5), so we +# use rem_drv/add_drv instead of the p5p approach used by install_opte.sh +# and releng. +# shellcheck source=/dev/null +source tools/opte_version_override +if [[ "x$OPTE_COMMIT" != "x" ]]; then + curl -sSfOL "https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte/module/$OPTE_COMMIT/xde" + pfexec rem_drv xde || true + pfexec mv xde /kernel/drv/amd64/xde + pfexec add_drv xde || true + curl -sSfOL "https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte/release/$OPTE_COMMIT/opteadm" + chmod +x opteadm + cp opteadm /tmp/opteadm + pfexec mv opteadm /opt/oxide/opte/bin/opteadm +fi + # Ask buildomat for the range of extra addresses that we're allowed to use, and # break them up into the ranges we need. diff --git a/.github/buildomat/jobs/package.sh b/.github/buildomat/jobs/package.sh index b43b91e9ec4..78df41dc5f6 100755 --- a/.github/buildomat/jobs/package.sh +++ b/.github/buildomat/jobs/package.sh @@ -60,5 +60,7 @@ files=( target/release/xtask target/debug/bootstrap tests/* + tools/opte_version + tools/opte_version_override ) ptime -m tar cvzf $WORK/package.tar.gz "${files[@]}" "${packages[@]}" diff --git a/.github/workflows/check-opte-ver.yml b/.github/workflows/check-opte-ver.yml index e516eeacbe6..d5de57270d3 100644 --- a/.github/workflows/check-opte-ver.yml +++ b/.github/workflows/check-opte-ver.yml @@ -1,10 +1,10 @@ name: check-opte-ver on: + # Run on every PR targeting main, regardless of which paths changed, as the + # override file may have been set in an earlier commit on the branch. So, + # we must check on every PR rather than only when tracked paths change. pull_request: - paths: - - '.github/workflows/check-opte-ver.yml' - - 'Cargo.toml' - - 'tools/opte_version' + branches: [main] jobs: check-opte-ver: runs-on: ubuntu-22.04 @@ -18,3 +18,18 @@ jobs: run: cargo install toml-cli@0.2.3 - name: Check OPTE version and rev match run: ./tools/ci_check_opte_ver.sh + + check-opte-override: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + ref: ${{ github.event.pull_request.head.sha }} # see omicron#4461 + - name: Reject OPTE override on main + run: | + source tools/opte_version_override + if [[ "x$OPTE_COMMIT" != "x" ]]; then + echo "::error::OPTE_COMMIT is set in tools/opte_version_override." + echo "::error::The OPTE override must be cleared before merging to main." + exit 1 + fi diff --git a/dev-tools/releng/src/main.rs b/dev-tools/releng/src/main.rs index 874f25e3fe7..2177b6c8cda 100644 --- a/dev-tools/releng/src/main.rs +++ b/dev-tools/releng/src/main.rs @@ -270,6 +270,18 @@ async fn main() -> Result<()> { let opte_version = fs::read_to_string(WORKSPACE_DIR.join("tools/opte_version")).await?; + // Parse tools/opte_version_override for OPTE_COMMIT. When set, we + // download the override p5p from buildomat and use it as a package + // source during image build instead of the helios pkg repo version. + let opte_override = parse_opte_version_override( + &WORKSPACE_DIR.join("tools/opte_version_override"), + ) + .await?; + if let Some(ov) = &opte_override { + info!(logger, "OPTE override active: commit={}", ov.commit); + } + let opte_version = opte_version.trim(); + let client = reqwest::ClientBuilder::new() .connect_timeout(Duration::from_secs(15)) .timeout(Duration::from_secs(120)) @@ -617,7 +629,7 @@ async fn main() -> Result<()> { .arg("-o") // output directory for image .arg(args.output_dir.join(format!("os-{}", target))) .arg("-F") // pass extra image builder features - .arg(format!("optever={}", opte_version.trim())) + .arg(format!("optever={opte_version}")) .arg("-P") // include all files from extra proto area .arg(proto_dir.join("root")) .arg("-N") // image name @@ -675,11 +687,33 @@ async fn main() -> Result<()> { .arg(format!("helios-dev={HELIOS_PKGREPO}")) } - // helios-build experiment-image - jobs.push_command(format!("{}-image", target), image_cmd) + // When OPTE_COMMIT is set, download the override p5p from buildomat + // and add it as a package source for the image build. + if let Some(ov) = &opte_override { + let p5p_path = tempdir.path().join(format!("opte-{target}.p5p")); + let commit = ov.commit.clone(); + let dest = p5p_path.clone(); + let cl = client.clone(); + let log = logger.clone(); + jobs.push( + format!("{target}-opte-p5p"), + download_opte_p5p(log, cl, commit, dest), + ); + + image_cmd = image_cmd + .arg("-p") + .arg(format!("helios-dev=file://{p5p_path}")); + } + + let image_job = jobs + .push_command(format!("{target}-image"), image_cmd) .after("helios-setup") .after("helios-incorp") - .after(format!("{}-proto", target)); + .after(format!("{target}-proto")); + + if opte_override.is_some() { + image_job.after(format!("{target}-opte-p5p")); + } } // Build the recovery target after we build the host target. Only one // of these will build at a time since Cargo locks its target directory; @@ -887,6 +921,82 @@ async fn build_proto_area( Ok(()) } +/// Parsed contents of `tools/opte_version_override` when an override is active. +struct OpteOverride { + commit: String, +} + +/// Parse `tools/opte_version_override` for `OPTE_COMMIT`. Returns `None` if +/// `OPTE_COMMIT` is unset or empty. Errors if more than one non-comment +/// `OPTE_COMMIT=` assignment is present, since a stale line above the active +/// one would otherwise win silently. +async fn parse_opte_version_override( + path: &Utf8PathBuf, +) -> Result> { + let contents = fs::read_to_string(path) + .await + .context("failed to read tools/opte_version_override")?; + + let assignments: Vec<&str> = contents + .lines() + .filter_map(|line| line.trim().strip_prefix("OPTE_COMMIT=")) + .map(|val| val.trim_matches(|c| c == '"' || c == '\'')) + .collect(); + + if assignments.len() > 1 { + bail!( + "tools/opte_version_override contains {} OPTE_COMMIT \ + assignments (expected at most 1)", + assignments.len() + ); + } + + Ok(assignments + .into_iter() + .find(|val| !val.is_empty()) + .map(|commit| OpteOverride { commit: commit.to_string() })) +} + +const OPTE_BUILDOMAT_BASE: &str = + "https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte"; + +/// Download the OPTE override p5p archive from buildomat. +async fn download_opte_p5p( + logger: Logger, + client: reqwest::Client, + commit: String, + dest: Utf8PathBuf, +) -> Result<()> { + let url = format!("{OPTE_BUILDOMAT_BASE}/repo/{commit}/opte.p5p"); + info!(logger, "downloading OPTE override p5p from {url}"); + for attempt in 1..=RETRY_ATTEMPTS { + let result = async { + let response = client.get(&url).send().await?.error_for_status()?; + let bytes = response.bytes().await?; + fs::write(&dest, &bytes).await?; + Ok::<_, anyhow::Error>(()) + } + .await; + + match result { + Ok(()) => { + info!(logger, "downloaded OPTE p5p to {dest}"); + return Ok(()); + } + Err(err) => { + if attempt == RETRY_ATTEMPTS { + return Err(err).with_context(|| { + format!("failed to download OPTE p5p from {url}") + }); + } + info!(logger, "retrying OPTE p5p download (attempt {attempt})"); + } + } + } + + bail!("failed to download OPTE p5p after {RETRY_ATTEMPTS} attempts") +} + async fn host_add_root_profile(host_proto_root: Utf8PathBuf) -> Result<()> { fs::create_dir_all(&host_proto_root).await?; fs::write( diff --git a/tools/ci_check_opte_ver.sh b/tools/ci_check_opte_ver.sh index 4f1e099d9ea..a721370123e 100755 --- a/tools/ci_check_opte_ver.sh +++ b/tools/ci_check_opte_ver.sh @@ -2,9 +2,6 @@ set -euo pipefail source tools/opte_version_override -if [[ "x$OPTE_COMMIT" != "x" ]]; then - exit 0 -fi # Grab all the oxidecomputer/opte dependencies' revisions readarray -t opte_deps_revs < <(toml get Cargo.toml workspace.dependencies | jq -r 'to_entries | .[] | select(.value.git? | contains("oxidecomputer/opte")?) | .value.rev') @@ -19,6 +16,19 @@ for rev in "${opte_deps_revs[@]}"; do fi done +# When an OPTE override is active, the kernel binary is built from +# $OPTE_COMMIT while the userland (opte-ioctl, oxide-vpc) is built from +# the Cargo dep revision. They must match, otherwise kernel/userland +# ABI drift surfaces as opaque ioctl failures at runtime. +if [[ "x$OPTE_COMMIT" != "x" ]]; then + if [ "$OPTE_REV" != "$OPTE_COMMIT" ]; then + echo "OPTE override mismatch:" + echo " Cargo.toml deps: $OPTE_REV" + echo " tools/opte_version_override: $OPTE_COMMIT" + exit 1 + fi +fi + # Grab the API version for this revision API_VER=$(curl -s https://raw.githubusercontent.com/oxidecomputer/opte/"$OPTE_REV"/crates/opte-api/src/lib.rs | sed -n 's/pub const API_VERSION: u64 = \([0-9]*\);/\1/p') @@ -71,6 +81,6 @@ BUILDOMAT_DEPLOY_TARGET=$(cat .github/buildomat/jobs/deploy.sh | sed -n 's/#:[ ] if [ "lab-2.0-opte-0.$API_VER" != "$BUILDOMAT_DEPLOY_TARGET" ]; then echo "OPTE version mismatch:" echo "Cargo.toml: $OPTE_REV ($OPTE_VER)" - echo "buildomat deploy job: $BUILDOMAT_DEPLOY_TARGET (expected lab-opte-0.$API_VER)" + echo "buildomat deploy job: $BUILDOMAT_DEPLOY_TARGET (expected lab-2.0-opte-0.$API_VER)" exit 1 fi diff --git a/tools/install_opte.sh b/tools/install_opte.sh index d56523764d9..9209eee15af 100755 --- a/tools/install_opte.sh +++ b/tools/install_opte.sh @@ -51,6 +51,12 @@ fi # Grab the version of the opte package to install OPTE_VERSION="$(cat "$OMICRON_TOP/tools/opte_version")" +# Source the OPTE override (if any). When OPTE_COMMIT is set, the desired +# OPTE version isn't published to the helios pkg repo yet, so we install +# directly from the override p5p built by OPTE CI on buildomat (see the +# branch on $OPTE_COMMIT below). +source "$OMICRON_TOP/tools/opte_version_override" + OMICRON_FROZEN_PKG_COMMENT="OMICRON-PINNED-PACKAGE" # Once we install, we mark the package as frozen at that particular version. @@ -71,22 +77,48 @@ if PKG_FROZEN=$(pkg freeze | grep driver/network/opte); then pfexec pkg unfreeze driver/network/opte fi -# Actually install the xde kernel module and opteadm tool -RC=0 -pfexec pkg install -v pkg://helios-dev/driver/network/opte@"$OPTE_VERSION" || RC=$? -if [[ "$RC" -eq 0 ]]; then - echo "xde driver installed successfully" -elif [[ "$RC" -eq 4 ]]; then - echo "Correct xde driver already installed" +if [[ "x$OPTE_COMMIT" != "x" ]]; then + # Install from the override p5p archive built by OPTE CI. The p5p + # contains exactly one version (built from $OPTE_COMMIT), which + # generally won't match the canonical $OPTE_VERSION, so we let pkg + # pick the version from the p5p rather than pinning here. + P5P_URL="https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte/repo/$OPTE_COMMIT/opte.p5p" + P5P_PATH="/tmp/opte-override.p5p" + echo "Downloading override p5p from $P5P_URL" + curl -fL -o "$P5P_PATH" "$P5P_URL" + + RC=0 + pfexec pkg install -g "$P5P_PATH" driver/network/opte || RC=$? + if [[ "$RC" -eq 0 ]]; then + echo "xde driver installed from override p5p" + elif [[ "$RC" -eq 4 ]]; then + echo "Correct xde driver already installed" + else + echo "Installing xde driver from override p5p failed" + exit "$RC" + fi + rm -f "$P5P_PATH" else - echo "Installing xde driver failed" - exit "$RC" + # Install the published version from the helios pkg repo. + RC=0 + pfexec pkg install -v pkg://helios-dev/driver/network/opte@"$OPTE_VERSION" || RC=$? + if [[ "$RC" -eq 0 ]]; then + echo "xde driver installed successfully" + elif [[ "$RC" -eq 4 ]]; then + echo "Correct xde driver already installed" + else + echo "Installing xde driver failed" + exit "$RC" + fi fi +# Discover the actually-installed version (may differ from $OPTE_VERSION +# when an override is active) and freeze at that. +INSTALLED_VERSION=$(pkg info -l driver/network/opte | awk '/^[[:space:]]*Version:/ {print $2}') RC=0 -pfexec pkg freeze -c "$OMICRON_FROZEN_PKG_COMMENT" driver/network/opte@"$OPTE_VERSION" || RC=$? +pfexec pkg freeze -c "$OMICRON_FROZEN_PKG_COMMENT" "driver/network/opte@$INSTALLED_VERSION" || RC=$? if [[ "$RC" -ne 0 ]]; then - echo "Failed to pin opte package to $OPTE_VERSION" + echo "Failed to pin opte package to $INSTALLED_VERSION" exit $RC fi @@ -97,13 +129,3 @@ if [[ "$RC" -ne 0 ]]; then echo "The \`opteadm\` administration tool is not on your path." echo "You may add \"/opt/oxide/opte/bin\" to your path to access it." fi - -source $OMICRON_TOP/tools/opte_version_override - -if [[ "x$OPTE_COMMIT" != "x" ]]; then - set +x - curl -fOL https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte/module/$OPTE_COMMIT/xde - pfexec rem_drv xde || true - pfexec mv xde /kernel/drv/amd64/xde - pfexec add_drv xde || true -fi diff --git a/tools/opte_version_override b/tools/opte_version_override index 8d57f7ae9f4..288a0febf15 100644 --- a/tools/opte_version_override +++ b/tools/opte_version_override @@ -1,5 +1,24 @@ #!/usr/bin/env bash -# only set this if you want to override the version of opte/xde installed by the -# install_opte.sh script +# Override for using an unpublished OPTE version. When OPTE_COMMIT is set, +# the override p5p package is downloaded from buildomat and used instead of +# the version published in the helios pkg repo. The p5p is built by the +# opte-p5p buildomat job and published at: +# https://buildomat.eng.oxide.computer/public/file/oxidecomputer/opte/repo/{commit}/opte.p5p +# +# Consumers: +# - install_opte.sh installs directly from the override p5p +# - releng image builds use extra_packages with the p5p as a pkg source +# - deploy.sh installs from the override p5p on the running system +# - ci_check_opte_ver.sh enforces OPTE_COMMIT == Cargo dep rev +# +# To activate: set OPTE_COMMIT to the git commit hash of the OPTE build. +# OPTE_COMMIT must equal the `rev` pinned for `oxide-vpc` / `opte-ioctl` in +# Cargo.toml. The kernel module is built from OPTE_COMMIT and the userland +# (opte-ioctl, oxide-vpc) is built from the Cargo dep rev. If these hashes +# diverge, kernel/userland ABI drift surfaces as opaque ioctl failures at +# runtime. Even kernel-only fixes (logging, perf, internal refactors) must come +# with a matching Cargo bump. +# +# To deactivate (once the new version is published): set OPTE_COMMIT to "". OPTE_COMMIT=""