Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
205 changes: 205 additions & 0 deletions .github/workflows/build-release.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
# Manually triggered workflow that builds the DCGM DEB/RPM/tar.gz packages for
# one architecture inside the dcgmbuild container and publishes them both as a
# workflow artifact and as a GitHub release.
name: Build / Release

on:
  workflow_dispatch:
    inputs:
      architecture:
        description: Package architecture to build
        required: true
        default: amd64
        type: choice
        options:
          - amd64
          - aarch64
      package_release:
        description: Package release suffix
        required: true
        default: "2"
        type: string
      run_tests:
        description: Run build-time tests
        required: true
        default: false
        type: boolean
      release_comments:
        description: Optional release comments. Defaults to the build commit hash.
        required: false
        type: string

# contents: write is required to create the tag and the release.
permissions:
  contents: write

# Runs for the same ref are queued instead of cancelled so that a release that
# is already in progress is never interrupted half-way.
concurrency:
  group: build-release-${{ github.ref }}
  cancel-in-progress: false

jobs:
  packages:
    name: Build packages and create release
    runs-on: ubuntu-24.04
    env:
      # Read by the metadata step and passed to build.sh --package-release.
      DCGM_PACKAGE_RELEASE: ${{ inputs.package_release }}

    steps:
      # LFS objects are fetched later, after the LFS cache has been restored.
      - name: Check out repository
        uses: actions/checkout@v4
        with:
          lfs: false

      - name: Prepare release metadata
        id: metadata
        shell: bash
        env:
          PACKAGE_ARCHITECTURE: ${{ inputs.architecture }}
          RELEASE_COMMENTS: ${{ inputs.release_comments }}
        run: |
          set -euo pipefail

          version="$(sed -nE 's/^project\(datacenter-gpu-manager-4 VERSION ([^)]+)\)/\1/p' CMakeLists.txt)"
          if [[ -z "$version" ]]; then
            echo "Unable to determine DCGM version from CMakeLists.txt" >&2
            exit 1
          fi

          tag="v${version}-${DCGM_PACKAGE_RELEASE}-${PACKAGE_ARCHITECTURE}"
          name="DCGM ${version}-${DCGM_PACKAGE_RELEASE}-${PACKAGE_ARCHITECTURE}"

          if [[ -n "$RELEASE_COMMENTS" ]]; then
            body="$RELEASE_COMMENTS"
          else
            body="Build commit: ${GITHUB_SHA}"
          fi

          # The body is free-form user input. A fixed heredoc delimiter would
          # let a comment line equal to the delimiter terminate the value early
          # and inject further outputs, so use an unpredictable one.
          delimiter="EOF_$(openssl rand -hex 16)"

          {
            echo "version=$version"
            echo "tag=$tag"
            echo "name=$name"
            echo "body<<${delimiter}"
            # printf instead of echo: echo would treat a body such as "-n" as an option.
            printf '%s\n' "$body"
            echo "${delimiter}"
          } >> "$GITHUB_OUTPUT"

      # The sorted list of LFS object ids acts as the cache key, so the cache is
      # only invalidated when the set of tracked LFS objects changes.
      - name: Prepare Git LFS cache key
        shell: bash
        run: |
          set -euo pipefail
          git lfs install --local
          git lfs ls-files -l | sort > .git-lfs-cache-key

      - name: Cache Git LFS objects
        uses: actions/cache@v4
        with:
          path: .git/lfs
          key: git-lfs-${{ runner.os }}-${{ hashFiles('.git-lfs-cache-key') }}
          restore-keys: |
            git-lfs-${{ runner.os }}-

      - name: Pull Git LFS objects
        shell: bash
        run: git lfs pull

      # Maps the requested package architecture to the architecture suffix used
      # by the docker bake targets and re-validates the input.
      - name: Set build architecture
        id: architecture
        shell: bash
        env:
          PACKAGE_ARCHITECTURE: ${{ inputs.architecture }}
        run: |
          set -euo pipefail

          case "$PACKAGE_ARCHITECTURE" in
            amd64)
              docker_architecture=x86_64
              package_architecture=amd64
              ;;
            aarch64)
              docker_architecture=aarch64
              package_architecture=aarch64
              ;;
            *)
              echo "Unsupported architecture: $PACKAGE_ARCHITECTURE" >&2
              exit 1
              ;;
          esac

          {
            echo "docker_architecture=$docker_architecture"
            echo "package_architecture=$package_architecture"
          } >> "$GITHUB_OUTPUT"

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      # NOTE(review): the type=gha cache backend needs the Actions runtime
      # token/URL in the environment of the buildx invocation. Confirm they are
      # available to this inline run step; otherwise cache import/export will
      # not work as intended.
      - name: Build cached DCGM build image
        working-directory: dcgmbuild
        shell: bash
        env:
          # Passed through the environment instead of being spliced into the
          # script text, matching how the other steps consume their inputs.
          DOCKER_ARCHITECTURE: ${{ steps.architecture.outputs.docker_architecture }}
        run: |
          set -euo pipefail

          docker buildx bake "dcgmbuild-${DOCKER_ARCHITECTURE}" \
            --set "common-host-software.cache-from=type=gha,scope=dcgmbuild-common-host-software" \
            --set "common-host-software.cache-to=type=gha,scope=dcgmbuild-common-host-software,mode=max" \
            --set "toolchain-${DOCKER_ARCHITECTURE}.cache-from=type=gha,scope=dcgmbuild-toolchain-${DOCKER_ARCHITECTURE}" \
            --set "toolchain-${DOCKER_ARCHITECTURE}.cache-to=type=gha,scope=dcgmbuild-toolchain-${DOCKER_ARCHITECTURE},mode=max" \
            --set "dcgmbuild-${DOCKER_ARCHITECTURE}.cache-from=type=gha,scope=dcgmbuild-${DOCKER_ARCHITECTURE}" \
            --set "dcgmbuild-${DOCKER_ARCHITECTURE}.cache-to=type=gha,scope=dcgmbuild-${DOCKER_ARCHITECTURE},mode=max" \
            --set "dcgmbuild-${DOCKER_ARCHITECTURE}.output=type=docker"

      - name: Build packages
        shell: bash
        env:
          DCGM_SKIP_LFS_INSTALL: "1"
          DCGM_SKIP_PYTHON_LINTING: "1"
          NPROC: "4"
          PACKAGE_ARCHITECTURE: ${{ steps.architecture.outputs.package_architecture }}
          RUN_TESTS: ${{ inputs.run_tests }}
        run: |
          set -euo pipefail

          # Tests are skipped unless explicitly requested at dispatch time.
          test_arguments=(--no-tests)
          if [[ "$RUN_TESTS" == "true" ]]; then
            test_arguments=()
          fi

          ./build.sh \
            --release \
            --arch "$PACKAGE_ARCHITECTURE" \
            "${test_arguments[@]}" \
            --deb \
            --rpm \
            --packages \
            --package-release "$DCGM_PACKAGE_RELEASE"

      # Flattens every package found under _out into release-assets/ and fails
      # the job when the build produced nothing to publish.
      - name: Collect packages
        id: packages
        shell: bash
        run: |
          set -euo pipefail

          mkdir -p release-assets
          find _out \
            \( -name '*.deb' -o -name '*.ddeb' -o -name '*.rpm' -o -name '*.tar.gz' \) \
            -type f \
            -exec cp --target-directory release-assets {} +

          if ! compgen -G 'release-assets/*' > /dev/null; then
            echo "No package artifacts were produced" >&2
            exit 1
          fi

      - name: Upload package artifacts
        uses: actions/upload-artifact@v4
        with:
          name: dcgm-packages-${{ steps.architecture.outputs.package_architecture }}
          path: release-assets/*
          if-no-files-found: error

      - name: Create GitHub release
        uses: softprops/action-gh-release@v2
        with:
          tag_name: ${{ steps.metadata.outputs.tag }}
          name: ${{ steps.metadata.outputs.name }}
          body: ${{ steps.metadata.outputs.body }}
          target_commitish: ${{ github.sha }}
          fail_on_unmatched_files: true
          files: release-assets/*
15 changes: 14 additions & 1 deletion build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ Usage: ${0} [options] [-- [any additional cmake arguments]]
-p --packages : Generate tar.gz packages once the build is done
--deb : Generate *.deb packages once the build is done
--rpm : Generate *.rpm packages once the build is done
--package-release <release> : Override the package release suffix for DEB/RPM packages
-c --clean : Make clean rebuild
-a --arch <arch> : Make build for specified architecture. Supported are: amd64, aarch64
-n --no-tests : Do not run build-time tests
Expand Down Expand Up @@ -78,7 +79,7 @@ Usage: ${0} [options] [-- [any additional cmake arguments]]
sanitizers compatibility."
}

LONGOPTS=address-san,arch:,clean,coverage,deb,debug,debug-find,debug-find-pkg:,gcc-analyzer,help,leak-san,no-install,no-tests,packages,release,rpm,thread-san,ub-san,vmware
LONGOPTS=address-san,arch:,clean,coverage,deb,debug,debug-find,debug-find-pkg:,gcc-analyzer,help,leak-san,no-install,no-tests,package-release:,packages,release,rpm,thread-san,ub-san,vmware
SHORTOPTS=drsa:pchn

! PARSED=$(getopt --options=${SHORTOPTS} --longoptions=${LONGOPTS} --name "${0}" -- "$@")
Expand Down Expand Up @@ -106,6 +107,7 @@ COVERAGE=0
DEB=0
INSTALL=1
OS=Linux
PACKAGE_RELEASE=
RPM=0
TESTS=1
TGZ=0
Expand Down Expand Up @@ -162,6 +164,12 @@ while [[ $# -ne 0 ]]; do
--packages|-p)
TGZ=1
;;
--package-release)
    # Option with a mandatory value (declared as "package-release:" in LONGOPTS).
    # Remember the value and record the option/value pair in build_arguments.
    PACKAGE_RELEASE=$2
    # Quoted so the value is kept as a single word and is not glob-expanded.
    build_arguments+=("$1" "$2")
    # Consume both the option and its value, then go straight to the next
    # iteration of the argument loop.
    shift 2
    continue
    ;;
--release|-r)
cmake_build_types+=(RelWithDebInfo)
;;
Expand Down Expand Up @@ -194,6 +202,11 @@ done

cmake_arguments+=(-D BUILD_TESTING=$TESTS)

# When --package-release supplied a non-empty value, export it as
# DCGM_PACKAGE_RELEASE for child processes and append a matching --env entry
# to intodocker_arguments.
# NOTE(review): presumably intodocker_arguments is what carries the value into
# the containerized build - confirm against the docker wrapper invocation.
if [[ -n "$PACKAGE_RELEASE" ]]; then
export DCGM_PACKAGE_RELEASE="$PACKAGE_RELEASE"
intodocker_arguments+=(--env "DCGM_PACKAGE_RELEASE=$DCGM_PACKAGE_RELEASE")
fi

if [[ ${DCGM_BUILD_INSIDE_DOCKER:-0} -eq 0 ]]; then
if [[ $DCGM_SKIP_LFS_INSTALL -eq 0 ]]; then
if ! git config --local --get filter.lfs.smudge > /dev/null; then
Expand Down
9 changes: 7 additions & 2 deletions cmake/packaging.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@
set(CPACK_PACKAGE_CONTACT "dcgm-support <dcgm-support@nvidia.com>")
set(CPACK_PACKAGE_VENDOR "NVIDIA Corp.")

# Package release suffix. Defaults to "1"; a DCGM_PACKAGE_RELEASE environment
# variable that is both defined and non-empty replaces the default.
set(DCGM_PACKAGE_RELEASE "1")
if (DEFINED ENV{DCGM_PACKAGE_RELEASE} AND NOT "$ENV{DCGM_PACKAGE_RELEASE}" STREQUAL "")
set(DCGM_PACKAGE_RELEASE "$ENV{DCGM_PACKAGE_RELEASE}")
endif()

set(CPACK_PACKAGE_DESCRIPTION
"This package is a component of DCGM, a userspace library and service that
simplifies the administration of NVIDIA datacenter hardware in cluster and
Expand Down Expand Up @@ -122,7 +127,7 @@ if (CPACK_GENERATOR MATCHES "DEB")
endif()

set(CPACK_COMPONENTS_GROUPING "IGNORE")
set(CPACK_DEBIAN_PACKAGE_RELEASE "1")
set(CPACK_DEBIAN_PACKAGE_RELEASE "${DCGM_PACKAGE_RELEASE}")
set(CPACK_DEBIAN_PACKAGE_CONTROL_STRICT_PERMISSION TRUE)
set(CPACK_PRE_BUILD_SCRIPTS
"${CMAKE_CURRENT_LIST_DIR}/cpack-deb-prebuild.cmake")
Expand Down Expand Up @@ -267,7 +272,7 @@ elseif(CPACK_GENERATOR MATCHES "RPM")
set(CPACK_RPM_PACKAGE_AUTOREQ "no") # Removes dependency on libcuda.so.1
set(CPACK_RPM_PACKAGE_DESCRIPTION "${CPACK_PACKAGE_DESCRIPTION_SUMMARY}")
set(CPACK_RPM_PACKAGE_LICENSE "NVIDIA Proprietary")
set(CPACK_RPM_PACKAGE_RELEASE 1) # this is the package spec version, not the software version
set(CPACK_RPM_PACKAGE_RELEASE "${DCGM_PACKAGE_RELEASE}") # this is the package spec version, not the software version
set(CPACK_RPM_SPEC_MORE_DEFINE "
%define __strip ${CPACK_STRIP}
%define __objdump ${CPACK_OBJDUMP}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
ccache https://github.com/ccache/ccache/releases/download/v4.12.2/ccache-4.12.2-linux-x86_64.tar.xz d5aa5316d18bbb68ba332deca057e9f87e997f46316cb20beb4ef7e264f9181242d80b39629c59b92aff5fe0a1ce83bd35eb398a7f0353cff4ef0aa2730edeff
clang https://apt.llvm.org/llvm.sh e8f246b75b2c2c7cb903e24c07257eea344aecf3f2a80c56896261ec61605eb223ad1795bfb9530dd8b4bef9b25ba11a6c32fdd6bdafbea34262f0ba97d01b08
clang https://apt.llvm.org/llvm.sh cafe8efb95c27efd8bc4e7e936c18feb83b51bd73229f28b503cbeb132b99b90ea0d3324d84c67953482c5c15683c406db28e493ca0e88339945dbacdc8019d0
cmake https://github.com/Kitware/CMake/releases/download/v4.1.1/cmake-4.1.1-linux-x86_64.sh c51ce071fff93f6825d2f2496d9adb3bd331c4ccbb7204278770fb2890973cbb4884ad766d084cdd2adbda96659bf69a694aca29d3e1cb0a99cf141abee8a9d3
git-lfs https://github.com/git-lfs/git-lfs/releases/download/v2.13.3/git-lfs-linux-amd64-v2.13.3.tar.gz 110906644dd558705b996271066c18cc3e017035ceecf6dcea8a600691914513204f25a6d1549d20ca398d9bab78993d08ef66cc1744d1df8c74fadbdec965e7
lcov https://github.com/linux-test-project/lcov/archive/v2.0.tar.gz 5d64b77154ab77d04607ad7bd18afa370ccce90d74a7a48ed8f2f506534ee856e9b2bd1271931797f6e79ee37b0c0f9b65d9dd2cfed99377b708679ca0cab772
Expand Down
31 changes: 28 additions & 3 deletions dcgmlib/src/DcgmCacheManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5838,9 +5838,6 @@ bool DcgmCacheManager::IsModulePushedFieldId(unsigned int fieldId)
case DCGM_FI_DEV_NVLINK_COUNT_EFFECTIVE_BER_FLOAT:
case DCGM_FI_DEV_NVLINK_COUNT_EFFECTIVE_ERRORS:
case DCGM_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_TOTAL:
case DCGM_FI_DEV_MEMORY_UNREPAIRABLE_FLAG:
case DCGM_FI_DEV_NVLINK_GET_STATE:
case DCGM_FI_DEV_NVLINK_PPCNT_IBPC_PORT_XMIT_WAIT:
case DCGM_FI_DEV_C2C_LINK_ERROR_INTR:
case DCGM_FI_DEV_C2C_LINK_ERROR_REPLAY:
case DCGM_FI_DEV_C2C_LINK_ERROR_REPLAY_B2B:
Expand All @@ -5861,6 +5858,34 @@ bool DcgmCacheManager::IsModulePushedFieldId(unsigned int fieldId)
case DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_13:
case DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_14:
case DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_15:
case DCGM_FI_DEV_CLOCKS_EVENT_REASON_SW_POWER_CAP_NS:
case DCGM_FI_DEV_CLOCKS_EVENT_REASON_SYNC_BOOST_NS:
case DCGM_FI_DEV_CLOCKS_EVENT_REASON_SW_THERM_SLOWDOWN_NS:
case DCGM_FI_DEV_CLOCKS_EVENT_REASON_HW_THERM_SLOWDOWN_NS:
case DCGM_FI_DEV_CLOCKS_EVENT_REASON_HW_POWER_BRAKE_SLOWDOWN_NS:
case DCGM_FI_DEV_PWR_SMOOTHING_ENABLED:
case DCGM_FI_DEV_PWR_SMOOTHING_PRIV_LVL:
case DCGM_FI_DEV_PWR_SMOOTHING_IMM_RAMP_DOWN_ENABLED:
case DCGM_FI_DEV_PWR_SMOOTHING_APPLIED_TMP_CEIL:
case DCGM_FI_DEV_PWR_SMOOTHING_APPLIED_TMP_FLOOR:
case DCGM_FI_DEV_PWR_SMOOTHING_MAX_PERCENT_TMP_FLOOR_SETTING:
case DCGM_FI_DEV_PWR_SMOOTHING_MIN_PERCENT_TMP_FLOOR_SETTING:
case DCGM_FI_DEV_PWR_SMOOTHING_HW_CIRCUITRY_PERCENT_LIFETIME_REMAINING:
case DCGM_FI_DEV_PWR_SMOOTHING_MAX_NUM_PRESET_PROFILES:
case DCGM_FI_DEV_PWR_SMOOTHING_PROFILE_PERCENT_TMP_FLOOR:
case DCGM_FI_DEV_PWR_SMOOTHING_PROFILE_RAMP_UP_RATE:
case DCGM_FI_DEV_PWR_SMOOTHING_PROFILE_RAMP_DOWN_RATE:
case DCGM_FI_DEV_PWR_SMOOTHING_PROFILE_RAMP_DOWN_HYST_VAL:
case DCGM_FI_DEV_PWR_SMOOTHING_ACTIVE_PRESET_PROFILE:
case DCGM_FI_DEV_PWR_SMOOTHING_ADMIN_OVERRIDE_PERCENT_TMP_FLOOR:
case DCGM_FI_DEV_PWR_SMOOTHING_ADMIN_OVERRIDE_RAMP_UP_RATE:
case DCGM_FI_DEV_PWR_SMOOTHING_ADMIN_OVERRIDE_RAMP_DOWN_RATE:
case DCGM_FI_DEV_PWR_SMOOTHING_ADMIN_OVERRIDE_RAMP_DOWN_HYST_VAL:
case DCGM_FI_DEV_PCIE_COUNT_CORRECTABLE_ERRORS:
case DCGM_FI_DEV_MEMORY_UNREPAIRABLE_FLAG:
case DCGM_FI_DEV_NVLINK_GET_STATE:
case DCGM_FI_DEV_NVLINK_PPCNT_IBPC_PORT_XMIT_WAIT:
case DCGM_FI_DEV_GET_GPU_RECOVERY_ACTION:
return false;
default:
return true;
Expand Down