From aab106b2126e0670bdaf212ae89e401028f8af24 Mon Sep 17 00:00:00 2001 From: David Grove Date: Fri, 12 Jul 2024 13:35:53 -0400 Subject: [PATCH 01/13] backlevel go version in go.mod for codeflare --- go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go.mod b/go.mod index 8db8722..aef8b14 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/project-codeflare/appwrapper -go 1.22.4 +go 1.22.2 require ( github.com/distribution/reference v0.5.0 From d2ee6aa75eb880a6881198198db570c4fb5a347b Mon Sep 17 00:00:00 2001 From: David Grove Date: Fri, 15 Nov 2024 16:23:06 -0500 Subject: [PATCH 02/13] add replace directive for kueue to make code buildable --- go.mod | 3 +++ go.sum | 16 ++++++++-------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/go.mod b/go.mod index 7f644af..28daa58 100644 --- a/go.mod +++ b/go.mod @@ -21,6 +21,9 @@ require ( sigs.k8s.io/yaml v1.4.0 ) +// These replace directives deal with the backlevel ODH kueue version +replace sigs.k8s.io/kueue v0.8.3 => github.com/opendatahub-io/kueue v0.7.0-odh-2 + require ( github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect diff --git a/go.sum b/go.sum index 8f34f0d..22cf776 100644 --- a/go.sum +++ b/go.sum @@ -22,8 +22,8 @@ github.com/emicklei/go-restful/v3 v3.12.1 h1:PJMDIM/ak7btuL8Ex0iYET9hxM3CI2sjZtz github.com/emicklei/go-restful/v3 v3.12.1/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= -github.com/evanphx/json-patch v5.9.0+incompatible h1:fBXyNpNMuTTDdquAq/uisOr2lShz4oaXpDTX2bLe7ls= -github.com/evanphx/json-patch v5.9.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= +github.com/evanphx/json-patch v5.6.0+incompatible h1:jBYDEEiFBPxA0v50tFdvOzQQTCvpL6mnFh5mB2/l16U= 
+github.com/evanphx/json-patch v5.6.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/evanphx/json-patch/v5 v5.9.0 h1:kcBlZQbplgElYIlo/n1hJbls2z/1awpXxpRi0/FOJfg= github.com/evanphx/json-patch/v5 v5.9.0/go.mod h1:VNkHZ/282BpEyt/tObQO8s5CMPmYYq14uClGH4abBuQ= github.com/fatih/color v1.17.0 h1:GlRw1BRJxkpqUCBKzKOw098ed57fEsKeNjpTe3cSjK4= @@ -136,6 +136,8 @@ github.com/open-policy-agent/frameworks/constraint v0.0.0-20230822235116-f0b62fe github.com/open-policy-agent/frameworks/constraint v0.0.0-20230822235116-f0b62fe1e4c4/go.mod h1:54/KzLMvA5ndBVpm7B1OjLeV0cUtTLTz2bZ2OtydLpU= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= +github.com/opendatahub-io/kueue v0.7.0-odh-2 h1:sRPncaqDJ0a6WJiArVuYVDLPWTXm6hosqiBXzb9xMt4= +github.com/opendatahub-io/kueue v0.7.0-odh-2/go.mod h1:uTAgWTJVrIkm1FPbp6BnhZhr29zlUcgzb5z1m5rFw9k= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= @@ -150,10 +152,10 @@ github.com/prometheus/common v0.57.0 h1:Ro/rKjwdq9mZn1K5QPctzh+MA4Lp0BuYk5ZZEVho github.com/prometheus/common v0.57.0/go.mod h1:7uRPFSUTbfZWsJ7MHY56sqt7hLQu3bxXHDnNhl8E9qI= github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= -github.com/ray-project/kuberay/ray-operator v1.2.1 h1:H7ofodGclghsU2TxbDHs+gvqvsOp5DJ/vAPGySL1DIE= -github.com/ray-project/kuberay/ray-operator v1.2.1/go.mod h1:osTiIyaDoWi5IN1f0tOOtZ4TzVf+5kJXZor8VFvcEiI= -github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= -github.com/rogpeppe/go-internal v1.12.0/go.mod 
h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= +github.com/ray-project/kuberay/ray-operator v1.1.1 h1:mVOA1ddS9aAsPvhhHrpf0ZXgTzccIAyTbeYeDqtcfAk= +github.com/ray-project/kuberay/ray-operator v1.1.1/go.mod h1:ZqyKKvMP5nKDldQoKmur+Wcx7wVlV9Q98phFqHzr+KY= +github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= +github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/sergi/go-diff v1.2.0 h1:XU+rvMAioB0UC3q1MFrIQy4Vo5/4VsRDQQXHsEya6xQ= github.com/sergi/go-diff v1.2.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= @@ -320,8 +322,6 @@ sigs.k8s.io/jobset v0.5.2 h1:276q5Pi/ErLYj+GQ0ydEXR6tx3LwBhEzHLQv+k8bYF4= sigs.k8s.io/jobset v0.5.2/go.mod h1:Vg99rj/6OoGvy1uvywGEHOcVLCWWJYkJtisKqdWzcFw= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= -sigs.k8s.io/kueue v0.8.3 h1:H4ZUSWYWnbnUrchunkaeubNVbSEhO8FhyxVlI+IsNLA= -sigs.k8s.io/kueue v0.8.3/go.mod h1:JO87rmNX7d71ZQvCX+TS5H5g1NtSWHDaXXEzbJi6Muk= sigs.k8s.io/kustomize/api v0.17.2 h1:E7/Fjk7V5fboiuijoZHgs4aHuexi5Y2loXlVOAVAG5g= sigs.k8s.io/kustomize/api v0.17.2/go.mod h1:UWTz9Ct+MvoeQsHcJ5e+vziRRkwimm3HytpZgIYqye0= sigs.k8s.io/kustomize/kustomize/v5 v5.3.0 h1:OUKaQwArd1udTz3ykibOjaUwdfly6FnkQiDSSft6+Fg= From 3ef10284f107cafb42d5c1d3abfc7b697e76dc9a Mon Sep 17 00:00:00 2001 From: David Grove Date: Fri, 15 Nov 2024 17:42:20 -0500 Subject: [PATCH 03/13] run main CI job on codeflare-releases branch (#270) --- .github/workflows/CI.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/CI.yaml b/.github/workflows/CI.yaml index 94ba630..1c7029d 100644 --- a/.github/workflows/CI.yaml +++ b/.github/workflows/CI.yaml @@ -1,11 +1,11 @@ name: CI on: push: - 
branches: [main, rhoai-2.10] + branches: [main, rhoai-2.10, codeflare-releases] paths-ignore: - 'site/**' pull_request: - branches: [main, rhoai-2.10] + branches: [main, rhoai-2.10, codeflare-releases] jobs: CI: From 7cbccf8634cf81c70d200201e43cf81e0b14f4f6 Mon Sep 17 00:00:00 2001 From: David Grove Date: Mon, 18 Nov 2024 12:02:52 -0500 Subject: [PATCH 04/13] Use ODH fork of Kueue on the codeflare-releases branch (#272) --- go.mod | 4 ++-- go.sum | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/go.mod b/go.mod index 28daa58..afdefa9 100644 --- a/go.mod +++ b/go.mod @@ -21,8 +21,8 @@ require ( sigs.k8s.io/yaml v1.4.0 ) -// These replace directives deal with the backlevel ODH kueue version -replace sigs.k8s.io/kueue v0.8.3 => github.com/opendatahub-io/kueue v0.7.0-odh-2 +// On the CF release branch, we want to use ODH fork of Kueue +replace sigs.k8s.io/kueue v0.8.3 => github.com/opendatahub-io/kueue v0.8.3 require ( github.com/beorn7/perks v1.0.1 // indirect diff --git a/go.sum b/go.sum index 22cf776..1bdfbe1 100644 --- a/go.sum +++ b/go.sum @@ -22,8 +22,8 @@ github.com/emicklei/go-restful/v3 v3.12.1 h1:PJMDIM/ak7btuL8Ex0iYET9hxM3CI2sjZtz github.com/emicklei/go-restful/v3 v3.12.1/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= -github.com/evanphx/json-patch v5.6.0+incompatible h1:jBYDEEiFBPxA0v50tFdvOzQQTCvpL6mnFh5mB2/l16U= -github.com/evanphx/json-patch v5.6.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= +github.com/evanphx/json-patch v5.9.0+incompatible h1:fBXyNpNMuTTDdquAq/uisOr2lShz4oaXpDTX2bLe7ls= +github.com/evanphx/json-patch v5.9.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/evanphx/json-patch/v5 v5.9.0 
h1:kcBlZQbplgElYIlo/n1hJbls2z/1awpXxpRi0/FOJfg= github.com/evanphx/json-patch/v5 v5.9.0/go.mod h1:VNkHZ/282BpEyt/tObQO8s5CMPmYYq14uClGH4abBuQ= github.com/fatih/color v1.17.0 h1:GlRw1BRJxkpqUCBKzKOw098ed57fEsKeNjpTe3cSjK4= @@ -136,8 +136,8 @@ github.com/open-policy-agent/frameworks/constraint v0.0.0-20230822235116-f0b62fe github.com/open-policy-agent/frameworks/constraint v0.0.0-20230822235116-f0b62fe1e4c4/go.mod h1:54/KzLMvA5ndBVpm7B1OjLeV0cUtTLTz2bZ2OtydLpU= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= -github.com/opendatahub-io/kueue v0.7.0-odh-2 h1:sRPncaqDJ0a6WJiArVuYVDLPWTXm6hosqiBXzb9xMt4= -github.com/opendatahub-io/kueue v0.7.0-odh-2/go.mod h1:uTAgWTJVrIkm1FPbp6BnhZhr29zlUcgzb5z1m5rFw9k= +github.com/opendatahub-io/kueue v0.8.3 h1:MLkHCmIrQR1KM1IcPiGuoEAT3Y+ZTs7493sMkmSUMow= +github.com/opendatahub-io/kueue v0.8.3/go.mod h1:jzRyUhAXHIpEPjt4pMx79t/Cg1g29GlNZY6wiLJE2YI= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= @@ -152,10 +152,10 @@ github.com/prometheus/common v0.57.0 h1:Ro/rKjwdq9mZn1K5QPctzh+MA4Lp0BuYk5ZZEVho github.com/prometheus/common v0.57.0/go.mod h1:7uRPFSUTbfZWsJ7MHY56sqt7hLQu3bxXHDnNhl8E9qI= github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= -github.com/ray-project/kuberay/ray-operator v1.1.1 h1:mVOA1ddS9aAsPvhhHrpf0ZXgTzccIAyTbeYeDqtcfAk= -github.com/ray-project/kuberay/ray-operator v1.1.1/go.mod h1:ZqyKKvMP5nKDldQoKmur+Wcx7wVlV9Q98phFqHzr+KY= -github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= -github.com/rogpeppe/go-internal v1.11.0/go.mod 
h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= +github.com/ray-project/kuberay/ray-operator v1.2.1 h1:H7ofodGclghsU2TxbDHs+gvqvsOp5DJ/vAPGySL1DIE= +github.com/ray-project/kuberay/ray-operator v1.2.1/go.mod h1:osTiIyaDoWi5IN1f0tOOtZ4TzVf+5kJXZor8VFvcEiI= +github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= +github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/sergi/go-diff v1.2.0 h1:XU+rvMAioB0UC3q1MFrIQy4Vo5/4VsRDQQXHsEya6xQ= github.com/sergi/go-diff v1.2.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= From 94d6de558638a444f3b38f11849007c9a8a1497a Mon Sep 17 00:00:00 2001 From: David Grove Date: Tue, 5 Nov 2024 13:09:11 -0500 Subject: [PATCH 05/13] update mlbatch step of release instructions (#265) --- docs/release_instructions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/release_instructions.md b/docs/release_instructions.md index 9e95b18..e0c7161 100644 --- a/docs/release_instructions.md +++ b/docs/release_instructions.md @@ -24,5 +24,5 @@ will: go.mod. 5. 
Update the kustomization.yaml files in MLBatch to refer to the new release: - + setup.k8s-v1.25/appwrapper/kustomization.yaml + + setup.k8s-v1.27/appwrapper/kustomization.yaml + setup.k8s-v1.30/appwrapper/kustomization.yaml From d126ebb1779bdedaac8aaf56557b0e1b2efe4585 Mon Sep 17 00:00:00 2001 From: David Grove Date: Tue, 5 Nov 2024 14:06:10 -0500 Subject: [PATCH 06/13] bump version of action-gh-release to upgrade to Node.js 20 (#266) --- .github/workflows/release.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index a1dcdad..6babbd9 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -49,7 +49,7 @@ jobs: make build-installer -e TAG=${RELEASE_VERSION} -e quay_repository=quay.io/ibm - name: Create GitHub Release - uses: softprops/action-gh-release@v1 + uses: softprops/action-gh-release@v2 with: name: Release ${{ env.RELEASE_VERSION }} generate_release_notes: true From b0a3eb903b62acc948d8dc270cbd3bf8d2f67074 Mon Sep 17 00:00:00 2001 From: David Grove Date: Mon, 11 Nov 2024 15:04:10 -0500 Subject: [PATCH 07/13] bump kind to 0.25; simplify using different k8s versions for cluster (#268) --- hack/create-test-cluster.sh | 21 ++++++++---- hack/e2e-util.sh | 66 +++++++++++++++++++++++++++++++++---- hack/kind-config.yaml | 8 ----- 3 files changed, 75 insertions(+), 20 deletions(-) diff --git a/hack/create-test-cluster.sh b/hack/create-test-cluster.sh index 30fa3d5..ccc9387 100755 --- a/hack/create-test-cluster.sh +++ b/hack/create-test-cluster.sh @@ -12,17 +12,26 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# Create and configure a kind cluster for running the e2e tests -# Does NOT install mcad +# Create and optionally configure a kind cluster for running the e2e tests export ROOT_DIR="$(dirname "$(dirname "$(readlink -fn "$0")")")" CLUSTER_STARTED="false" +CONFIGURE_CLUSTER=${CONFIGURE_CLUSTER:-"true"} source ${ROOT_DIR}/hack/e2e-util.sh -update_test_host -check_prerequisites -pull_images +if [[ "$CONFIGURE_CLUSTER" == "true" ]] +then + update_test_host + check_prerequisites + pull_images +fi + kind_up_cluster add_virtual_GPUs -configure_cluster + +if [[ "$CONFIGURE_CLUSTER" == "true" ]] +then + kind_load_images + configure_cluster +fi diff --git a/hack/e2e-util.sh b/hack/e2e-util.sh index bf3a637..ee4cd7d 100755 --- a/hack/e2e-util.sh +++ b/hack/e2e-util.sh @@ -14,8 +14,9 @@ export LOG_LEVEL=${TEST_LOG_LEVEL:-2} export CLEANUP_CLUSTER=${CLEANUP_CLUSTER:-"true"} -export CLUSTER_CONTEXT="--name test" +export CLUSTER_CONTEXT=${CLUSTER_CONTEXT:-"--name test"} export KIND_OPT=${KIND_OPT:=" --config ${ROOT_DIR}/hack/kind-config.yaml"} +export KIND_K8S_VERSION=${KIND_K8S_VERSION:-"1.27"} export KA_BIN=_output/bin export WAIT_TIME="20s" export KUTTL_VERSION=0.15.0 @@ -61,9 +62,9 @@ function update_test_host { which kind >/dev/null 2>&1 if [ $? -ne 0 ] then - # Download kind binary (0.24.0) - echo "Downloading and installing kind v0.24.0...." - sudo curl -o /usr/local/bin/kind -L https://github.com/kubernetes-sigs/kind/releases/download/v0.24.0/kind-linux-${arch} && \ + # Download kind binary (0.25.0) + echo "Downloading and installing kind v0.25.0...." + sudo curl -o /usr/local/bin/kind -L https://github.com/kubernetes-sigs/kind/releases/download/v0.25.0/kind-linux-${arch} && \ sudo chmod +x /usr/local/bin/kind [ $? -ne 0 ] && echo "Failed to download kind" && exit 1 echo "Kind was sucessfully installed." 
@@ -154,15 +155,68 @@ function pull_images { } function kind_up_cluster { - echo "Running kind: [kind create cluster ${CLUSTER_CONTEXT} ${KIND_OPT}]" - kind create cluster ${CLUSTER_CONTEXT} ${KIND_OPT} --wait ${WAIT_TIME} + # Determine node image tag based on kind version and desired kubernetes version + KIND_ACTUAL_VERSION=$(kind version | awk '/ /{print $2}') + case $KIND_ACTUAL_VERSION in + v0.25.0) + case $KIND_K8S_VERSION in + 1.27) + KIND_NODE_TAG=${KIND_NODE_TAG:="v1.27.16@sha256:2d21a61643eafc439905e18705b8186f3296384750a835ad7a005dceb9546d20"} + ;; + 1.29) + KIND_NODE_TAG=${KIND_NODE_TAG:="v1.29.10@sha256:3b2d8c31753e6c8069d4fc4517264cd20e86fd36220671fb7d0a5855103aa84b"} + ;; + 1.30) + KIND_NODE_TAG=${KIND_NODE_TAG:="v1.30.6@sha256:b6d08db72079ba5ae1f4a88a09025c0a904af3b52387643c285442afb05ab994"} + ;; + 1.31) + KIND_NODE_TAG=${KIND_NODE_TAG:="v1.31.2@sha256:18fbefc20a7113353c7b75b5c869d7145a6abd6269154825872dc59c1329912e"} + ;; + *) + echo "Unexpected kubernetes version: $KIND_K8S_VERSION" + exit 1 + ;; + esac + ;; + + v0.24.0) + case $KIND_K8S_VERSION in + 1.27) + KIND_NODE_TAG=${KIND_NODE_TAG:="v1.27.16@sha256:3fd82731af34efe19cd54ea5c25e882985bafa2c9baefe14f8deab1737d9fabe"} + ;; + 1.29) + KIND_NODE_TAG=${KIND_NODE_TAG:="v1.29.8@sha256:d46b7aa29567e93b27f7531d258c372e829d7224b25e3fc6ffdefed12476d3aa"} + ;; + 1.30) + KIND_NODE_TAG=${KIND_NODE_TAG:="v1.30.4@sha256:976ea815844d5fa93be213437e3ff5754cd599b040946b5cca43ca45c2047114"} + ;; + 1.31) + KIND_NODE_TAG=${KIND_NODE_TAG:="v1.31.0@sha256:53df588e04085fd41ae12de0c3fe4c72f7013bba32a20e7325357a1ac94ba865"} + ;; + *) + echo "Unexpected kubernetes version: $KIND_K8S_VERSION" + exit 1 + ;; + esac + ;; + + *) + echo "Unexpected kind version: $KIND_ACTUAL_VERSION" + exit 1 + ;; + esac + + echo "Running kind: [kind create cluster ${CLUSTER_CONTEXT} --image kindest/node:${KIND_NODE_TAG} ${KIND_OPT}]" + kind create cluster ${CLUSTER_CONTEXT} --image kindest/node:${KIND_NODE_TAG} ${KIND_OPT} --wait 
${WAIT_TIME} if [ $? -ne 0 ] then echo "Failed to start kind cluster" exit 1 fi CLUSTER_STARTED="true" +} +function kind_load_images { for image in ${IMAGE_ECHOSERVER} ${IMAGE_BUSY_BOX_LATEST} ${IMAGE_KUBEFLOW_OPERATOR} ${IMAGE_KUBERAY_OPERATOR} do kind load docker-image ${image} ${CLUSTER_CONTEXT} diff --git a/hack/kind-config.yaml b/hack/kind-config.yaml index d32e663..19f7fc2 100644 --- a/hack/kind-config.yaml +++ b/hack/kind-config.yaml @@ -2,14 +2,6 @@ kind: Cluster apiVersion: kind.x-k8s.io/v1alpha4 # 1 control plane node and 2 worker nodes nodes: -# the control plane node config - role: control-plane - # kubernetes version 1.27.17 from kind v0.24.0 - image: kindest/node:v1.27.17@sha256:3fd82731af34efe19cd54ea5c25e882985bafa2c9baefe14f8deab1737d9fabe -# the workers - role: worker - # kubernetes version 1.27.17 from kind v0.24.0 - image: kindest/node:v1.27.17@sha256:3fd82731af34efe19cd54ea5c25e882985bafa2c9baefe14f8deab1737d9fabe - role: worker - # kubernetes version 1.27.17 from kind v0.24.0 - image: kindest/node:v1.27.17@sha256:3fd82731af34efe19cd54ea5c25e882985bafa2c9baefe14f8deab1737d9fabe From 86987f134071dfd51d712f39cd90836bcbea35db Mon Sep 17 00:00:00 2001 From: David Grove Date: Mon, 11 Nov 2024 20:32:38 -0500 Subject: [PATCH 08/13] avoid blanket override of namespace when kustomizing kueue (#269) --- hack/kueue-config/kustomization.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/hack/kueue-config/kustomization.yaml b/hack/kueue-config/kustomization.yaml index b5af1c5..d80e583 100644 --- a/hack/kueue-config/kustomization.yaml +++ b/hack/kueue-config/kustomization.yaml @@ -1,14 +1,11 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization -namespace: kueue-system - resources: - "https://github.com/kubernetes-sigs/kueue/config/default?ref=v0.8.3" configMapGenerator: - name: manager-config - namespace: kueue-system behavior: replace files: - controller_manager_config.yaml From d5a100b5634123c5d9716960b23a3388b66ed5e5 Mon Sep 17 
00:00:00 2001 From: David Grove Date: Wed, 27 Nov 2024 11:26:17 -0500 Subject: [PATCH 09/13] add initial grace period before triggering a failure due to missing components (#273) --- .../appwrapper/appwrapper_controller.go | 25 +++++++++++-------- test/e2e/appwrapper_test.go | 9 +++++-- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/internal/controller/appwrapper/appwrapper_controller.go b/internal/controller/appwrapper/appwrapper_controller.go index 4f487a2..8fcbff7 100644 --- a/internal/controller/appwrapper/appwrapper_controller.go +++ b/internal/controller/appwrapper/appwrapper_controller.go @@ -268,21 +268,26 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request) } // Detect externally deleted components and transition to Failed with no GracePeriod or retry - detailMsg := fmt.Sprintf("Only found %v deployed components, but was expecting %v", compStatus.deployed, compStatus.expected) if compStatus.deployed != compStatus.expected { - meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{ - Type: string(workloadv1beta2.Unhealthy), - Status: metav1.ConditionTrue, - Reason: "MissingComponent", - Message: detailMsg, - }) - r.Recorder.Event(aw, v1.EventTypeNormal, string(workloadv1beta2.Unhealthy), "MissingComponent: "+detailMsg) - return ctrl.Result{}, r.transitionToPhase(ctx, orig, aw, workloadv1beta2.AppWrapperFailed) + // There may be a lag before created resources become visible in the cache; don't react too quickly. 
+ whenDeployed := meta.FindStatusCondition(aw.Status.Conditions, string(workloadv1beta2.ResourcesDeployed)).LastTransitionTime + graceDuration := r.admissionGraceDuration(ctx, aw) + if time.Now().After(whenDeployed.Add(graceDuration)) { + detailMsg := fmt.Sprintf("Only found %v deployed components, but was expecting %v", compStatus.deployed, compStatus.expected) + meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{ + Type: string(workloadv1beta2.Unhealthy), + Status: metav1.ConditionTrue, + Reason: "MissingComponent", + Message: detailMsg, + }) + r.Recorder.Event(aw, v1.EventTypeNormal, string(workloadv1beta2.Unhealthy), "MissingComponent: "+detailMsg) + return ctrl.Result{}, r.transitionToPhase(ctx, orig, aw, workloadv1beta2.AppWrapperFailed) + } } // If a component's controller has put it into a failed state, we do not need // to allow a grace period. The situation will not self-correct. - detailMsg = fmt.Sprintf("Found %v failed components", compStatus.failed) + detailMsg := fmt.Sprintf("Found %v failed components", compStatus.failed) if compStatus.failed > 0 { meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{ Type: string(workloadv1beta2.Unhealthy), diff --git a/test/e2e/appwrapper_test.go b/test/e2e/appwrapper_test.go index 362f507..6b808a4 100644 --- a/test/e2e/appwrapper_test.go +++ b/test/e2e/appwrapper_test.go @@ -297,8 +297,13 @@ var _ = Describe("AppWrapper E2E Test", func() { Expect(aw.Status.Retries).Should(Equal(int32(2))) }) - It("Deleting a Running Component yields a failed AppWrapper", func() { - aw := createAppWrapper(ctx, pytorchjob(2, 500)) + It("Deleting a Running Component yields a failed AppWrapper", Label("slow"), func() { + aw := toAppWrapper(pytorchjob(2, 500)) + if aw.Annotations == nil { + aw.Annotations = make(map[string]string) + } + aw.Annotations[workloadv1beta2.AdmissionGracePeriodDurationAnnotation] = "5s" + Expect(getClient(ctx).Create(ctx, aw)).To(Succeed()) appwrappers = append(appwrappers, aw) 
Eventually(AppWrapperPhase(ctx, aw), 60*time.Second).Should(Equal(workloadv1beta2.AppWrapperRunning)) aw = getAppWrapper(ctx, types.NamespacedName{Name: aw.Name, Namespace: aw.Namespace}) From fbb0c6b572388893ecdf89d43ddfd5d303fdc1b4 Mon Sep 17 00:00:00 2001 From: David Grove Date: Wed, 27 Nov 2024 11:52:41 -0500 Subject: [PATCH 10/13] rename user-queue to default-queue to match mlbatch conventions (#274) --- hack/default-queues.yaml | 2 +- internal/webhook/suite_test.go | 4 ++-- samples/wrapped-deployment.yaml | 2 +- samples/wrapped-failing-job.yaml | 2 +- samples/wrapped-failing-pod.yaml | 2 +- samples/wrapped-failing-pytorch-job.yaml | 2 +- samples/wrapped-gpu-job.yaml | 2 +- samples/wrapped-job.yaml | 2 +- samples/wrapped-pod.yaml | 2 +- samples/wrapped-pytorch-job.yaml | 2 +- site/_pages/dev-setup.md | 2 +- site/_pages/quick-start.md | 2 +- site/_pages/sample-batch-job.md | 2 +- site/_pages/sample-pytorch.md | 2 +- 14 files changed, 15 insertions(+), 15 deletions(-) diff --git a/hack/default-queues.yaml b/hack/default-queues.yaml index 858b23c..23497c2 100644 --- a/hack/default-queues.yaml +++ b/hack/default-queues.yaml @@ -23,6 +23,6 @@ apiVersion: kueue.x-k8s.io/v1beta1 kind: LocalQueue metadata: namespace: "default" - name: "user-queue" + name: "default-queue" spec: clusterQueue: "cluster-queue" diff --git a/internal/webhook/suite_test.go b/internal/webhook/suite_test.go index fb52e77..0e7543f 100644 --- a/internal/webhook/suite_test.go +++ b/internal/webhook/suite_test.go @@ -62,8 +62,8 @@ var cancel context.CancelFunc const limitedUserName = "limited-user" const limitedUserID = "8da0fcfe-6d7f-4f44-b433-d91d22cc1b8c" -const defaultQueueName = "default-queue" -const userProvidedQueueName = "user-queue" +const defaultQueueName = "system-default-queue" +const userProvidedQueueName = "user-provided-queue" func TestWebhooks(t *testing.T) { RegisterFailHandler(Fail) diff --git a/samples/wrapped-deployment.yaml b/samples/wrapped-deployment.yaml index 
a5d0ce1..b9e6c9c 100644 --- a/samples/wrapped-deployment.yaml +++ b/samples/wrapped-deployment.yaml @@ -3,7 +3,7 @@ kind: AppWrapper metadata: name: sample-deployment labels: - kueue.x-k8s.io/queue-name: user-queue + kueue.x-k8s.io/queue-name: default-queue spec: components: - template: diff --git a/samples/wrapped-failing-job.yaml b/samples/wrapped-failing-job.yaml index e65f564..130166d 100644 --- a/samples/wrapped-failing-job.yaml +++ b/samples/wrapped-failing-job.yaml @@ -3,7 +3,7 @@ kind: AppWrapper metadata: name: sample-failing-job labels: - kueue.x-k8s.io/queue-name: user-queue + kueue.x-k8s.io/queue-name: default-queue annotations: workload.codeflare.dev.appwrapper/failureGracePeriodDuration: 10s workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 10s diff --git a/samples/wrapped-failing-pod.yaml b/samples/wrapped-failing-pod.yaml index f13555b..eb02df5 100644 --- a/samples/wrapped-failing-pod.yaml +++ b/samples/wrapped-failing-pod.yaml @@ -3,7 +3,7 @@ kind: AppWrapper metadata: name: sample-failing-pod labels: - kueue.x-k8s.io/queue-name: user-queue + kueue.x-k8s.io/queue-name: default-queue annotations: workload.codeflare.dev.appwrapper/failureGracePeriodDuration: 10s workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 10s diff --git a/samples/wrapped-failing-pytorch-job.yaml b/samples/wrapped-failing-pytorch-job.yaml index d7bc7a2..e6cf910 100644 --- a/samples/wrapped-failing-pytorch-job.yaml +++ b/samples/wrapped-failing-pytorch-job.yaml @@ -3,7 +3,7 @@ kind: AppWrapper metadata: name: sample-failing-pytorch-job labels: - kueue.x-k8s.io/queue-name: user-queue + kueue.x-k8s.io/queue-name: default-queue spec: components: - template: diff --git a/samples/wrapped-gpu-job.yaml b/samples/wrapped-gpu-job.yaml index b606324..bc0cf44 100644 --- a/samples/wrapped-gpu-job.yaml +++ b/samples/wrapped-gpu-job.yaml @@ -3,7 +3,7 @@ kind: AppWrapper metadata: name: sample-gpu-job labels: - kueue.x-k8s.io/queue-name: user-queue + 
kueue.x-k8s.io/queue-name: default-queue annotations: workload.codeflare.dev.appwrapper/successTTLDuration: "1m" spec: diff --git a/samples/wrapped-job.yaml b/samples/wrapped-job.yaml index af73824..f8f4f3f 100644 --- a/samples/wrapped-job.yaml +++ b/samples/wrapped-job.yaml @@ -3,7 +3,7 @@ kind: AppWrapper metadata: name: sample-job labels: - kueue.x-k8s.io/queue-name: user-queue + kueue.x-k8s.io/queue-name: default-queue annotations: workload.codeflare.dev.appwrapper/successTTLDuration: "1m" spec: diff --git a/samples/wrapped-pod.yaml b/samples/wrapped-pod.yaml index 067e0eb..7ecd87e 100644 --- a/samples/wrapped-pod.yaml +++ b/samples/wrapped-pod.yaml @@ -3,7 +3,7 @@ kind: AppWrapper metadata: name: sample-pod labels: - kueue.x-k8s.io/queue-name: user-queue + kueue.x-k8s.io/queue-name: default-queue spec: components: - template: diff --git a/samples/wrapped-pytorch-job.yaml b/samples/wrapped-pytorch-job.yaml index 5577325..18f5cc6 100644 --- a/samples/wrapped-pytorch-job.yaml +++ b/samples/wrapped-pytorch-job.yaml @@ -3,7 +3,7 @@ kind: AppWrapper metadata: name: sample-pytorch-job labels: - kueue.x-k8s.io/queue-name: user-queue + kueue.x-k8s.io/queue-name: default-queue spec: components: - template: diff --git a/site/_pages/dev-setup.md b/site/_pages/dev-setup.md index a12aed3..ddc4b8a 100644 --- a/site/_pages/dev-setup.md +++ b/site/_pages/dev-setup.md @@ -44,7 +44,7 @@ You can verify Kueue is configured as expected with: ```sh % kubectl get localqueues,clusterqueues -o wide NAME CLUSTERQUEUE PENDING WORKLOADS ADMITTED WORKLOADS -localqueue.kueue.x-k8s.io/user-queue cluster-queue 0 0 +localqueue.kueue.x-k8s.io/default-queue cluster-queue 0 0 NAME COHORT STRATEGY PENDING WORKLOADS ADMITTED WORKLOADS clusterqueue.kueue.x-k8s.io/cluster-queue BestEffortFIFO 0 0 diff --git a/site/_pages/quick-start.md b/site/_pages/quick-start.md index e475702..e276031 100644 --- a/site/_pages/quick-start.md +++ b/site/_pages/quick-start.md @@ -48,7 +48,7 @@ apiVersion: 
kueue.x-k8s.io/v1beta1 kind: LocalQueue metadata: namespace: "default" - name: "user-queue" + name: "default-queue" spec: clusterQueue: "cluster-queue" ``` diff --git a/site/_pages/sample-batch-job.md b/site/_pages/sample-batch-job.md index 5a91cf8..dd34016 100644 --- a/site/_pages/sample-batch-job.md +++ b/site/_pages/sample-batch-job.md @@ -11,7 +11,7 @@ kind: AppWrapper metadata: name: sample-job labels: - kueue.x-k8s.io/queue-name: user-queue + kueue.x-k8s.io/queue-name: default-queue spec: components: - template: diff --git a/site/_pages/sample-pytorch.md b/site/_pages/sample-pytorch.md index 63362e1..845edac 100644 --- a/site/_pages/sample-pytorch.md +++ b/site/_pages/sample-pytorch.md @@ -11,7 +11,7 @@ kind: AppWrapper metadata: name: sample-pytorch-job labels: - kueue.x-k8s.io/queue-name: user-queue + kueue.x-k8s.io/queue-name: default-queue spec: components: - template: From 515c0a3afb0f8de18d65786dce1f2a95e2b50180 Mon Sep 17 00:00:00 2001 From: David Grove Date: Thu, 12 Dec 2024 16:51:25 -0500 Subject: [PATCH 11/13] Ensure consistent resource status if create errors (#277) --- .../controller/appwrapper/resource_management.go | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/internal/controller/appwrapper/resource_management.go b/internal/controller/appwrapper/resource_management.go index eff0a7f..afa9b95 100644 --- a/internal/controller/appwrapper/resource_management.go +++ b/internal/controller/appwrapper/resource_management.go @@ -317,6 +317,18 @@ func (r *AppWrapperReconciler) createComponent(ctx context.Context, aw *workload } // fall through. This is not actually an error. The object already exists and the correct appwrapper owns it. 
} else { + // resource not actually created; patch status to reflect that + orig := copyForStatusPatch(aw) + meta.SetStatusCondition(&aw.Status.ComponentStatus[componentIdx].Conditions, metav1.Condition{ + Type: string(workloadv1beta2.ResourcesDeployed), + Status: metav1.ConditionFalse, + Reason: "ComponentCreationErrored", + }) + if patchErr := r.Status().Patch(ctx, aw, client.MergeFrom(orig)); patchErr != nil { + // ugh. Patch failed, so retry the create so we can get to a consistient state + return patchErr, false + } + // return actual error return err, meta.IsNoMatchError(err) || apierrors.IsInvalid(err) // fatal } } From b6bd577b7dab3cbaf148d8d40346ace9d11391be Mon Sep 17 00:00:00 2001 From: David Grove Date: Thu, 12 Dec 2024 17:41:20 -0500 Subject: [PATCH 12/13] flag config error if admissionGP exceeds warmupGP (#278) --- pkg/config/config.go | 6 +++++- pkg/config/config_test.go | 3 +++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/pkg/config/config.go b/pkg/config/config.go index b730965..1720567 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -143,7 +143,11 @@ func ValidateAppWrapperConfig(config *AppWrapperConfig) error { config.FaultTolerance.AdmissionGracePeriod, config.FaultTolerance.GracePeriodMaximum) } if config.FaultTolerance.WarmupGracePeriod > config.FaultTolerance.GracePeriodMaximum { - return fmt.Errorf("WarmupGracePeriod %v exceeds GracePeriodCeiling %v", + return fmt.Errorf("AdmissionGracePeriod %v exceeds GracePeriodCeiling %v", + config.FaultTolerance.WarmupGracePeriod, config.FaultTolerance.GracePeriodMaximum) + } + if config.FaultTolerance.AdmissionGracePeriod > config.FaultTolerance.WarmupGracePeriod { + return fmt.Errorf("AdmissionGracePeriod %v exceeds AdmissionGracePeriod %v", config.FaultTolerance.WarmupGracePeriod, config.FaultTolerance.GracePeriodMaximum) } if config.FaultTolerance.SuccessTTL <= 0 { diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index f04ce9b..9810907 100644 --- 
a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -56,6 +56,9 @@ var _ = Describe("AppWrapper Config", func() { bad = &FaultToleranceConfig{WarmupGracePeriod: 10 * time.Second, GracePeriodMaximum: 1 * time.Second} Expect(ValidateAppWrapperConfig(&AppWrapperConfig{FaultTolerance: bad})).ShouldNot(Succeed()) + bad = &FaultToleranceConfig{AdmissionGracePeriod: 10 * time.Second, WarmupGracePeriod: 1 * time.Second} + Expect(ValidateAppWrapperConfig(&AppWrapperConfig{FaultTolerance: bad})).ShouldNot(Succeed()) + bad = &FaultToleranceConfig{SuccessTTL: -1 * time.Second} Expect(ValidateAppWrapperConfig(&AppWrapperConfig{FaultTolerance: bad})).ShouldNot(Succeed()) }) From 82cca1b5eabdc769d3636f9b414e2de34add5694 Mon Sep 17 00:00:00 2001 From: David Grove Date: Thu, 12 Dec 2024 18:10:50 -0500 Subject: [PATCH 13/13] prepare for 0.28 release (#279) --- README.md | 2 +- site/_config.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3edc862..5a23458 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ To install the latest release of AppWrapper in a Kubernetes cluster with Kueue a and configured, simply run the command: ```sh -kubectl apply --server-side -f https://github.com/project-codeflare/appwrapper/releases/download/v0.27.0/install.yaml +kubectl apply --server-side -f https://github.com/project-codeflare/appwrapper/releases/download/v0.28.0/install.yaml ``` The controller runs in the `appwrapper-system` namespace. diff --git a/site/_config.yml b/site/_config.yml index f0ae32d..4a2a980 100644 --- a/site/_config.yml +++ b/site/_config.yml @@ -25,7 +25,7 @@ repository: project-codeflare/appwrapper # Variables for use in pages gh_main_url: https://github.com/project-codeflare/appwrapper/blob/main -appwrapper_version: v0.27.0 +appwrapper_version: v0.28.0 # Outputting permalink: /:categories/:title/