From 30833e7c3db9245c9e78e95ac93a5c115a8dcdf5 Mon Sep 17 00:00:00 2001 From: VEERA GOPU Date: Wed, 28 May 2025 17:33:00 -0500 Subject: [PATCH 1/8] Added Dockerfile for CI images --- docker/Dockerfile | 67 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 docker/Dockerfile diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 000000000..af78091e8 --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,67 @@ +# CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'} +ARG BASE_DOCKER=rocm/pytorch:rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.5.1 +FROM $BASE_DOCKER +WORKDIR /var/lib/jenkins + +RUN apt update \ + && apt install -y nano wget ninja-build \ + && apt install -y python3 python3-pip git \ + && apt install -y sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev + +RUN python3 -m pip install --upgrade pip +RUN pip install ninja cmake setuptools wheel +RUN pip install uv tabulate +RUN pip install ipython pytest fire pydantic pybind11 + +RUN pip uninstall -y torch + +RUN apt --fix-broken install -y +RUN apt install -y libzstd-dev +RUN apt install -y libibverbs-dev + +ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer +ENV PATH=$PATH:/opt/rocm/bin: +ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/: + +# Install pytorch +ARG PYTORCH_COMMIT="f929e0d602a71aa393ca2e6097674b210bdf321c" +ENV PYTORCH_ROCM_ARCH=gfx942 +RUN rm -fr pytorch \ + && git clone https://github.com/pytorch/pytorch \ + && cd pytorch \ + && git fetch origin ${PYTORCH_COMMIT} \ + && git checkout -q ${PYTORCH_COMMIT} \ + && git submodule update --recursive --init \ + && ./tools/amd_build/build_amd.py \ + && BUILD_TEST=0 python3 setup.py install + +WORKDIR /var/lib/jenkins + +# Install flash-attention +ENV GPU_ARCHS=${PYTORCH_ROCM_ARCH} +RUN git clone https://github.com/ROCm/flash-attention.git \ + && cd flash-attention \ + && git checkout v2.7.3-cktile \ + && pip install . + +WORKDIR /var/lib/jenkins + +# Install jax +RUN git clone -b rocm-jaxlib-v0.4.35-qa https://github.com/ROCm/jax.git \ + && git clone -b rocm-jaxlib-v0.4.35-qa https://github.com/ROCm/xla.git \ + && cd jax \ + && echo 'run:rocm_plugin --copt=-DLEGACY_HIPBLAS_DIRECT' > .bazelrc.user \ + && python3 ./build/build.py --enable_rocm \ + --build_gpu_plugin \ + --use_clang=true \ + --clang_path=/opt/rocm-6.4.0/lib/llvm/bin/clang \ + --gpu_plugin_rocm_version=60 \ + --rocm_path=/opt/rocm-6.4.0/ \ + --rocm_amdgpu_targets=gfx942 \ + --bazel_options=--override_repository=xla=/var/lib/jenkins/xla \ + && pip install jax==0.4.35 \ + && python3 setup.py develop --user && python3 -m pip install dist/*.whl \ + && pip install jax==0.4.35 + +WORKDIR /workspace/ +CMD ["/bin/bash"] \ No newline at end of file From 342eb9a8279e78d0cd46dc272cc71fcb99b2b510 Mon Sep 17 00:00:00 2001 From: VEERA GOPU Date: Thu, 29 May 2025 02:21:30 -0500 Subject: [PATCH 2/8] Addressed reviews --- docker/{Dockerfile => Dockerfile.ci.deps} | 38 +++++++++++------------ 1 file changed, 19 insertions(+), 19 deletions(-) rename docker/{Dockerfile => Dockerfile.ci.deps} (66%) diff --git a/docker/Dockerfile b/docker/Dockerfile.ci.deps similarity index 66% rename from docker/Dockerfile rename to docker/Dockerfile.ci.deps index af78091e8..a454d441e 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile.ci.deps @@ -1,21 +1,20 @@ -# CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'} +# TE CI Dockerfile ARG BASE_DOCKER=rocm/pytorch:rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.5.1 FROM $BASE_DOCKER WORKDIR /var/lib/jenkins +RUN apt --fix-broken install -y RUN apt update \ && apt install -y nano wget ninja-build \ && apt install -y python3 python3-pip git \ && apt install -y sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev RUN python3 -m pip install --upgrade pip -RUN pip install ninja cmake setuptools wheel -RUN pip install uv tabulate +RUN pip install cmake setuptools wheel RUN pip install ipython pytest fire pydantic pybind11 -RUN pip uninstall -y torch +# RUN pip uninstall -y torch -RUN apt --fix-broken install -y RUN apt install -y libzstd-dev RUN apt install -y libibverbs-dev @@ -24,8 +23,8 @@ ENV PATH=$PATH:/opt/rocm/bin: ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/: # Install pytorch -ARG PYTORCH_COMMIT="f929e0d602a71aa393ca2e6097674b210bdf321c" -ENV PYTORCH_ROCM_ARCH=gfx942 +# ARG PYTORCH_COMMIT="f929e0d602a71aa393ca2e6097674b210bdf321c" +ARG PYTORCH_ROCM_ARCH=gfx942 RUN rm -fr pytorch \ && git clone https://github.com/pytorch/pytorch \ && cd pytorch \ @@ -33,23 +32,24 @@ RUN rm -fr pytorch \ && git checkout -q ${PYTORCH_COMMIT} \ && git submodule update --recursive --init \ && ./tools/amd_build/build_amd.py \ - && BUILD_TEST=0 python3 setup.py install - -WORKDIR /var/lib/jenkins + && BUILD_TEST=0 python3 setup.py install \ + && cd .. # Install flash-attention ENV GPU_ARCHS=${PYTORCH_ROCM_ARCH} +ARG FLASH_COMMIT="b3c68b169824a58df339e4fcb0ad5e5a3e4d4327" RUN git clone https://github.com/ROCm/flash-attention.git \ && cd flash-attention \ - && git checkout v2.7.3-cktile \ - && pip install . - -WORKDIR /var/lib/jenkins + && git fetch origin ${FLASH_COMMIT} \ + && git checkout -q ${FLASH_COMMIT} \ + && pip install . \ + && cd .. # Install jax -RUN git clone -b rocm-jaxlib-v0.4.35-qa https://github.com/ROCm/jax.git \ - && git clone -b rocm-jaxlib-v0.4.35-qa https://github.com/ROCm/xla.git \ - && cd jax \ +ARG JAX_COMMIT="58e53c664a30015eac865d57b4987827460d67b0" +ARG XLA_COMMIT="fe4a1ec96238c765874ebc76f17184df0d2c7b1f" +RUN git clone https://github.com/ROCm/xla.git && cd xla && git fetch origin ${XLA_COMMIT} && git checkout -q ${XLA_COMMIT} && cd .. \ + && git clone https://github.com/ROCm/jax.git && cd jax && git fetch origin ${JAX_COMMIT} && git checkout -q ${JAX_COMMIT} \ && echo 'run:rocm_plugin --copt=-DLEGACY_HIPBLAS_DIRECT' > .bazelrc.user \ && python3 ./build/build.py --enable_rocm \ --build_gpu_plugin \ @@ -57,11 +57,11 @@ RUN git clone -b rocm-jaxlib-v0.4.35-qa https://github.com/ROCm/jax.git \ --clang_path=/opt/rocm-6.4.0/lib/llvm/bin/clang \ --gpu_plugin_rocm_version=60 \ --rocm_path=/opt/rocm-6.4.0/ \ - --rocm_amdgpu_targets=gfx942 \ + --rocm_amdgpu_targets=${GPU_ARCH} \ --bazel_options=--override_repository=xla=/var/lib/jenkins/xla \ && pip install jax==0.4.35 \ && python3 setup.py develop --user && python3 -m pip install dist/*.whl \ && pip install jax==0.4.35 - + WORKDIR /workspace/ CMD ["/bin/bash"] \ No newline at end of file From 7ba78c37c178fc3783fc28eb8b1de31c02ffbefe Mon Sep 17 00:00:00 2001 From: VEERA GOPU Date: Thu, 29 May 2025 10:08:08 -0500 Subject: [PATCH 3/8] Addressed reviews --- docker/Dockerfile.ci.deps | 1 - 1 file changed, 1 deletion(-) diff --git a/docker/Dockerfile.ci.deps b/docker/Dockerfile.ci.deps index a454d441e..caf5c19c8 100644 --- a/docker/Dockerfile.ci.deps +++ b/docker/Dockerfile.ci.deps @@ -50,7 +50,6 @@ ARG JAX_COMMIT="58e53c664a30015eac865d57b4987827460d67b0" ARG XLA_COMMIT="fe4a1ec96238c765874ebc76f17184df0d2c7b1f" RUN git clone https://github.com/ROCm/xla.git && cd xla && git fetch origin ${XLA_COMMIT} && git checkout -q ${XLA_COMMIT} && cd .. \ && git clone https://github.com/ROCm/jax.git && cd jax && git fetch origin ${JAX_COMMIT} && git checkout -q ${JAX_COMMIT} \ - && echo 'run:rocm_plugin --copt=-DLEGACY_HIPBLAS_DIRECT' > .bazelrc.user \ && python3 ./build/build.py --enable_rocm \ --build_gpu_plugin \ --use_clang=true \ From 4fff8ff1b5b9f5ca7d74f97a3494838284cb9418 Mon Sep 17 00:00:00 2001 From: VEERA GOPU Date: Thu, 29 May 2025 10:57:07 -0500 Subject: [PATCH 4/8] Addressed flash attention commit to tag --- docker/Dockerfile.ci.deps | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docker/Dockerfile.ci.deps b/docker/Dockerfile.ci.deps index caf5c19c8..f9683eea5 100644 --- a/docker/Dockerfile.ci.deps +++ b/docker/Dockerfile.ci.deps @@ -37,11 +37,9 @@ RUN rm -fr pytorch \ # Install flash-attention ENV GPU_ARCHS=${PYTORCH_ROCM_ARCH} -ARG FLASH_COMMIT="b3c68b169824a58df339e4fcb0ad5e5a3e4d4327" RUN git clone https://github.com/ROCm/flash-attention.git \ && cd flash-attention \ - && git fetch origin ${FLASH_COMMIT} \ - && git checkout -q ${FLASH_COMMIT} \ + && git checkout v2.7.3-cktile \ && pip install . \ && cd .. From 41c9e4213f209c647fc2556df39e0fb45d1fbba7 Mon Sep 17 00:00:00 2001 From: Veera Rajasekhar Reddy Gopu Date: Wed, 11 Feb 2026 03:04:12 +0000 Subject: [PATCH 5/8] Moved Dockerfile.ci.deps to .github/scripts, Updated dockerfile --- .github/scripts/Dockerfile.ci.deps | 27 +++++++++++++ docker/Dockerfile.ci.deps | 64 ------------------------------ 2 files changed, 27 insertions(+), 64 deletions(-) create mode 100644 .github/scripts/Dockerfile.ci.deps delete mode 100644 docker/Dockerfile.ci.deps diff --git a/.github/scripts/Dockerfile.ci.deps b/.github/scripts/Dockerfile.ci.deps new file mode 100644 index 000000000..fa5ea657f --- /dev/null +++ b/.github/scripts/Dockerfile.ci.deps @@ -0,0 +1,27 @@ +# TE CI Dockerfile +ARG BASE_DOCKER=registry-sc-harbor.amd.com/framework/compute-rocm-rel-7.2:57_ubuntu22.04_py3.11_pytorch_release-2.8_08d38866 +FROM $BASE_DOCKER +WORKDIR / + +RUN pip install setuptools wheel +RUN pip install ipython pytest fire pydantic pybind11 ninja pandas +RUN apt-get update && apt-get install -y vim + +ARG PYTORCH_ROCM_ARCH=gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx950;gfx1151 + +# Install flash-attention v2.8.1 +ENV GPU_ARCHS=gfx90a;gfx950;gfx942 +RUN git clone --branch v2.8.1 --depth 1 https://github.com/Dao-AILab/flash-attention.git \ + && cd flash-attention \ + && FLASH_ATTENTION_TRITON_AMD_ENABLE=TRUE && FLASH_ATTENTION_SKIP_CK_BUILD=FALSE python setup.py install \ + && cd .. + + +RUN pip install \ + https://repo.radeon.com/rocm/manylinux/rocm-rel-7.2/jax_rocm7_pjrt-0.8.0%2Brocm7.2.0-py3-none-manylinux_2_28_x86_64.whl \ + https://repo.radeon.com/rocm/manylinux/rocm-rel-7.2/jax_rocm7_plugin-0.8.0%2Brocm7.2.0-cp311-cp311-manylinux_2_28_x86_64.whl \ + jax==0.8.0 \ + https://repo.radeon.com/rocm/manylinux/rocm-rel-7.2/jaxlib-0.8.0%2Brocm7.2.0-cp311-cp311-manylinux_2_27_x86_64.whl + +WORKDIR /workspace/ +CMD ["/bin/bash"] \ No newline at end of file diff --git a/docker/Dockerfile.ci.deps b/docker/Dockerfile.ci.deps deleted file mode 100644 index f9683eea5..000000000 --- a/docker/Dockerfile.ci.deps +++ /dev/null @@ -1,64 +0,0 @@ -# TE CI Dockerfile -ARG BASE_DOCKER=rocm/pytorch:rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.5.1 -FROM $BASE_DOCKER -WORKDIR /var/lib/jenkins - -RUN apt --fix-broken install -y -RUN apt update \ - && apt install -y nano wget ninja-build \ - && apt install -y python3 python3-pip git \ - && apt install -y sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev - -RUN python3 -m pip install --upgrade pip -RUN pip install cmake setuptools wheel -RUN pip install ipython pytest fire pydantic pybind11 - -# RUN pip uninstall -y torch - -RUN apt install -y libzstd-dev -RUN apt install -y libibverbs-dev - -ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer -ENV PATH=$PATH:/opt/rocm/bin: -ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/: - -# Install pytorch -# ARG PYTORCH_COMMIT="f929e0d602a71aa393ca2e6097674b210bdf321c" -ARG PYTORCH_ROCM_ARCH=gfx942 -RUN rm -fr pytorch \ - && git clone https://github.com/pytorch/pytorch \ - && cd pytorch \ - && git fetch origin ${PYTORCH_COMMIT} \ - && git checkout -q ${PYTORCH_COMMIT} \ - && git submodule update --recursive --init \ - && ./tools/amd_build/build_amd.py \ - && BUILD_TEST=0 python3 setup.py install \ - && cd .. - -# Install flash-attention -ENV GPU_ARCHS=${PYTORCH_ROCM_ARCH} -RUN git clone https://github.com/ROCm/flash-attention.git \ - && cd flash-attention \ - && git checkout v2.7.3-cktile \ - && pip install . \ - && cd .. - -# Install jax -ARG JAX_COMMIT="58e53c664a30015eac865d57b4987827460d67b0" -ARG XLA_COMMIT="fe4a1ec96238c765874ebc76f17184df0d2c7b1f" -RUN git clone https://github.com/ROCm/xla.git && cd xla && git fetch origin ${XLA_COMMIT} && git checkout -q ${XLA_COMMIT} && cd .. \ - && git clone https://github.com/ROCm/jax.git && cd jax && git fetch origin ${JAX_COMMIT} && git checkout -q ${JAX_COMMIT} \ - && python3 ./build/build.py --enable_rocm \ - --build_gpu_plugin \ - --use_clang=true \ - --clang_path=/opt/rocm-6.4.0/lib/llvm/bin/clang \ - --gpu_plugin_rocm_version=60 \ - --rocm_path=/opt/rocm-6.4.0/ \ - --rocm_amdgpu_targets=${GPU_ARCH} \ - --bazel_options=--override_repository=xla=/var/lib/jenkins/xla \ - && pip install jax==0.4.35 \ - && python3 setup.py develop --user && python3 -m pip install dist/*.whl \ - && pip install jax==0.4.35 - -WORKDIR /workspace/ -CMD ["/bin/bash"] \ No newline at end of file From d1ecdd3bc2575119872ca5a34392f983d0af2834 Mon Sep 17 00:00:00 2001 From: Veera Rajasekhar Reddy Gopu Date: Thu, 12 Feb 2026 01:51:51 +0000 Subject: [PATCH 6/8] Addressed comments --- .github/scripts/Dockerfile.ci.deps | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/.github/scripts/Dockerfile.ci.deps b/.github/scripts/Dockerfile.ci.deps index fa5ea657f..fd7207536 100644 --- a/.github/scripts/Dockerfile.ci.deps +++ b/.github/scripts/Dockerfile.ci.deps @@ -1,27 +1,35 @@ -# TE CI Dockerfile +# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. + +## TE CI Dockerfile ARG BASE_DOCKER=registry-sc-harbor.amd.com/framework/compute-rocm-rel-7.2:57_ubuntu22.04_py3.11_pytorch_release-2.8_08d38866 FROM $BASE_DOCKER WORKDIR / +# Build arguments +ARG FA_VERSION=v2.8.1 +ARG ROCM_VERSION=7.2 +ARG JAX_VERSION=0.8.0 +ARG PYTHON_VERSION=311 + RUN pip install setuptools wheel RUN pip install ipython pytest fire pydantic pybind11 ninja pandas RUN apt-get update && apt-get install -y vim -ARG PYTORCH_ROCM_ARCH=gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx950;gfx1151 - -# Install flash-attention v2.8.1 +# Install flash-attention ENV GPU_ARCHS=gfx90a;gfx950;gfx942 -RUN git clone --branch v2.8.1 --depth 1 https://github.com/Dao-AILab/flash-attention.git \ +RUN git clone --branch ${FA_VERSION} --depth 1 https://github.com/Dao-AILab/flash-attention.git \ && cd flash-attention \ && FLASH_ATTENTION_TRITON_AMD_ENABLE=TRUE && FLASH_ATTENTION_SKIP_CK_BUILD=FALSE python setup.py install \ && cd .. - -RUN pip install \ - https://repo.radeon.com/rocm/manylinux/rocm-rel-7.2/jax_rocm7_pjrt-0.8.0%2Brocm7.2.0-py3-none-manylinux_2_28_x86_64.whl \ - https://repo.radeon.com/rocm/manylinux/rocm-rel-7.2/jax_rocm7_plugin-0.8.0%2Brocm7.2.0-cp311-cp311-manylinux_2_28_x86_64.whl \ - jax==0.8.0 \ - https://repo.radeon.com/rocm/manylinux/rocm-rel-7.2/jaxlib-0.8.0%2Brocm7.2.0-cp311-cp311-manylinux_2_27_x86_64.whl +# Install JAX +RUN ROCM_MAJOR=$(echo "${ROCM_VERSION}" | cut -d. -f1) && pip install \ + https://repo.radeon.com/rocm/manylinux/rocm-rel-${ROCM_VERSION}/jax_rocm${ROCM_MAJOR}_pjrt-${JAX_VERSION}%2Brocm${ROCM_VERSION}.0-py3-none-manylinux_2_28_x86_64.whl \ + https://repo.radeon.com/rocm/manylinux/rocm-rel-${ROCM_VERSION}/jax_rocm${ROCM_MAJOR}_plugin-${JAX_VERSION}%2Brocm${ROCM_VERSION}.0-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux_2_28_x86_64.whl \ + jax==${JAX_VERSION} \ + https://repo.radeon.com/rocm/manylinux/rocm-rel-${ROCM_VERSION}/jaxlib-${JAX_VERSION}%2Brocm${ROCM_VERSION}.0-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux_2_27_x86_64.whl WORKDIR /workspace/ CMD ["/bin/bash"] \ No newline at end of file From bdc75a221518850c07065af97ad9be77f7f107fb Mon Sep 17 00:00:00 2001 From: Veera Rajasekhar Reddy Gopu Date: Thu, 12 Feb 2026 01:52:59 +0000 Subject: [PATCH 7/8] Adjusted tolerance for gfx942 for ROCm7.2 support --- tests/pytorch/attention/test_kv_cache.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/pytorch/attention/test_kv_cache.py b/tests/pytorch/attention/test_kv_cache.py index ba5ab7f87..8d2518d3e 100644 --- a/tests/pytorch/attention/test_kv_cache.py +++ b/tests/pytorch/attention/test_kv_cache.py @@ -31,6 +31,7 @@ init_method_normal, scaled_init_method_normal, is_bf16_compatible, + get_device_compute_capability, ) _current_file = pathlib.Path(__file__).resolve() @@ -378,7 +379,7 @@ def get_tols(config, module, backend, dtype): # With FA on ROCm it may not fit default tolerance if IS_HIP_EXTENSION and backend == "FlashAttention": tols = { - torch.half: (5e-3, 5e-3), + torch.half: (6e-3, 6e-3) if get_device_compute_capability() == (9, 4) else (5e-3, 5e-3), torch.bfloat16: (4e-2, 4e-2), } else: From dde1a02bee6c646f4b918cf5c0c84d72e69f8f37 Mon Sep 17 00:00:00 2001 From: Veera Rajasekhar Reddy Gopu Date: Thu, 12 Feb 2026 01:56:24 +0000 Subject: [PATCH 8/8] Updated CI docker image --- ci/ci_config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/ci_config.json b/ci/ci_config.json index a7b3d5d6c..4123a3ed2 100644 --- a/ci/ci_config.json +++ b/ci/ci_config.json @@ -1,6 +1,6 @@ { "docker_images": { - "default": "registry-sc-harbor.amd.com/framework/te-ci:rocm-7.1.1_ubuntu22.04_py3.11_pytorch_release_2.8_63e525b2_jax_0.7.1_fa-2.8.0", + "default": "registry-sc-harbor.amd.com/framework/te-ci:rocm-7.2_ubuntu22.04_py3.11_pytorch_release-2.8_08d38866_jax_0.8.0_fa_2.8.1", "release_v1.13": "compute-artifactory.amd.com:5000/rocm-plus-docker/framework/private/te-ci:rocm-6.4_0_ubuntu22_py310_torch25_jax0435qa_fa273", "release_v1.14": "compute-artifactory.amd.com:5000/rocm-plus-docker/framework/private/te-ci:rocm-6.4_0_ubuntu22_py310_torch25_jax0435qa_fa273" }