From 86c39231f4103d739de6c110ae81836ca1e72f80 Mon Sep 17 00:00:00 2001 From: Jayson Vantuyl Date: Fri, 22 May 2026 01:40:01 +0000 Subject: [PATCH] Fix ROCm build: fix bazel paths and gcc detection When building XLA with ROCm, Bazel's header validation rejects absolute-path includes from the ROCm clang resource directory because rocm_configure.bzl only adds the bazel-cache-relative path to cxx_builtin_include_directories. Additionally, the rocm_configure repository rule cannot see TF_ROCM_CLANG and CLANG_COMPILER_PATH because they were set via --action_env (build actions only) rather than --repo_env (repository rules), causing the crosstool wrapper to default to gcc. This commit: - Adds a patch to rocm_configure.bzl that adds both the canonical (symlink-resolved) and the symlink-based ($ROCM_PATH/llvm) absolute ROCm clang resource paths to cxx_builtin_include_directories - Passes TF_ROCM_CLANG and CLANG_COMPILER_PATH via --repo_env instead of --action_env (dropping the --action_env TF_HIPCC_CLANG setting) so the repository rule generates a clang-based crosstool wrapper - Adds TF_ROCM_CLANG and CLANG_COMPILER_PATH as ENV vars in the Dockerfile for container builds - Adds --network host to the ROCm image build command in builds/build.sh --- builds/Dockerfile | 6 ++++ builds/build.sh | 1 + extension/patches/apply.sh | 11 +++++++ .../patches/rocm_absolute_includes.patch | 29 +++++++++++++++++++ lib/xla.ex | 8 +++-- 5 files changed, 53 insertions(+), 2 deletions(-) create mode 100644 extension/patches/rocm_absolute_includes.patch diff --git a/builds/Dockerfile b/builds/Dockerfile index 3a80230..9e6afba 100644 --- a/builds/Dockerfile +++ b/builds/Dockerfile @@ -45,6 +45,12 @@ RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates ENV ROCM_PATH "/opt/rocm" +# Set these as ENV vars so that the rocm_configure repository rule +# can see them when generating the crosstool wrapper. Without these, +# the wrapper defaults to gcc and fails on Clang-only flags. 
+ENV TF_ROCM_CLANG="1" +ENV CLANG_COMPILER_PATH="/usr/lib/llvm-18/bin/clang" + FROM base-${VARIANT} # Set the missing UTF-8 locale, otherwise Elixir warns diff --git a/builds/build.sh b/builds/build.sh index 5cbbc7c..718f18a 100755 --- a/builds/build.sh +++ b/builds/build.sh @@ -71,6 +71,7 @@ case "$target" in --build-arg BASE_IMAGE=$base_image \ --build-arg ROCM_VERSION=$rocm_ver \ --build-arg XLA_TARGET=rocm \ + --network host \ . ;; diff --git a/extension/patches/apply.sh b/extension/patches/apply.sh index ef83574..eba6cb5 100644 --- a/extension/patches/apply.sh +++ b/extension/patches/apply.sh @@ -21,3 +21,14 @@ arch="$(uname -m)" # See https://github.com/tensorflow/tensorflow/pull/86413 and the # referenced threads. git apply $dir/cuda_ncrtc_builtins.patch + +# When building XLA with ROCm, the compiler resolves symlinks in the +# local_config_rocm repository and reports include paths at the real +# absolute location (e.g. /opt/rocm/llvm/lib/clang/22/include) +# rather than the symlinked bazel-cache path. Bazel's header +# validation rejects these as "absolute path inclusions" unless they +# are listed in cxx_builtin_include_directories. This patch adds +# the absolute resource directory paths alongside the relative ones. 
+if [[ -n "${XLA_TARGET:-}" && "${XLA_TARGET}" == "rocm" ]]; then + git apply $dir/rocm_absolute_includes.patch +fi diff --git a/extension/patches/rocm_absolute_includes.patch b/extension/patches/rocm_absolute_includes.patch new file mode 100644 index 0000000..762f0fc --- /dev/null +++ b/extension/patches/rocm_absolute_includes.patch @@ -0,0 +1,29 @@ +diff --git a/third_party/gpus/rocm_configure.bzl b/third_party/gpus/rocm_configure.bzl +index 1510140..6ab54b4 100644 +--- a/third_party/gpus/rocm_configure.bzl ++++ b/third_party/gpus/rocm_configure.bzl +@@ -163,6 +163,24 @@ def _rocm_include_path(repository_ctx, rocm_config, bash_bin): + inc_dirs.append(resource_dir + "/include") + inc_dirs.append(resource_dir + "/share") + ++ # Also add the absolute paths, because the compiler may resolve ++ # symlinks and report includes at the real location rather than ++ # the symlinked bazel-cache path. Bazel's header validation rejects ++ # absolute path inclusions unless they are in cxx_builtin_include_directories. ++ # ++ # resource_dir_abs is the canonical path (e.g. /opt/rocm/lib/llvm/lib/clang/22). ++ # The ROCm installation typically has /opt/rocm/llvm -> ./lib/llvm, so the ++ # compiler may report includes via the symlink path instead. We must include ++ # both the real path and the symlink-based path. 
++ inc_dirs.append(resource_dir_abs + "/include") ++ inc_dirs.append(resource_dir_abs + "/share") ++ rocm_path_env = get_host_environ(repository_ctx, "ROCM_PATH", "/opt/rocm") ++ if rocm_path_env: ++ clang_version = resource_dir_abs.split("/")[-1] ++ rocm_prefix = rocm_path_env.rstrip("/") ++ inc_dirs.append(rocm_prefix + "/llvm/lib/clang/" + clang_version + "/include") ++ inc_dirs.append(rocm_prefix + "/llvm/lib/clang/" + clang_version + "/share") ++ + return inc_dirs + + def _enable_rocm(repository_ctx): diff --git a/lib/xla.ex b/lib/xla.ex index 6caf0f6..bf993a5 100644 --- a/lib/xla.ex +++ b/lib/xla.ex @@ -356,8 +356,12 @@ defmodule XLA do "rocm" <> _ -> [ "--config=rocm", - ~s/--action_env=TF_ROCM_CLANG="1"/, - ~s/--action_env=TF_HIPCC_CLANG="1"/, + # These must be repo_env (not action_env) so that the + # rocm_configure repository rule sees them when generating + # the crosstool wrapper. Otherwise the wrapper defaults to + # gcc and chokes on Clang-only flags like -Qunused-arguments. + ~s/--repo_env=TF_ROCM_CLANG="1"/, + ~s|--repo_env=CLANG_COMPILER_PATH="/usr/lib/llvm-18/bin/clang"|, # See https://github.com/jax-ml/jax/blob/098e953afb2b83daf85e6456c89e896f9cfd483d/.bazelrc#L239 # GPU targets: MI200 (gfx90a), MI300 (gfx942), RDNA2 (gfx1030), RDNA3 (gfx1100), RDNA4 (gfx120x) # Note: gfx900/906/908 (Vega, MI50/60, MI100) removed - deprecated in ROCm 7.x