@@ -142,3 +142,40 @@ def _reduce_modelconfig(mc: ModelConfig):
 ' lead to a later error. If remote code is not needed'
 ' remove `--trust-remote-code`',
 exc_info=e)
+
+
+def monkey_patch_hf_modules_cache():
+    """Monkey patch HF_MODULES_CACHE to a temporary directory per process. This
+    is necessary to avoid conflicts when multiple processes try to read/write
+    to the same HF_MODULES_CACHE directory, especially in multi-GPU setups.
+
+    Modified from: https://github.com/InternLM/xtuner/blob/main/xtuner/v1/utils/misc.py
+    """
+    import os
+
+    import transformers
+    from huggingface_hub import constants
+
+    # When using remote code in HF components such as the tokenizer or config
+    # (e.g., `AutoConfig.from_pretrained(hf_model_path, trust_remote_code=True)`),
+    # the remote modules from hf_model_path are copied into HF_MODULES_CACHE.
+    # On multi-GPU machines (e.g., 8 GPUs), simultaneous read/write operations
+    # by multiple processes on this shared directory can cause conflicts.
+    # Therefore, we set HF_MODULES_CACHE to a temporary directory per process.
+
+    HF_PATCH_MODULES_CACHE_PREFIX = 'modules_pid_'
+    modules_cache = os.path.join(constants.HF_HOME, f'{HF_PATCH_MODULES_CACHE_PREFIX}{os.getpid()}')
+    os.environ['HF_MODULES_CACHE'] = modules_cache
+
+    transformers.utils.hub.HF_MODULES_CACHE = modules_cache
+
+    # At import time, Python creates a new name HF_MODULES_CACHE in the namespace
+    # of the dynamic_module_utils module, binding it to the object referenced by
+    # transformers.utils.HF_MODULES_CACHE at that moment.
+    # Hence, we also need to rebind transformers.dynamic_module_utils.HF_MODULES_CACHE
+    # to the new modules_cache.
+
+    transformers.dynamic_module_utils.HF_MODULES_CACHE = modules_cache
+    transformers.utils.HF_MODULES_CACHE = modules_cache
+
+    logger.info(f'Set HF_MODULES_CACHE to {modules_cache} for current process {os.getpid()}')
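For context, a minimal usage sketch of how the patch might be applied, assuming one worker process per GPU. This is not part of the patch; the model name, process count, and spawn layout below are placeholders. The key point is that monkey_patch_hf_modules_cache() runs in each worker before any remote-code component is loaded, so every process resolves its own HF_MODULES_CACHE directory.

import multiprocessing as mp


def worker(hf_model_path: str):
    # Assumed importable from the module this patch modifies.
    monkey_patch_hf_modules_cache()

    from transformers import AutoConfig

    # With a per-process cache in place, concurrent loads of remote code no
    # longer race on a shared HF_MODULES_CACHE directory.
    config = AutoConfig.from_pretrained(hf_model_path, trust_remote_code=True)
    print(config.model_type)


if __name__ == '__main__':
    # e.g. 8 processes for an 8-GPU machine; the model id is a placeholder.
    procs = [mp.Process(target=worker, args=('my-org/my-remote-code-model', ))
             for _ in range(8)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()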