Commit d38f032

fix hf modules read/write conflicts by multi processors (#4188)
Parent: b591b28

File tree: 2 files changed (+39, −1)

  lmdeploy/pytorch/engine/model_agent.py
  lmdeploy/pytorch/utils.py

lmdeploy/pytorch/engine/model_agent.py (2 additions, 1 deletion)

@@ -28,7 +28,7 @@
 from ..spec_decode import build_spec_agent
 from ..strategies import build_strategy_factory
 from ..strategies.base.model_agent import ExtraInputs, ExtraOutputs, StoppingCriteria
-from ..utils import get_gpu_memory
+from ..utils import get_gpu_memory, monkey_patch_hf_modules_cache
 from ..weight_loader.model_weight_loader import ModelWeightLoader, load_model_weights
 from .cache_engine import CacheEngine, StateCacheEngine
 from .guided_process import GuidedDecodingManager

@@ -325,6 +325,7 @@ def __init__(
         self.model_config = model_config
         self.cache_config = cache_config
         # use raw tokenizer
+        monkey_patch_hf_modules_cache()
         self.tokenizer = Tokenizer(model_path).model.model

         self._pre_in_que = None
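Why the call site matters: each model-agent worker process builds its own tokenizer, and a `trust_remote_code` load is what populates HF_MODULES_CACHE, so the patch must run before that first load. A minimal sketch of the per-process pattern this commit establishes (the AutoTokenizer call and model path are illustrative, not part of the diff):

import os

from transformers import AutoTokenizer

from lmdeploy.pytorch.utils import monkey_patch_hf_modules_cache

# Redirect this process's dynamic-module cache before any remote-code load,
# so downloaded modeling files land in a PID-specific directory.
monkey_patch_hf_modules_cache()
assert f'modules_pid_{os.getpid()}' in os.environ['HF_MODULES_CACHE']

# Illustrative remote-code load; without the patch, e.g. 8 worker processes
# would race to copy the same files into one shared HF_MODULES_CACHE.
tokenizer = AutoTokenizer.from_pretrained('path/to/remote-code-model', trust_remote_code=True)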

lmdeploy/pytorch/utils.py (37 additions, 0 deletions)

@@ -142,3 +142,40 @@ def _reduce_modelconfig(mc: ModelConfig):
                ' lead to a later error. If remote code is not needed'
                ' remove `--trust-remote-code`',
                exc_info=e)
+
+
+def monkey_patch_hf_modules_cache():
+    """Monkey patch HF_MODULES_CACHE to a temporary directory per process.
+    This avoids conflicts when multiple processes try to read/write the same
+    HF_MODULES_CACHE directory, especially in multi-GPU setups.
+
+    Modified from: https://github.com/InternLM/xtuner/blob/main/xtuner/v1/utils/misc.py
+    """
+    import os
+
+    import transformers
+    from huggingface_hub import constants
+
+    # When using `remote_code` in HF components like the tokenizer or config
+    # (e.g., `AutoConfig.from_pretrained(hf_model_path, trust_remote_code=True)`),
+    # the remote code under hf_model_path is copied into HF_MODULES_CACHE.
+    # On multi-GPU machines (e.g., 8 GPUs), simultaneous read/write operations
+    # by multiple processes on this shared directory can cause conflicts.
+    # Therefore, we set HF_MODULES_CACHE to a per-process temporary directory.
+    HF_PATCH_MODULES_CACHE_PREFIX = 'modules_pid_'
+    modules_cache = os.path.join(constants.HF_HOME, f'{HF_PATCH_MODULES_CACHE_PREFIX}{os.getpid()}')
+    os.environ['HF_MODULES_CACHE'] = modules_cache
+
+    transformers.utils.hub.HF_MODULES_CACHE = modules_cache
+
+    # During import, Python creates a new name HF_MODULES_CACHE in the namespace
+    # of the dynamic_module_utils module, binding it to the object referenced by
+    # transformers.utils.HF_MODULES_CACHE at that moment.
+    # Hence, we also need to set transformers.dynamic_module_utils.HF_MODULES_CACHE
+    # to the new modules_cache.
+
+    transformers.dynamic_module_utils.HF_MODULES_CACHE = modules_cache
+    transformers.utils.HF_MODULES_CACHE = modules_cache
+
+    logger.info(f'Set HF_MODULES_CACHE to {modules_cache} for current process {os.getpid()}')
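The name-binding comment is the subtle part: `from .utils import HF_MODULES_CACHE` copies a reference into the importer's namespace at import time, so rebinding the source attribute afterwards leaves that copy stale. A self-contained illustration of the pitfall (the module objects here are stand-ins built for the demo, not the real transformers modules):

import types

# Stand-in for transformers.utils, which defines the constant.
hf_utils = types.ModuleType('hf_utils')
hf_utils.HF_MODULES_CACHE = '/shared/modules'

# Stand-in for `from .utils import HF_MODULES_CACHE` inside
# dynamic_module_utils: the name binds to the object itself.
dyn_utils = types.ModuleType('dyn_utils')
dyn_utils.HF_MODULES_CACHE = hf_utils.HF_MODULES_CACHE

# Rebinding only the source leaves the importer's copy stale...
hf_utils.HF_MODULES_CACHE = '/per-process/modules_pid_1234'
print(dyn_utils.HF_MODULES_CACHE)  # /shared/modules  (stale)

# ...which is why the patch rebinds both module attributes explicitly.
dyn_utils.HF_MODULES_CACHE = hf_utils.HF_MODULES_CACHE
print(dyn_utils.HF_MODULES_CACHE)  # /per-process/modules_pid_1234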
