
Commit 2effbbb

Fix Falcon3-7b WOQ TP=3 with int4 checkpoint (#3402)
* Falcon3-7b WOQ TP=3 with int4 checkpoint
* Add assert
* Split mha according to kv_heads
* Fix bug with old deepspeed
* Fix run_accuracy_with_deepspeed.py
1 parent 136c1d0 commit 2effbbb

File tree: 2 files changed, +69 −51 lines

examples/cpu/llm/inference/distributed/run_accuracy_with_deepspeed.py

Lines changed: 32 additions & 28 deletions
@@ -224,25 +224,35 @@ def get_int_from_env(env_keys, default):
 TokenSequence = Union[List[int], torch.LongTensor, torch.Tensor, BatchEncoding]
 
 tp_grain_size = 64
-if args.ipex_weight_only_quantization and args.low_precision_checkpoint != "":
-    pathname = args.low_precision_checkpoint
-    assert os.path.exists(pathname), f"Checkpoint file does not exist: {pathname}"
-    if os.path.isdir(pathname):
-        try:
-            with open(pathname + "/config.json") as f:
-                quant_model_config = json.load(f)
-            tp_grain_size = int(
-                quant_model_config["quantization_config"]["group_size"]
-            )
-        except Exception as e:
-            print("Failed to get group_size from config.json")
-    elif args.group_size > 0:
-        tp_grain_size = args.group_size
-    else:
-        print(
-            "Warning: cannot get group_size from config.json or --group-size, "
-            "using default value 64 for tp_grain_size"
-        )
+ds_init_inf_kwargs = {}
+# Need to check if this attr is available. Old DeepSpeed does not have it.
+if "tp_grain_size" in dir(deepspeed.inference.config.DeepSpeedTPConfig()):
+    if args.ipex_weight_only_quantization and args.low_precision_checkpoint != "":
+        pathname = args.low_precision_checkpoint
+        assert os.path.exists(pathname), f"Checkpoint file does not exist: {pathname}"
+        if os.path.isdir(pathname):
+            try:
+                with open(pathname + "/config.json") as f:
+                    quant_model_config = json.load(f)
+                tp_grain_size = int(
+                    quant_model_config["quantization_config"]["group_size"]
+                )
+            except Exception as e:
+                print("Failed to get group_size from config.json")
+        elif args.group_size > 0:
+            tp_grain_size = args.group_size
+        else:
+            print(
+                "Warning: cannot get group_size from config.json or --group-size, "
+                "using default value 64 for tp_grain_size"
+            )
+    ds_init_inf_kwargs.update(
+        {
+            "tensor_parallel": deepspeed.inference.config.DeepSpeedTPConfig(
+                tp_grain_size=tp_grain_size
+            )
+        }
+    )
 
 
 class HuggingFaceModel(BaseLM):
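
For reference, a condensed, hedged sketch of how the script derives tp_grain_size from an INT4/WOQ checkpoint directory, factored into a helper; the helper name read_tp_grain_size is invented here, but the config.json layout and the fallback of 64 come from the diff above:

    import json
    import os

    def read_tp_grain_size(checkpoint_dir, default=64):
        # The quantized checkpoint directory is expected to carry a config.json
        # whose "quantization_config" section holds the WOQ group size.
        cfg_path = os.path.join(checkpoint_dir, "config.json")
        try:
            with open(cfg_path) as f:
                cfg = json.load(f)
            return int(cfg["quantization_config"]["group_size"])
        except Exception:
            # Fall back to the default grain size when config.json is missing or malformed.
            print("Failed to get group_size from config.json")
            return default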
@@ -420,9 +430,7 @@ def write_checkpoints_json():
             base_dir=repo_root,
             dtype=infer_dtype,
             checkpoint=checkpoints_json,
-            tensor_parallel=deepspeed.inference.config.DeepSpeedTPConfig(
-                tp_grain_size=tp_grain_size
-            ),
+            **ds_init_inf_kwargs,
         )
 
         self.model = self.model.module
@@ -1495,9 +1503,7 @@ def write_checkpoints_json():
             base_dir=repo_root,
             dtype=infer_dtype,
             checkpoint=checkpoints_json,
-            tensor_parallel=deepspeed.inference.config.DeepSpeedTPConfig(
-                tp_grain_size=tp_grain_size
-            ),
+            **ds_init_inf_kwargs,
         )
 
         self._model = self._model.module
@@ -2263,9 +2269,7 @@ def write_checkpoints_json():
             base_dir=repo_root,
             dtype=infer_dtype,
             checkpoint=checkpoints_json,
-            tensor_parallel=deepspeed.inference.config.DeepSpeedTPConfig(
-                tp_grain_size=tp_grain_size
-            ),
+            **ds_init_inf_kwargs,
         )
 
         self.model = self.model.module
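
The three call sites above now splat ds_init_inf_kwargs instead of always passing tensor_parallel, so the script keeps working on older DeepSpeed builds whose DeepSpeedTPConfig has no tp_grain_size field. A minimal, hedged sketch of the pattern (the surrounding model and checkpoint arguments are omitted):

    import deepspeed

    tp_grain_size = 64
    ds_init_inf_kwargs = {}

    # Older DeepSpeed releases do not expose tp_grain_size on DeepSpeedTPConfig,
    # so only pass tensor_parallel when the field actually exists.
    if "tp_grain_size" in dir(deepspeed.inference.config.DeepSpeedTPConfig()):
        ds_init_inf_kwargs["tensor_parallel"] = deepspeed.inference.config.DeepSpeedTPConfig(
            tp_grain_size=tp_grain_size
        )

    # model = deepspeed.init_inference(model, dtype=..., checkpoint=..., **ds_init_inf_kwargs)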

examples/cpu/llm/inference/distributed/run_generation_with_deepspeed.py

Lines changed: 37 additions & 23 deletions
@@ -486,35 +486,41 @@ def write_checkpoints_json():
 dist.barrier()
 
 tp_grain_size = 64
-if args.ipex_weight_only_quantization and args.low_precision_checkpoint != "":
-    pathname = args.low_precision_checkpoint
-    assert os.path.exists(pathname), f"Checkpoint file does not exist: {pathname}"
-    if os.path.isdir(pathname):
-        try:
-            with open(pathname + "/config.json") as f:
-                quant_model_config = json.load(f)
-            tp_grain_size = int(
-                quant_model_config["quantization_config"]["group_size"]
-            )
-        except Exception as e:
-            print("Failed to get group_size from config.json")
-    elif args.group_size > 0:
-        tp_grain_size = args.group_size
-    else:
-        print(
-            "Warning: cannot get group_size from config.json or --group-size, "
-            "using default value 64 for tp_grain_size"
-        )
+# Need to check if this attr is available. Old DeepSpeed does not have it.
+if "tp_grain_size" in dir(deepspeed.inference.config.DeepSpeedTPConfig()):
+    if args.ipex_weight_only_quantization and args.low_precision_checkpoint != "":
+        pathname = args.low_precision_checkpoint
+        assert os.path.exists(pathname), f"Checkpoint file does not exist: {pathname}"
+        if os.path.isdir(pathname):
+            try:
+                with open(pathname + "/config.json") as f:
+                    quant_model_config = json.load(f)
+                tp_grain_size = int(
+                    quant_model_config["quantization_config"]["group_size"]
+                )
+            except Exception as e:
+                print("Failed to get group_size from config.json")
+        elif args.group_size > 0:
+            tp_grain_size = args.group_size
+        else:
+            print(
+                "Warning: cannot get group_size from config.json or --group-size, "
+                "using default value 64 for tp_grain_size"
+            )
+    kwargs.update(
+        {
+            "tensor_parallel": deepspeed.inference.config.DeepSpeedTPConfig(
+                tp_grain_size=tp_grain_size
+            )
+        }
+    )
 
 model = deepspeed.init_inference(
     model,
     mp_size=world_size,
     base_dir=repo_root,
     dtype=infer_dtype,
     checkpoint=checkpoints_json,
-    tensor_parallel=deepspeed.inference.config.DeepSpeedTPConfig(
-        tp_grain_size=tp_grain_size
-    ),
     **kwargs,
 )
 

@@ -659,6 +665,8 @@ def write_checkpoints_json():
 )
 if low_precision_checkpoint is not None:
     num_heads = model.config.num_attention_heads
+    if hasattr(model.config, "num_key_value_heads"):
+        num_heads = model.config.num_key_value_heads
     rank = local_rank
 
     mha_layers_split_by_N = [
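
This is the "split mha according to kv_heads" part of the fix: for grouped-query-attention models the checkpoint tensors are sharded along KV heads rather than query heads. A toy, hedged illustration (the 12/4 head counts are assumed for a Falcon3-7B-like config, not taken from the diff):

    from types import SimpleNamespace

    # Stand-in for model.config of a GQA model; values are illustrative only.
    config = SimpleNamespace(num_attention_heads=12, num_key_value_heads=4)

    num_heads = config.num_attention_heads
    if hasattr(config, "num_key_value_heads"):
        # Prefer the KV-head count when the config defines it, as the fix does.
        num_heads = config.num_key_value_heads
    print(num_heads)  # 4 -> tensors are split along KV-head boundaries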
@@ -712,7 +720,13 @@ def write_checkpoints_json():
             # awq qweight: [K, N // 8]
             # awq scales: [K // G, N]
             # awq qzeros: [K // G, N // 8]
-            dim = data.shape[-1] // head_range[-1]
+            if data.shape[-1] % head_range[-1] == 0:
+                dim = data.shape[-1] // head_range[-1]
+            else:
+                assert data.shape[-1] % world_size == 0
+                dim = data.shape[-1] // world_size
+                q_head_start = local_rank
+                q_head_end = local_rank + 1
             low_precision_checkpoint_dict[key] = data[
                 :, q_head_start * dim : q_head_end * dim
             ]
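
The new branch handles checkpoints whose packed N dimension is not an even multiple of head_range[-1], as can happen for Falcon3-7b int4 with TP=3, by falling back to an even per-rank split. A runnable toy example of that slice arithmetic with assumed shapes (96 columns, 3 ranks, a head count of 5 that does not divide 96):

    import torch

    world_size = 3
    local_rank = 1

    # Toy stand-in for an AWQ-packed tensor whose last (N) dimension is 96.
    data = torch.arange(4 * 96).reshape(4, 96)
    heads_total = 5  # assumed head_range[-1] that does not divide 96 evenly

    if data.shape[-1] % heads_total == 0:
        dim = data.shape[-1] // heads_total
        q_head_start, q_head_end = 0, heads_total  # real head range comes from head_range
    else:
        # Fallback from the diff: shard the packed dimension evenly across TP ranks.
        assert data.shape[-1] % world_size == 0
        dim = data.shape[-1] // world_size         # 96 // 3 = 32 columns per rank
        q_head_start, q_head_end = local_rank, local_rank + 1

    shard = data[:, q_head_start * dim : q_head_end * dim]
    print(shard.shape)  # torch.Size([4, 32]) on rank 1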
