
Commit f1aa2de

E5M2 KVCache for Pure IPEX (#3375)
* add ut; ut pass w/o mul_attenion_weights_and_value_of_head optimization
* enable mul_attenion_weights_and_value_of_head optimization
* Change frontend interface partially and need to be continued
* Change beam_search related frontend interfaces
* refine code
* fix flake
* refine code
* Improve code style
* clean comment
* fix ut
* Compatible with old code
* improve code style
* get kv_cache_dtype by model.config
* temp remove duplicate deq
* enable e5m2 on deepspeed
* add assert
* delete kv_cache_dtype on ipex.llm.optimize interface
* improve code style; revert 6df5ed5007487b52b8366681746b3b22827e8940
* improve code style
1 parent 3e27750 commit f1aa2de
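
In outline, the change threads the new KV cache dtype through three layers: a --kv-cache-dtype flag in the example inference scripts, a kv_cache_dtype attribute attached to the Hugging Face model config, and dtype-aware initialization of the placeholder past_key_values in the beam/greedy generation frontends (plus the C++ MaskedMultiHeadAttention kernel, whose diff is not rendered below). A minimal standalone sketch of the flag-to-dtype mapping used by the scripts, assuming PyTorch 2.1+ where torch.float8_e5m2 is available; the real parsers define many more options:

import argparse

import torch

# Reduced to the single new option for illustration.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--kv-cache-dtype",
    type=str,
    choices=["auto", "fp8_e5m2"],
    default="auto",
    help='Data type for kv cache storage. If "auto", will use model data type.',
)
args = parser.parse_args(["--kv-cache-dtype", "fp8_e5m2"])

# "auto" maps to None (keep the model data type); "fp8_e5m2" selects the
# 8-bit float format with 5 exponent bits and 2 mantissa bits.
kv_cache_dtype = None if args.kv_cache_dtype == "auto" else torch.float8_e5m2
print(kv_cache_dtype)  # torch.float8_e5m2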

File tree

12 files changed: +1290, -129 lines changed


csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp

Lines changed: 387 additions & 58 deletions
Large diffs are not rendered by default.

examples/cpu/llm/inference/distributed/run_generation_with_deepspeed.py

Lines changed: 19 additions & 0 deletions
@@ -197,6 +197,17 @@
     help="Quantize weight symmetrically for weight only quantization. It usually brings better latency at"
     " the cost of accuracy. It has not effect if you are loading low-precision checkpoints.",
 )
+parser.add_argument(
+    "--kv-cache-dtype",
+    type=str,
+    choices=[
+        "auto",
+        "fp8_e5m2",
+    ],
+    default="auto",
+    help='Data type for kv cache storage. If "auto", will use model '
+    "data type. fp8 type now supports e5m2.",
+)
 parser.add_argument(
     "--low-precision-checkpoint",
     default="",
@@ -206,6 +217,7 @@
     " quantization with INT4 weight.",
 )

+
 args = parser.parse_args()


@@ -350,6 +362,13 @@ def get_checkpoint_files(model_name_or_path):
     config = AutoConfig.from_pretrained(
         args.config_file, torchscript=True, trust_remote_code=True
     )
+
+if args.kv_cache_dtype == "auto":
+    kv_cache_dtype = None
+elif args.kv_cache_dtype == "fp8_e5m2":
+    kv_cache_dtype = torch.float8_e5m2
+config.kv_cache_dtype = kv_cache_dtype
+
 if not hasattr(config, "text_max_length") and args.prompt is None:
     config.text_max_length = int(args.input_tokens) + int(args.max_new_tokens)
 if model_type == "mpt" and args.prompt is None:

examples/cpu/llm/inference/run.py

Lines changed: 13 additions & 0 deletions
@@ -300,6 +300,17 @@ def main(args_in: Optional[List[str]] = None) -> None:
         " In other cases, this feature is always turned on regardless of this argument and it does not"
         " conflict with the accuracy test.",
     )
+    parser.add_argument(
+        "--kv-cache-dtype",
+        type=str,
+        choices=[
+            "auto",
+            "fp8_e5m2",
+        ],
+        default="auto",
+        help='Data type for kv cache storage. If "auto", will use model '
+        "data type. fp8 type now supports e5m2.",
+    )
     args = parser.parse_args(args_in)

     parent_path = Path(__file__).parent.absolute()
@@ -335,6 +346,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
         infer_cmd.extend(["--num-iter", str(args.num_iter)])
         infer_cmd.extend(["--num-warmup", str(args.num_warmup)])
         infer_cmd.extend(["--batch-size", str(args.batch_size)])
+        infer_cmd.extend(["--kv-cache-dtype", args.kv_cache_dtype])
         if args.vision_text_model:
             infer_cmd.extend(["--vision-text-model"])
         if args.greedy:
@@ -630,6 +642,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
         infer_cmd.extend(["--num-iter", str(args.num_iter)])
         infer_cmd.extend(["--num-warmup", str(args.num_warmup)])
         infer_cmd.extend(["--batch-size", str(args.batch_size)])
+        infer_cmd.extend(["--kv-cache-dtype", args.kv_cache_dtype])
         if args.local_rank is not None:
             infer_cmd.extend(["--local_rank", str(args.local_rank)])
         if args.greedy:

examples/cpu/llm/inference/single_instance/run_generation.py

Lines changed: 18 additions & 0 deletions
@@ -105,6 +105,17 @@
     action="store_true",
     help="whether or not it is vision-text multi-model structure",
 )
+parser.add_argument(
+    "--kv-cache-dtype",
+    type=str,
+    choices=[
+        "auto",
+        "fp8_e5m2",
+    ],
+    default="auto",
+    help='Data type for kv cache storage. If "auto", will use model '
+    "data type. fp8 type now supports e5m2.",
+)

 args = parser.parse_args()
 print(args)
@@ -154,6 +165,13 @@
     trust_remote_code=True,
     torch_dtype=amp_dtype,
 )
+
+if args.kv_cache_dtype == "auto":
+    kv_cache_dtype = None
+elif args.kv_cache_dtype == "fp8_e5m2":
+    kv_cache_dtype = torch.float8_e5m2
+config.kv_cache_dtype = kv_cache_dtype
+
 if not hasattr(config, "text_max_length") and args.prompt is None:
     config.text_max_length = int(args.input_tokens) + int(args.max_new_tokens)
 if model_type == "mpt" and args.prompt is None:
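
All three example scripts resolve the flag the same way and then hang the result on the model config (config.kv_cache_dtype) rather than passing it as a separate argument; per the commit message, the kv_cache_dtype parameter was deliberately removed from the ipex.llm.optimize interface, so the config is the only carrier. The practical motivation is memory: each cached key/value element shrinks from 2 bytes (fp16/bf16) to 1 byte in E5M2. A rough back-of-the-envelope sketch, using a hypothetical Llama-2-7B-like shape (32 layers, 32 heads, head_dim 128) chosen purely for illustration:

# Hypothetical shape for illustration only; not taken from this commit.
num_layers, num_heads, head_dim = 32, 32, 128
batch, seq_len = 1, 4096


def kv_cache_bytes(bytes_per_elem: int) -> int:
    # Factor of 2 covers the key and the value tensors.
    return 2 * num_layers * batch * seq_len * num_heads * head_dim * bytes_per_elem


print(kv_cache_bytes(2) / 2**30)  # fp16/bf16 cache: 2.0 GiB
print(kv_cache_bytes(1) / 2**30)  # fp8 e5m2 cache:  1.0 GiB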

intel_extension_for_pytorch/transformers/generation/beam_sample.py

Lines changed: 36 additions & 10 deletions
@@ -178,6 +178,12 @@ def _beam_sample(
             "Maira2ForConditionalGeneration",
         ]:
             first_token = False
+            if hasattr(self.config, "kv_cache_dtype"):
+                kv_cache_dtype = self.config.kv_cache_dtype
+            elif hasattr(self, "dtype"):
+                kv_cache_dtype = self.dtype
+            else:
+                kv_cache_dtype = torch.float
             if model_inputs["past_key_values"] is None:
                 first_token = True
                 if self.model_backbone == "T5ForConditionalGeneration":
@@ -189,8 +195,12 @@
                         [
                             (
                                 torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(),
-                                torch.zeros([1, 1, 1, 1]).contiguous(),
-                                torch.zeros([1, 1, 1, 1]).contiguous(),
+                                torch.zeros([1, 1, 1, 1])
+                                .contiguous()
+                                .to(kv_cache_dtype),
+                                torch.zeros([1, 1, 1, 1])
+                                .contiguous()
+                                .to(kv_cache_dtype),
                                 beam_idx_tmp,
                                 torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(),
                                 self.decoder.block[i]
@@ -247,10 +257,14 @@
                                 torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(),
                                 torch.zeros(
                                     [int(batch_size * num_beams), num_head, 1, head_dim]
-                                ).contiguous(),
+                                )
+                                .contiguous()
+                                .to(kv_cache_dtype),
                                 torch.zeros(
                                     [int(batch_size * num_beams), num_head, 1, head_dim]
-                                ).contiguous(),
+                                )
+                                .contiguous()
+                                .to(kv_cache_dtype),
                                 beam_idx_tmp,
                             )
                             for i in range(self.config.num_hidden_layers)
@@ -265,8 +279,12 @@
                         [
                             (
                                 torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(),
-                                torch.zeros([1, 1, 1, 1]).contiguous(),
-                                torch.zeros([1, 1, 1, 1]).contiguous(),
+                                torch.zeros([1, 1, 1, 1])
+                                .contiguous()
+                                .to(kv_cache_dtype),
+                                torch.zeros([1, 1, 1, 1])
+                                .contiguous()
+                                .to(kv_cache_dtype),
                                 beam_idx_tmp,
                                 torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(),
                                 self.model.decoder.layers[i]
@@ -324,8 +342,12 @@
                                     torch.zeros(
                                         1, 0, 0, 1, dtype=torch.long
                                     ).contiguous(),
-                                    torch.zeros([1, 1, 1, 1]).contiguous(),
-                                    torch.zeros([1, 1, 1, 1]).contiguous(),
+                                    torch.zeros([1, 1, 1, 1])
+                                    .contiguous()
+                                    .to(kv_cache_dtype),
+                                    torch.zeros([1, 1, 1, 1])
+                                    .contiguous()
+                                    .to(kv_cache_dtype),
                                     beam_idx_tmp,
                                 )
                                 if i
@@ -343,8 +365,12 @@
                         [
                             (
                                 torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(),
-                                torch.zeros([1, 1, 1, 1]).contiguous(),
-                                torch.zeros([1, 1, 1, 1]).contiguous(),
+                                torch.zeros([1, 1, 1, 1])
+                                .contiguous()
+                                .to(kv_cache_dtype),
+                                torch.zeros([1, 1, 1, 1])
+                                .contiguous()
+                                .to(kv_cache_dtype),
                                 beam_idx_tmp,
                             )
                             for i in range(num_hidden_layers)

intel_extension_for_pytorch/transformers/generation/beam_search.py

Lines changed: 42 additions & 12 deletions
@@ -206,6 +206,12 @@ def _beam_search(
         ]:
             first_token = False
             has_position_id = model_inputs.get("position_ids", None) is not None
+            if hasattr(self.config, "kv_cache_dtype"):
+                kv_cache_dtype = self.config.kv_cache_dtype
+            elif hasattr(self, "dtype"):
+                kv_cache_dtype = self.dtype
+            else:
+                kv_cache_dtype = torch.float
             if model_inputs["past_key_values"] is None:
                 first_token = True
                 if self.model_backbone == "T5ForConditionalGeneration":
@@ -217,8 +223,12 @@
                         [
                             (
                                 torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(),
-                                torch.zeros([1, 1, 1, 1]).contiguous(),
-                                torch.zeros([1, 1, 1, 1]).contiguous(),
+                                torch.zeros([1, 1, 1, 1])
+                                .contiguous()
+                                .to(kv_cache_dtype),
+                                torch.zeros([1, 1, 1, 1])
+                                .contiguous()
+                                .to(kv_cache_dtype),
                                 beam_idx_tmp,
                                 torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(),
                                 self.decoder.block[i]
@@ -275,10 +285,14 @@
                                 torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(),
                                 torch.zeros(
                                     [int(batch_size * num_beams), num_head, 1, head_dim]
-                                ).contiguous(),
+                                )
+                                .contiguous()
+                                .to(kv_cache_dtype),
                                 torch.zeros(
                                     [int(batch_size * num_beams), num_head, 1, head_dim]
-                                ).contiguous(),
+                                )
+                                .contiguous()
+                                .to(kv_cache_dtype),
                                 beam_idx_tmp,
                             )
                             for i in range(self.config.num_hidden_layers)
@@ -293,8 +307,12 @@
                        [
                             (
                                 torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(),
-                                torch.zeros([1, 1, 1, 1]).contiguous(),
-                                torch.zeros([1, 1, 1, 1]).contiguous(),
+                                torch.zeros([1, 1, 1, 1])
+                                .contiguous()
+                                .to(kv_cache_dtype),
+                                torch.zeros([1, 1, 1, 1])
+                                .contiguous()
+                                .to(kv_cache_dtype),
                                 beam_idx_tmp,
                                 torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(),
                                 self.model.decoder.layers[i]
@@ -353,15 +371,23 @@
                                     torch.zeros(
                                         1, 0, 0, 1, dtype=torch.long
                                     ).contiguous(),
-                                    torch.zeros([1, 1, 1, 1]).contiguous(),
-                                    torch.zeros([1, 1, 1, 1]).contiguous(),
+                                    torch.zeros([1, 1, 1, 1])
+                                    .contiguous()
+                                    .to(kv_cache_dtype),
+                                    torch.zeros([1, 1, 1, 1])
+                                    .contiguous()
+                                    .to(kv_cache_dtype),
                                     beam_idx_tmp,
                                 )
                                 if i
                                 not in self.config.text_config.cross_attention_layers
                                 else (
-                                    torch.zeros([1, 1, 1, head_dim]).contiguous(),
-                                    torch.zeros([1, 1, 1, head_dim]).contiguous(),
+                                    torch.zeros([1, 1, 1, head_dim])
+                                    .contiguous()
+                                    .to(kv_cache_dtype),
+                                    torch.zeros([1, 1, 1, head_dim])
+                                    .contiguous()
+                                    .to(kv_cache_dtype),
                                 )
                             )
                             for i in range(num_hidden_layers)
@@ -372,8 +398,12 @@
                        [
                             (
                                 torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(),
-                                torch.zeros([1, 1, 1, 1]).contiguous(),
-                                torch.zeros([1, 1, 1, 1]).contiguous(),
+                                torch.zeros([1, 1, 1, 1])
+                                .contiguous()
+                                .to(kv_cache_dtype),
+                                torch.zeros([1, 1, 1, 1])
+                                .contiguous()
+                                .to(kv_cache_dtype),
                                 beam_idx_tmp,
                             )
                             for i in range(num_hidden_layers)

intel_extension_for_pytorch/transformers/generation/greedy_search.py

Lines changed: 36 additions & 14 deletions
@@ -170,6 +170,12 @@ def _greedy_search(
             "Maira2ForConditionalGeneration",
         ]:
             first_token = False
+            if hasattr(self.config, "kv_cache_dtype"):
+                kv_cache_dtype = self.config.kv_cache_dtype
+            elif hasattr(self, "dtype"):
+                kv_cache_dtype = self.dtype
+            else:
+                kv_cache_dtype = torch.float
             input_bs = input_ids.size()[0]
             if model_inputs["past_key_values"] is None:
                 first_token = True
@@ -182,8 +188,12 @@
                     [
                         (
                             torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(),
-                            torch.zeros([1, 1, 1, 1]).contiguous(),
-                            torch.zeros([1, 1, 1, 1]).contiguous(),
+                            torch.zeros([1, 1, 1, 1])
+                            .contiguous()
+                            .to(kv_cache_dtype),
+                            torch.zeros([1, 1, 1, 1])
+                            .contiguous()
+                            .to(kv_cache_dtype),
                             beam_idx_tmp,
                             torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(),
                             self.decoder.block[i]
@@ -232,8 +242,12 @@
                    [
                         (
                             torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(),
-                            torch.zeros([1, 1, 1, 1]).contiguous(),
-                            torch.zeros([1, 1, 1, 1]).contiguous(),
+                            torch.zeros([1, 1, 1, 1])
+                            .contiguous()
+                            .to(kv_cache_dtype),
+                            torch.zeros([1, 1, 1, 1])
+                            .contiguous()
+                            .to(kv_cache_dtype),
                             beam_idx_tmp,
                             torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(),
                             self.model.decoder.layers[i]
@@ -291,12 +305,12 @@
                    [
                         (
                             torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(),
-                            torch.zeros(
-                                [input_bs, num_head, 1, head_dim]
-                            ).contiguous(),
-                            torch.zeros(
-                                [input_bs, num_head, 1, head_dim]
-                            ).contiguous(),
+                            torch.zeros([input_bs, num_head, 1, head_dim])
+                            .contiguous()
+                            .to(kv_cache_dtype),
+                            torch.zeros([input_bs, num_head, 1, head_dim])
+                            .contiguous()
+                            .to(kv_cache_dtype),
                             beam_idx_tmp,
                         )
                         for i in range(num_hidden_layers)
@@ -314,8 +328,12 @@
                                 torch.zeros(
                                     1, 0, 0, 1, dtype=torch.long
                                 ).contiguous(),
-                                torch.zeros([1, 1, 1, 1]).contiguous(),
-                                torch.zeros([1, 1, 1, 1]).contiguous(),
+                                torch.zeros([1, 1, 1, 1])
+                                .contiguous()
+                                .to(kv_cache_dtype),
+                                torch.zeros([1, 1, 1, 1])
+                                .contiguous()
+                                .to(kv_cache_dtype),
                                 beam_idx_tmp,
                             )
                             if i
@@ -333,8 +351,12 @@
                    [
                         (
                             torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(),
-                            torch.zeros([1, 1, 1, 1]).contiguous(),
-                            torch.zeros([1, 1, 1, 1]).contiguous(),
+                            torch.zeros([1, 1, 1, 1])
+                            .contiguous()
+                            .to(kv_cache_dtype),
+                            torch.zeros([1, 1, 1, 1])
+                            .contiguous()
+                            .to(kv_cache_dtype),
                             beam_idx_tmp,
                         )
                         for i in range(num_hidden_layers)
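
The three generation frontends (_beam_sample, _beam_search, _greedy_search) all resolve the cache dtype with the same fallback chain: config.kv_cache_dtype if the attribute exists, else the model dtype, else float32, and then allocate the placeholder past_key_values tensors in that dtype. A condensed standalone sketch of the pattern; the helper names are mine, and the beam_idx_tmp shape is an assumption, not taken from this diff:

import torch


def resolve_kv_cache_dtype(model):
    # Mirrors the fallback added above; the extra None check only keeps the
    # sketch self-contained when "auto" left config.kv_cache_dtype as None.
    if getattr(model.config, "kv_cache_dtype", None) is not None:
        return model.config.kv_cache_dtype
    if hasattr(model, "dtype"):
        return model.dtype
    return torch.float


def make_dummy_layer_past(kv_cache_dtype, input_bs=1, num_head=1, head_dim=1):
    # One per-layer entry shaped like the tuples built in the diffs above:
    # (offset placeholder, key, value, beam index placeholder).
    beam_idx_tmp = torch.zeros(2048, dtype=torch.long)  # size is an assumption
    return (
        torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(),
        torch.zeros([input_bs, num_head, 1, head_dim]).contiguous().to(kv_cache_dtype),
        torch.zeros([input_bs, num_head, 1, head_dim]).contiguous().to(kv_cache_dtype),
        beam_idx_tmp,
    )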
