
Commit 99aa54f

TPP LM head slice for generation and kernel parallel (#2253)
* lm head parallel
* lm head slice for generation
1 parent 1d5e83d commit 99aa54f

File tree: 4 files changed, +52 −4 lines
csrc/cpu/tpp/kernels/TPPGEMMKrnl.h

Lines changed: 13 additions & 2 deletions
@@ -92,6 +92,14 @@ inline void tpp_linear_bias(
   auto in_sizes = t_in.sizes();
   auto wt_sizes = t_wt.sizes();
   auto BS = in_sizes[0] * in_sizes[1];
+  if (BS > FT_OPT_SIZE) { // first token compute
+    if (wt_sizes[3] != 100) {
+      t_wt = wt_tensor_for_first_token<T>(t_wt);
+      wt_sizes = t_wt.sizes();
+    }
+    large_cache_opt = true;
+  }
+
   auto C = in_sizes[2];

   auto Nc = wt_sizes[1];

@@ -169,11 +177,14 @@ inline void tpp_linear_no_bias(
     at::Tensor& t_out) {
   auto in_sizes = t_in.sizes();
   auto BS = in_sizes[0] * in_sizes[1];
+  auto wt_sizes = t_wt.sizes();
   if (BS > FT_OPT_SIZE) { // first token compute
-    t_wt = wt_tensor_for_first_token<T>(t_wt);
+    if (wt_sizes[3] != 100) {
+      t_wt = wt_tensor_for_first_token<T>(t_wt);
+      wt_sizes = t_wt.sizes();
+    }
     large_cache_opt = true;
   }
-  auto wt_sizes = t_wt.sizes();
   auto C = in_sizes[2];

   auto Nc = wt_sizes[1];
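In both tpp_linear_bias and tpp_linear_no_bias, the first-token (prefill) path now repacks the weight with wt_tensor_for_first_token<T> only when the innermost block dimension is not already 100, so a weight that was converted on an earlier call is reused instead of being repacked again. A rough Python restatement of that control flow, for orientation only (the FT_OPT_SIZE value and the repack helper are assumptions; the actual logic lives in the C++ header above):

# Conceptual sketch of the guard above; not the kernel itself.
FT_OPT_SIZE = 256  # assumed threshold; the real constant is defined in the C++ sources

def maybe_repack_for_first_token(t_in, t_wt, repack_fn):
    # t_wt is assumed blocked as [Nk, Nc, Hc, Hk]; Hk == 100 marks the
    # first-token layout produced by repack_fn (wt_tensor_for_first_token).
    bs = t_in.shape[0] * t_in.shape[1]   # batch * sequence length
    large_cache_opt = False
    if bs > FT_OPT_SIZE:                 # first-token / prefill compute
        if t_wt.shape[3] != 100:         # skip the repack if already converted
            t_wt = repack_fn(t_wt)
        large_cache_opt = True
    return t_wt, large_cache_opt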

examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py

Lines changed: 3 additions & 0 deletions
@@ -245,6 +245,9 @@ def get_checkpoint_files(model_name_or_path):
 if not hasattr(config, "text_max_length") and args.prompt is None:
     config.text_max_length = int(args.input_tokens) + int(args.max_new_tokens)

+if not hasattr(config, "lm_head_generation"):
+    config.lm_head_generation = True
+
 # XXX: can't automatically derive dtype via config's `from_pretrained`
 # dtype = torch.bfloat16 if model_name in ["bigscience/bloom", "bigscience/bigscience-small-testing"] else torch.float16
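This script (and run_generation.py below) defaults lm_head_generation to True on the Hugging Face config before the model is built, which is what the patched forward functions later check. A minimal standalone sketch of the same pattern (the model id is a placeholder):

from transformers import AutoConfig

config = AutoConfig.from_pretrained("facebook/opt-1.3b")  # placeholder model id

# Mirrors the script change: enable LM-head slicing for generation unless the
# config already carries an explicit lm_head_generation setting.
if not hasattr(config, "lm_head_generation"):
    config.lm_head_generation = True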

examples/cpu/inference/python/llm/single_instance/run_generation.py

Lines changed: 4 additions & 0 deletions
@@ -98,6 +98,10 @@
 )
 if not hasattr(config, "text_max_length") and args.prompt is None:
     config.text_max_length = int(args.input_tokens) + int(args.max_new_tokens)
+
+if not hasattr(config, "lm_head_generation"):
+    config.lm_head_generation = True
+
 model = model_class[0].from_pretrained(
     args.model_id,
     torch_dtype=amp_dtype,

intel_extension_for_pytorch/transformers/models/reference/models.py

Lines changed: 32 additions & 2 deletions
@@ -47,6 +47,14 @@ def GPTJForCausalLM_forward(
         torch.cuda.set_device(self.transformer.first_device)
         hidden_states = hidden_states.to(self.lm_head.weight.device)

+    if (
+        hasattr(self, "config")
+        and hasattr(self.config, "lm_head_generation")
+        and self.config.lm_head_generation
+        and hidden_states.size(1) != 1
+    ):
+        hidden_states = hidden_states[:, -1:, :]
+
     # make sure sampling in fp16 works correctly and
     # compute loss in fp32 to match with mesh-tf version
     # https://github.com/EleutherAI/gpt-neo/blob/89ce74164da2fb16179106f54e2269b5da8db333/models/gpt2/gpt2.py#L179

@@ -119,6 +127,14 @@ def LlamaForCausalLM_forward(
     )

     hidden_states = outputs[0]
+    if (
+        hasattr(self, "config")
+        and hasattr(self.config, "lm_head_generation")
+        and self.config.lm_head_generation
+        and hidden_states.size(1) != 1
+    ):
+        hidden_states = hidden_states[:, -1:, :]
+
     logits = self.lm_head(hidden_states)

     loss = None

@@ -178,6 +194,13 @@ def GPTNeoXForCausalLM_forward(
     )

     hidden_states = outputs[0]
+    if (
+        hasattr(self, "config")
+        and hasattr(self.config, "lm_head_generation")
+        and self.config.lm_head_generation
+        and hidden_states.size(1) != 1
+    ):
+        hidden_states = hidden_states[:, -1:, :]
     lm_logits = self.embed_out(hidden_states)

     lm_loss = None

@@ -244,8 +267,15 @@ def OPTForCausalLM_forward(
         output_hidden_states=output_hidden_states,
         return_dict=return_dict,
     )
-
-    logits = self.lm_head(outputs[0]).contiguous()
+    hidden_states = outputs[0]
+    if (
+        hasattr(self, "config")
+        and hasattr(self.config, "lm_head_generation")
+        and self.config.lm_head_generation
+        and hidden_states.size(1) != 1
+    ):
+        hidden_states = hidden_states[:, -1:, :]
+    logits = self.lm_head(hidden_states).contiguous()

     loss = None
     if labels is not None:
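During the first-token (prefill) pass only the last position's logits are needed to pick the next token, so each patched forward slices hidden_states to its final position before the LM-head projection; the hidden_states.size(1) != 1 guard leaves single-token decode steps untouched. A self-contained sketch of the effect (shapes are illustrative):

import torch

batch, seq, hidden, vocab = 1, 1024, 4096, 32000
hidden_states = torch.randn(batch, seq, hidden)
lm_head = torch.nn.Linear(hidden, vocab, bias=False)

# Without slicing, every prompt position goes through the vocab projection.
full_logits = lm_head(hidden_states)               # shape (1, 1024, 32000)

# With lm_head_generation, only the last position is projected during prefill;
# sampling the next token only ever reads the logits at that position.
sliced_logits = lm_head(hidden_states[:, -1:, :])  # shape (1, 1, 32000)

assert torch.allclose(full_logits[:, -1:, :], sliced_logits, atol=1e-5)

For a prompt of length S, this shrinks the prefill LM-head matmul by roughly a factor of S while leaving the generated tokens unchanged.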
