Fix the WOQ-INT4 crash when the pre-allocated buffer is too small (#3079)

liangan1 · web-flow · commit e74d7a97186e · 2024-07-15T16:05:12.000+08:00
diff --git a/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp b/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp
@@ -4,8 +4,8 @@
 #include <torch/all.h>
 #include <torch/csrc/autograd/function.h>
 #include <limits>
-#include "vec/vec.h"
 #include "../../utils/isa_utils.h"
+#include "vec/vec.h"
 
 namespace torch_ipex {
 namespace cpu {
@@ -1346,7 +1346,8 @@ first_token_masked_mha(
   auto attn_outputs = at::Tensor();
   auto attn_weights = at::Tensor();
   if ((key.scalar_type() == at::kFloat || key.scalar_type() == at::kBFloat16 ||
-       (key.scalar_type() == at::kHalf && utils::isa_has_avx512_fp16_support())) &&
+       (key.scalar_type() == at::kHalf &&
+        utils::isa_has_avx512_fp16_support())) &&
       attention_mask.stride(-1) == 1) {
     query = query.transpose(1, 2);
     key = key.transpose(1, 2);
@@ -1447,27 +1448,26 @@ masked_multihead_self_attention_kernel_impl(
         query.size(0); // record the promt bs info
 
   } else if (offset > 0 && offset + cur_len > cache_size) {
-    auto new_cache_size = cache_size * 2 + 2;
+    auto new_cache_size = cache_size * 2;
     auto new_key_cache = at::empty(
         {new_cache_size, beam_batch, key.size(2), key.size(3)}, key.options());
     auto new_value_cache = at::empty(
         {new_cache_size, beam_batch, value.size(2), value.size(3)},
         value.options());
     auto new_beam_idx =
-        at::zeros({new_cache_size, beam_batch}, beam_idx.options());
+        at::zeros({new_cache_size + 2, beam_batch}, beam_idx.options());
     new_key_cache.slice(0, 0, cache_size).copy_(key_cache);
     new_value_cache.slice(0, 0, cache_size).copy_(value_cache);
-    new_beam_idx.slice(0, 0, cache_size).copy_(beam_idx);
+    new_beam_idx.slice(0, 0, cache_size + 2).copy_(beam_idx);
     auto new_beam_idx_access = new_beam_idx.accessor<long, 2>();
     auto beam_idx_access = beam_idx.accessor<long, 2>();
     for (auto i = offset; i < new_cache_size; i++) {
       for (auto j = 0; j < beam_batch; j++) {
         new_beam_idx_access[i][j] = beam_idx_access[0][j];
       }
     }
-    new_beam_idx_access[new_cache_size - 2][0] =
-        beam_idx_access[cache_size - 2][0];
-    new_beam_idx_access[new_cache_size - 1][0] =
+    new_beam_idx_access[new_cache_size][0] = beam_idx_access[cache_size - 2][0];
+    new_beam_idx_access[new_cache_size + 1][0] =
         beam_idx_access[cache_size - 1][0];
     key_cache = new_key_cache;
     value_cache = new_value_cache;
diff --git a/examples/cpu/inference/python/llm/single_instance/run_quantization.py b/examples/cpu/inference/python/llm/single_instance/run_quantization.py
@@ -405,7 +405,13 @@ def load_image(image_file):
 
 num_beams = 1 if args.greedy else 4
 if not hasattr(config, "text_max_length") and args.prompt is None:
-    config.text_max_length = int(args.input_tokens) + int(args.max_new_tokens)
+    if not args.benchmark:
+        if hasattr(config, "max_position_embeddings"):
+            config.text_max_length = config.max_position_embeddings
+        else:
+            config.text_max_length = 2048
+    else:
+        config.text_max_length = int(args.input_tokens) + int(args.max_new_tokens)
 if model.name == "mpt" and not hasattr(config, "max_seq_len") and args.prompt is None:
     config.max_seq_len = int(args.input_tokens) + int(args.max_new_tokens)
 if model.name in ["git", "llava"]:
@@ -416,6 +422,7 @@ def load_image(image_file):
 if args.lm_head_generation and not hasattr(config, "lm_head_generation"):
     config.lm_head_generation = True
 
+
 user_model = model.get_user_model(config, args.benchmark)
 
 tokenizer = model.get_tokenizer()