@@ -223,6 +223,27 @@ def get_int_from_env(env_keys, default):
 
 TokenSequence = Union[List[int], torch.LongTensor, torch.Tensor, BatchEncoding]
 
+tp_grain_size = 64
+if args.ipex_weight_only_quantization and args.low_precision_checkpoint != "":
+    pathname = args.low_precision_checkpoint
+    assert os.path.exists(pathname), f"Checkpoint file does not exist: {pathname}"
+    if os.path.isdir(pathname):
+        try:
+            with open(pathname + "/config.json") as f:
+                quant_model_config = json.load(f)
+            tp_grain_size = int(
+                quant_model_config["quantization_config"]["group_size"]
+            )
+        except Exception as e:
+            print("Failed to get group_size from config.json")
+    elif args.group_size > 0:
+        tp_grain_size = args.group_size
+    else:
+        print(
+            "Warning: cannot get group_size from config.json or --group-size, "
+            "using default value 64 for tp_grain_size"
+        )
+
 
 class HuggingFaceModel(BaseLM):
     _DEFAULT_MAX_LENGTH = 2048
@@ -399,6 +420,9 @@ def write_checkpoints_json():
             base_dir=repo_root,
             dtype=infer_dtype,
             checkpoint=checkpoints_json,
+            tensor_parallel=deepspeed.inference.config.DeepSpeedTPConfig(
+                tp_grain_size=tp_grain_size
+            ),
         )
 
         self.model = self.model.module
@@ -537,10 +561,13 @@ def write_checkpoints_json():
             num_heads = model.config.num_attention_heads
             rank = local_rank
 
-            layers_split_by_N = [
+            mha_layers_split_by_N = [
                 "q_proj",
                 "k_proj",
                 "v_proj",
+            ]
+            # mlp is split with grain size = tp_grain_size
+            mlp_layers_split_by_N = [
                 "gate_proj",
                 "up_proj",
                 "fc_in",
@@ -549,23 +576,26 @@ def write_checkpoints_json():
                 "w1",
                 "w3",
             ]
-            layers_split_by_K = [
+            mha_layers_split_by_K = [
                 "o_proj",
+                "out_proj",
+            ]
+            # mlp is split with grain size = tp_grain_size
+            mlp_layers_split_by_K = [
                 "down_proj",
                 "fc_out",
                 "fc2",
-                "out_proj",
                 "dense",
                 "dense_4h_to_h",
                 "w2",
             ]
+            # lm_head is split with grain size = tp_grain_size
             lm_head_layers = ["lm_head"]  # split by K but not quantized
             quantization_method = quant_model_config["quantization_config"][
                 "quant_method"
             ]
             head_range = [0]
             head_per_rank = num_heads // world_size
-
             for i in range(0, world_size):
                 head_this_rank = head_per_rank
                 if i < num_heads % world_size:
@@ -578,7 +608,7 @@ def write_checkpoints_json():
                 )
                 if "bias" in key:
                     continue
-                if any(substring in key for substring in layers_split_by_N):
+                if any(substring in key for substring in mha_layers_split_by_N):
                     data = low_precision_checkpoint_dict[key]
                     if quantization_method == "awq":
                         # awq qweight: [K, N // 8]
@@ -592,7 +622,48 @@ def write_checkpoints_json():
                         raise AssertionError(
                             f"{quantization_method} is not supported yet."
                         )
-                if any(substring in key for substring in layers_split_by_K):
+                elif any(
+                    substring in key for substring in mlp_layers_split_by_N
+                ):
+                    data = low_precision_checkpoint_dict[key]
+                    if quantization_method == "awq":
+                        # awq qweight: [K, N // 8]
+                        # awq scales: [K // G, N]
+                        # awq qzeros: [K // G, N // 8]
+                        if "scales" in key:
+                            assert (
+                                data.shape[1] % tp_grain_size == 0
+                            ), "N must be divisible by tp_grain_size"
+                            grains = data.shape[1] // tp_grain_size
+                            dim = tp_grain_size
+                        else:
+                            assert (
+                                data.shape[1] * 8
+                            ) % tp_grain_size == 0, (
+                                "N must be divisible by tp_grain_size"
+                            )
+                            grains = data.shape[1] // (tp_grain_size // 8)
+                            dim = tp_grain_size // 8
+                        grains_per_rank = grains // world_size
+                        grains_rem = grains % world_size
+                        grains_start = grains_per_rank * local_rank + min(
+                            local_rank, grains_rem
+                        )
+                        grains_end = (
+                            grains_start
+                            + grains_per_rank
+                            + (1 if local_rank < grains_rem else 0)
+                        )
+                        low_precision_checkpoint_dict[key] = data[
+                            :, grains_start * dim : grains_end * dim
+                        ]
+                    else:
+                        raise AssertionError(
+                            f"{quantization_method} is not supported yet."
+                        )
+                elif any(
+                    substring in key for substring in mha_layers_split_by_K
+                ):
                     data = low_precision_checkpoint_dict[key]
                     if quantization_method == "awq":
                         # awq qweight: [K, N // 8]
@@ -612,18 +683,61 @@ def write_checkpoints_json():
                         raise AssertionError(
                             f"{quantization_method} is not supported yet."
                         )
-                if any(substring in key for substring in lm_head_layers):
+                elif any(
+                    substring in key for substring in mlp_layers_split_by_K
+                ):
+                    data = low_precision_checkpoint_dict[key]
+                    if quantization_method == "awq":
+                        # awq qweight: [K, N // 8]
+                        # awq scales: [K // G, N]
+                        # awq qzeros: [K // G, N // 8]
+                        if "qweight" in key:
+                            assert (
+                                data.shape[0] % tp_grain_size == 0
+                            ), "K must be divisible by tp_grain_size"
+                            grains = data.shape[0] // tp_grain_size
+                            dim = tp_grain_size
+                        else:
+                            grains = data.shape[0]
+                            dim = 1
+                        grains_per_rank = grains // world_size
+                        grains_rem = grains % world_size
+                        grains_start = grains_per_rank * local_rank + min(
+                            local_rank, grains_rem
+                        )
+                        grains_end = (
+                            grains_start
+                            + grains_per_rank
+                            + (1 if local_rank < grains_rem else 0)
+                        )
+                        low_precision_checkpoint_dict[key] = data[
+                            grains_start * dim : grains_end * dim
+                        ]
+                    else:
+                        raise AssertionError(
+                            f"{quantization_method} is not supported yet."
+                        )
+                elif any(substring in key for substring in lm_head_layers):
                     # lm_head: [N, K] (not quantized)
                     # Same for both AWQ and GPTQ
                     data = low_precision_checkpoint_dict[key]
-                    if data.shape[-1] % head_range[-1] == 0:
-                        dim = data.shape[-1] // head_range[-1]
-                    else:
-                        dim = data.shape[-1] // world_size
-                    q_head_start = local_rank
-                    q_head_end = local_rank + 1
+                    assert (
+                        data.shape[1] % tp_grain_size == 0
+                    ), "K must be divisible by tp_grain_size"
+                    grains = data.shape[1] // tp_grain_size
+                    dim = tp_grain_size
+                    grains_per_rank = grains // world_size
+                    grains_rem = grains % world_size
+                    grains_start = grains_per_rank * local_rank + min(
+                        local_rank, grains_rem
+                    )
+                    grains_end = (
+                        grains_start
+                        + grains_per_rank
+                        + (1 if local_rank < grains_rem else 0)
+                    )
                     low_precision_checkpoint_dict[key] = data[
-                        :, q_head_start * dim : q_head_end * dim
+                        :, grains_start * dim : grains_end * dim
                     ]
             low_precision_checkpoint = (
                 low_precision_checkpoint_dict,
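Note: every branch above repeats the same grain-based partitioning pattern. For reference, a minimal standalone sketch of that pattern follows; the helper name and its arguments are illustrative and are not part of the patch. The split dimension is viewed as a sequence of blocks of tp_grain_size elements (tp_grain_size // 8 for int4-packed tensors), each rank takes a contiguous run of blocks, and the first grains % world_size ranks take one extra block so the remainder is absorbed evenly.

# Illustrative sketch only (hypothetical helper), mirroring the slicing logic in the diff.
def grain_slice(dim_size, tp_grain_size, local_rank, world_size):
    assert dim_size % tp_grain_size == 0, "dimension must be divisible by tp_grain_size"
    grains = dim_size // tp_grain_size
    grains_per_rank = grains // world_size
    grains_rem = grains % world_size
    # Ranks below grains_rem get one extra grain; the split stays contiguous.
    grains_start = grains_per_rank * local_rank + min(local_rank, grains_rem)
    grains_end = grains_start + grains_per_rank + (1 if local_rank < grains_rem else 0)
    return grains_start * tp_grain_size, grains_end * tp_grain_size

# Example: a 4096-wide dimension with tp_grain_size=64 across 3 ranks has 64 grains,
# so ranks 0, 1, 2 receive 22, 21, 21 grains respectively, covering the dimension exactly.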
@@ -1381,6 +1495,9 @@ def write_checkpoints_json():
             base_dir=repo_root,
             dtype=infer_dtype,
             checkpoint=checkpoints_json,
+            tensor_parallel=deepspeed.inference.config.DeepSpeedTPConfig(
+                tp_grain_size=tp_grain_size
+            ),
         )
 
         self._model = self._model.module
@@ -2146,6 +2263,9 @@ def write_checkpoints_json():
             base_dir=repo_root,
             dtype=infer_dtype,
             checkpoint=checkpoints_json,
+            tensor_parallel=deepspeed.inference.config.DeepSpeedTPConfig(
+                tp_grain_size=tp_grain_size
+            ),
         )
 
         self.model = self.model.module
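All three deepspeed.init_inference call sites now pass the resolved grain size through a DeepSpeedTPConfig. A rough usage sketch is below; variable names such as infer_dtype, checkpoints_json, and tp_grain_size come from the surrounding script, and the tp_grain_size field assumes a DeepSpeed build whose DeepSpeedTPConfig exposes it, as this patch relies on.

# Sketch only: how the patch threads tp_grain_size into DeepSpeed inference initialization.
import deepspeed

model = deepspeed.init_inference(
    model,                         # already-loaded HF model (assumed in scope)
    dtype=infer_dtype,             # inference dtype chosen by the script
    checkpoint=checkpoints_json,   # checkpoint manifest written by write_checkpoints_json()
    tensor_parallel=deepspeed.inference.config.DeepSpeedTPConfig(
        tp_grain_size=tp_grain_size  # assumes a DeepSpeed build exposing tp_grain_size
    ),
)
model = model.module  # unwrap the InferenceEngine to the underlying module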