
Commit abc4c4e

jianan-gu and MingxuZh authored
support qconfig summary load for llama static int8 (#2280)
* Update run_llama_quantization.py
* Update run_gpt-j_quantization.py
* Update run.py
* Update run_accuracy.py
* Update run_accuracy_with_deepspeed.py
* Update env_setup.sh
* Update README.md

---------

Co-authored-by: MingxuZh <[email protected]>
1 parent c561cd9 commit abc4c4e

File tree

7 files changed (+300, -278 lines)


examples/cpu/inference/python/llm/README.md

Lines changed: 9 additions & 8 deletions
@@ -124,17 +124,18 @@ OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py --benchmark -m meta-llama
 #### Static quantization (int8):
 ```bash
 # general command:
-OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <physical cores list> python run.py --benchmark -m <MODEL_ID> --ipex-smooth-quant --alpha <Tuned alpha for specific models> --output-dir "saved_results" --int8
-# For the best alpha values (range [0, 1.0], float) tuned for specific models, we verified good accuracy: "EleutherAI/gpt-j-6b" with alpha=1.0, "meta-llama/Llama-2-7b-chat-hf" with alpha=0.8.
-# For more recipes, please refer to https://github.com/intel/neural-compressor/blob/master/docs/source/smooth_quant.md#validated-models
-# Note: by default, we use "--int8" to run int8 mixed fp32 mode, while for peak performance of static quantization, please use "--int8-bf16-mixed" instead (may impact accuracy).
+OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <physical cores list> python run.py --benchmark -m <MODEL_ID> --ipex-smooth-quant --qconfig-summary-file <path to specific model qconfig> --output-dir "saved_results" --int8
+# We provide tuned qconfig recipe files for "meta-llama/Llama-2-7b-hf", "meta-llama/Llama-2-7b-chat-hf" and "EleutherAI/gpt-j-6b".
+# For other models, you can run your model_id with the IPEX default recipes by removing "--qconfig-summary-file <path to specific model qconfig>".
+# If the IPEX default recipes do not meet your accuracy requirements, please refer to https://github.com/intel/neural-compressor/blob/master/docs/source/smooth_quant.md#validated-models to tune more recipes.
+# Note: by default, we use "--int8" to run int8 mixed fp32 inference, while for the peak performance of static quantization, please use "--int8-bf16-mixed" instead (may impact accuracy).
 
 # An example of llama2 7b model:
-OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py --benchmark -m meta-llama/Llama-2-7b-chat-hf --ipex-smooth-quant --alpha 0.8 --output-dir "saved_results" --int8
+OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py --benchmark -m meta-llama/Llama-2-7b-hf --ipex-smooth-quant --qconfig-summary-file <path to meta-llama/Llama-2-7b-hf model qconfig> --output-dir "saved_results" --int8
 ```
 *Notes for all quantizations:*
 
-(1) for quantization benchmarks, the first runs will auto-generate the quantized model named "best_model.pt" in the "--output-dir" path, you can reuse these quantized models for inference-only benchmarks by adding "--quantized-model-path <output_dir + "best_model.pt">".
+(1) <a name="generation_sq">for all quantization benchmarks</a>, the first runs will auto-generate the quantized model named "best_model.pt" in the "--output-dir" path; you can reuse these quantized models for inference-only benchmarks by adding "--quantized-model-path <output_dir + "best_model.pt">". Specifically for static quantization, if "--qconfig-summary-file" is not used, a qconfig recipe will also be generated in the "--output-dir" path.
 
 (2) for Falcon quantizations, "--config-file <CONFIG_FILE>" is needed and example of <CONFIG_FILE>: "utils/model_config/tiiuae_falcon-40b_config.json".
 
@@ -245,11 +246,11 @@ OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <physical cores list
 OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run_accuracy.py --accuracy-only -m meta-llama/Llama-2-7b-hf --dtype bfloat16 --ipex --jit --tasks lambada_openai
 ```
 ### Quantizations:
+For the quantized models to be used in accuracy tests, we can reuse the model files named "best_model.pt" in the "--output-dir" path ([generated during inference performance tests](#generation_sq)).
 ```bash
 # general command:
-# For the quantized models to be used in accuracy tests, we can reuse the model files that are named "best_model.pt" in the "--output-dir" path (generated during inference performance tests).
 OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_accuracy.py --model <MODEL ID> --quantized-model-path "./saved_results/best_model.pt" --dtype int8 --accuracy-only --jit --tasks {TASK_NAME}
-# please also add "--int8-bf16-mixed" if your model is quantized with this flag
+# Please also add "--int8-bf16-mixed" if your model is quantized with this flag.
 
 # An example of llama2 7b model:
 OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run_accuracy.py -m meta-llama/Llama-2-7b-hf --quantized-model-path "./saved_results/best_model.pt" --dtype int8 --accuracy-only --jit --int8 --tasks lambada_openai
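For context on what the new "--qconfig-summary-file" flag feeds into, the sketch below outlines a typical IPEX SmoothQuant static-int8 flow: prepare the model with a SmoothQuant qconfig mapping, either load a tuned qconfig summary or calibrate and save one, then convert and JIT-trace. This is a minimal illustration, not the code from this commit; it assumes the `load_qconf_summary`/`save_qconf_summary` methods on the prepared model (as used in IPEX's SmoothQuant recipe-tuning docs), and `model`, `example_inputs`, `calib_dataloader`, and the output file names are placeholders.

```python
# Minimal sketch (not the commit's run_llama_quantization.py) of consuming/producing
# a qconfig summary with IPEX SmoothQuant static int8. Assumes IPEX >= 2.1;
# `model`, `example_inputs`, and `calib_dataloader` are placeholders.
import torch
import intel_extension_for_pytorch as ipex


def static_int8_quantize(model, example_inputs, calib_dataloader,
                         qconfig_summary_file="", alpha=0.5,
                         output_dir="./saved_results"):
    qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping(alpha=alpha)
    prepared = ipex.quantization.prepare(
        model.eval(), qconfig, example_inputs=example_inputs, inplace=True
    )

    if qconfig_summary_file:
        # Reuse a tuned recipe: load the per-op qconfig summary instead of calibrating.
        prepared.load_qconf_summary(qconf_summary=qconfig_summary_file)
    else:
        # No recipe given: calibrate on a few batches with the default recipe,
        # then save the generated summary so it can be reused later.
        with torch.no_grad():
            for i, batch in enumerate(calib_dataloader):
                prepared(*batch)
                if i == 7:
                    break
        prepared.save_qconf_summary(qconf_summary=f"{output_dir}/qconfig_summary.json")

    converted = ipex.quantization.convert(prepared)
    with torch.no_grad():
        traced = torch.jit.trace(converted, example_inputs, strict=False, check_trace=False)
        traced = torch.jit.freeze(traced)
    traced.save(f"{output_dir}/best_model.pt")
    return traced
```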

examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py

Lines changed: 0 additions & 3 deletions
@@ -582,7 +582,6 @@ def _model_call(
             enabled=True
             if args.int8_bf16_mixed or self._dtype == torch.bfloat16
             else False,
-            dtype=torch.bfloat16,
         ):
             if self._dtype != "int8":
                 if (
@@ -680,7 +679,6 @@ def _model_call(
             enabled=True
             if args.int8_bf16_mixed or self._dtype == torch.bfloat16
             else False,
-            dtype=torch.bfloat16,
         ):
             if self._with_jit:
                 output = self.model(
@@ -697,7 +695,6 @@ def _model_call(
             enabled=True
             if args.int8_bf16_mixed or self._dtype == torch.bfloat16
             else False,
-            dtype=torch.bfloat16,
         ):
             if self._with_jit:
                 output = self.model(
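The three deletions above (and the matching ones in single_instance/run_accuracy.py below) drop the explicit `dtype=torch.bfloat16` argument from the `torch.cpu.amp.autocast(...)` guard in `_model_call`. Since torch.bfloat16 is already the default dtype for CPU autocast, the bf16 path should behave the same; the simplified sketch below (placeholder names, not the exact class code) shows the resulting guard.

```python
# Simplified sketch of the autocast guard after this change; `int8_bf16_mixed` and
# `dtype` stand in for the real argparse flag and class attribute.
import torch


def model_call(model, inputs, int8_bf16_mixed: bool, dtype):
    enabled = int8_bf16_mixed or dtype == torch.bfloat16
    # torch.bfloat16 is the default CPU autocast dtype, so no explicit dtype is needed.
    with torch.cpu.amp.autocast(enabled=enabled):
        return model(*inputs)
```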

examples/cpu/inference/python/llm/run.py

Lines changed: 14 additions & 9 deletions
@@ -49,16 +49,16 @@ def main(args_in: Optional[List[str]] = None) -> None:
     parser.add_argument("--output-dir", nargs="?", default="./saved_results")
 
     # quantization related arguments.
-    parser.add_argument("--int8", action="store_true")
+    parser.add_argument("--int8", action="store_true", help="default static int8 path (fp32 mixed)")
     parser.add_argument(
         "--int8-bf16-mixed",
         action="store_true",
-        help="by default it is int8-fp32 mixed, to enable int8 mixed amp bf16 (work on platforms like SPR)",
+        help="by default static quant is int8-fp32 mixed, to enable int8 mixed amp bf16 (work on platforms like SPR)",
     )
-    parser.add_argument("--quantized-model-path", default="")
-
+    parser.add_argument("--quantized-model-path", default="", help="path to the quantized model file")
+    parser.add_argument("--qconfig-summary-file", default="", help="qconfig for static quantization")
     parser.add_argument("--dataset", nargs="?", default="NeelNanda/pile-10k")
-    parser.add_argument("--ipex-smooth-quant", action="store_true")
+    parser.add_argument("--ipex-smooth-quant", action="store_true", help="smoothquant for static quantization")
     parser.add_argument("--alpha", default=0.5, type=float, help="alpha value for smoothquant")
     parser.add_argument(
         "--ipex-weight-only-quantization",
@@ -154,8 +154,9 @@ def main(args_in: Optional[List[str]] = None) -> None:
         if args.config_file is not None:
             infer_cmd.extend(["--config-file", str(args.config_file)])
 
-        print("running model generation...")
+        print("LLM RUNTIME INFO: running model generation...")
         subprocess.run(infer_cmd)
+        print("LLM RUNTIME INFO: finished generation process, exiting...")
     else:
         if args.config_file is None:
             config = AutoConfig.from_pretrained(
@@ -231,7 +232,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
             quant_cmd.extend(["--ipex-smooth-quant"])
             quant_cmd.extend(["--alpha", str(args.alpha)])
             quant_cmd.extend(["--dataset", str(args.dataset)])
-            print("quantizing model ...")
+            quant_cmd.extend(["--qconfig-summary-file", str(args.qconfig_summary_file)])
+            print("LLM RUNTIME INFO: quantizing model ...")
             subprocess.run(quant_cmd)
             infer_cmd.extend(
                 ["--quantized-model-path", str(args.output_dir) + "/best_model.pt"]
@@ -268,8 +270,9 @@ def main(args_in: Optional[List[str]] = None) -> None:
         if args.config_file is not None:
             infer_cmd.extend(["--config-file", str(args.config_file)])
 
-        print("running model generation...")
+        print("LLM RUNTIME INFO: running model generation...")
         subprocess.run(infer_cmd)
+        print("LLM RUNTIME INFO: finished generation process, exiting...")
 
     else:
         path = Path(parent_path, "distributed/run_generation_with_deepspeed.py")
@@ -296,6 +299,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
                 Path.mkdir(model_path)
             shard_cmd.extend(["--save-path", str(args.output_dir)+str(MODEL_CLASSES[model_type])])
             shard_cmd.extend(["--local_rank", str(args.local_rank)])
+            print("LLM RUNTIME INFO: sharding model...")
             subprocess.run(shard_cmd)
             infer_cmd.extend(["-m", str(args.output_dir)+str(MODEL_CLASSES[model_type])])
         else:
@@ -333,8 +337,9 @@ def main(args_in: Optional[List[str]] = None) -> None:
         if args.int8_bf16_mixed:
             infer_cmd.extend(["--int8-bf16-mixed"])
 
-        print("running model generation with deepspeed (autotp)...")
+        print("LLM RUNTIME INFO: running model generation with deepspeed (autotp)...")
         subprocess.run(infer_cmd)
+        print("LLM RUNTIME INFO: finished generation process, exiting...")
 
 
 if __name__ == "__main__":
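Putting the run.py changes together: the launcher gains a `--qconfig-summary-file` option and forwards it to the quantization subprocess, where an empty string (the default) means "no tuned recipe". The sketch below is a condensed illustration of that wiring; the quantization-script path and the reduced set of forwarded options are simplified placeholders, not the full command that run.py actually builds.

```python
# Condensed sketch of the launcher-side wiring (placeholder script path and options).
import argparse
import subprocess

parser = argparse.ArgumentParser()
parser.add_argument("--ipex-smooth-quant", action="store_true", help="smoothquant for static quantization")
parser.add_argument("--qconfig-summary-file", default="", help="qconfig for static quantization")
parser.add_argument("--alpha", default=0.5, type=float, help="alpha value for smoothquant")
parser.add_argument("--output-dir", nargs="?", default="./saved_results")
args = parser.parse_args()

quant_cmd = ["python", "single_instance/run_llama_quantization.py", "--output-dir", str(args.output_dir)]
if args.ipex_smooth_quant:
    quant_cmd.extend(["--ipex-smooth-quant"])
    quant_cmd.extend(["--alpha", str(args.alpha)])
    # An empty value means "no tuned recipe": the downstream script then calibrates with
    # the IPEX default recipe and also writes a qconfig summary into --output-dir.
    quant_cmd.extend(["--qconfig-summary-file", str(args.qconfig_summary_file)])

print("LLM RUNTIME INFO: quantizing model ...")
subprocess.run(quant_cmd)
```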

examples/cpu/inference/python/llm/single_instance/run_accuracy.py

Lines changed: 0 additions & 3 deletions
@@ -419,7 +419,6 @@ def _model_call(
             enabled=True
             if args.int8_bf16_mixed or self._dtype == torch.bfloat16
             else False,
-            dtype=torch.bfloat16,
         ):
             if self._dtype != "int8":
                 if (
@@ -517,7 +516,6 @@ def _model_call(
             enabled=True
             if args.int8_bf16_mixed or self._dtype == torch.bfloat16
             else False,
-            dtype=torch.bfloat16,
         ):
             if self._with_jit:
                 output = self.model(
@@ -534,7 +532,6 @@ def _model_call(
             enabled=True
             if args.int8_bf16_mixed or self._dtype == torch.bfloat16
             else False,
-            dtype=torch.bfloat16,
         ):
             if self._with_jit:
                 output = self.model(
