Commit 7c598e4

enable optimized codegen (#2257)
1 parent 99aa54f commit 7c598e4

17 files changed, +312 −13 lines changed


examples/cpu/inference/python/llm/README.md

Lines changed: 6 additions & 3 deletions
@@ -1,6 +1,6 @@
 # Text Generation
 We provide the inference benchmarking scripts for large language models text generation.<br/>
-Support large language model families, including GPT-J, LLaMA, GPT-Neox, OPT, Falcon.<br/>
+Support large language model families, including GPT-J, LLaMA, GPT-Neox, OPT, Falcon, CodeGen.<br/>
 The scripts include both single instance and distributed (DeepSpeed) use cases.<br/>
 The scripts cover model generation inference with low precions cases for different models with best perf and accuracy (bf16 AMP,static quantization and weight only quantization).<br/>

@@ -84,10 +84,11 @@ wget https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prom
 |GPT-NEOX| "EleutherAI/gpt-neox-20b" ||||** |
 |FALCON*|"tiiuae/falcon-40b" ||||**|
 |OPT|"facebook/opt-30b", "facebook/opt-1.3b"||||**|
+|CodeGen|"Salesforce/codegen-2B-multi"||||**|

 *For Falcon models from remote hub, we need to modify the config.json to use the modeling_falcon.py in transformers. Therefore, in the following scripts, we need to pass an extra configuration file like "--config-file=model_config/tiiuae_falcon-40b_config.json". This is optional for FP32/BF16 but needed for quantizations.

-** For GPT-NEOX/FALCON/OPT models, the accuracy recipes of static quantization INT8 are not ready thus they will be skipped in our coverage.
+** For GPT-NEOX/FALCON/OPT/CodeGen models, the accuracy recipes of static quantization INT8 are not ready thus they will be skipped in our coverage.

 *Note*: The above verified models (including other models in the same model family, like "codellama/CodeLlama-7b-hf" from LLAMA family) are well supported with all optimizations like indirect access KV cache, fused ROPE, and prepacked TPP Linear (fp32/bf16). For other LLM model families, we are working in progress to cover those optimizations, which will expand the model list above.

@@ -180,6 +181,8 @@ python run_gpt-neox_quantization.py --ipex-weight-only-quantization --output-dir
 python run_falcon_quantization.py --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed -m <FALCON MODEL_ID> --config-file <CONFIG_FILE>
 ## OPT quantization
 python run_opt_quantization.py --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed -m <OPT MODEL_ID>
+## CodeGen quantization
+python run_codegen_quantization.py --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed -m <CODEGEN MODEL_ID>

 ## (2) Run quantization performance test (note that GPT-NEOX uses --int8 instead of --int8-bf16-mixed)
 OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_<MODEL>_quantization.py -m <MODEL_ID> --quantized-model-path "./saved_results/best_model.pt" --benchmark --int8-bf16-mixed

@@ -277,7 +280,7 @@ export WORK_DIR=./
 cd distributed
 mv PATH/TO/prompt.json ./

-# Run GPTJ/LLAMA/OPT/Falcon with bfloat16 DeepSpeed
+# Run GPTJ/LLAMA/OPT/Falcon/CodeGen with bfloat16 DeepSpeed
 deepspeed --bind_cores_to_rank run_generation_with_deepspeed.py --benchmark -m <MODEL_ID> --dtype bfloat16 --ipex --deployment-mode

 # Run GPT-NeoX with ipex weight only quantization

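For context on what these README changes enable: the newly listed checkpoint loads through the standard Hugging Face causal-LM classes used throughout these scripts. A minimal sketch, assuming the "Salesforce/codegen-2B-multi" id from the table above; the prompt and generation settings are illustrative assumptions, and the actual benchmarking flow (IPEX optimizations, deployment mode, quantization) lives in the run_*.py scripts referenced in the README.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustrative load-and-generate sketch; settings below are arbitrary, not the scripts' defaults.
model_id = "Salesforce/codegen-2B-multi"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
model.eval()

inputs = tokenizer("def fibonacci(n):", return_tensors="pt")
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(out[0], skip_special_tokens=True))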
examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py

Lines changed: 2 additions & 1 deletion
@@ -25,6 +25,7 @@
     "opt": (AutoModelForCausalLM, AutoTokenizer),
     "llama": (AutoModelForCausalLM, LlamaTokenizer),
     "falcon": (AutoModelForCausalLM, AutoTokenizer),
+    "codegen": (AutoModelForCausalLM, AutoTokenizer),
     "auto": (AutoModelForCausalLM, AutoTokenizer),
 }

@@ -336,7 +337,7 @@ def _model_call(
         for text in inputs:
             input_ids = text.to(self._device)
             input_bs = inputs.shape[0] * self.num_beams
-            if re.search("GPTJ", self.base_model.config.architectures[0]):
+            if re.search("GPTJ", self.base_model.config.architectures[0]) or re.search("codegen", self.base_model.config.architectures[0], re.IGNORECASE):
                 beam_idx_tmp = torch.zeros(
                     (2048, int(input_bs)), dtype=torch.long
                 ).contiguous()

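The "codegen" entry added to MODEL_CLASSES follows the existing pattern: a family key mapped to a (model class, tokenizer class) pair. A minimal sketch of how such a mapping is typically consumed, with "auto" as the fallback; the load_model_and_tokenizer helper below is a hypothetical illustration, not code from these scripts.

from transformers import AutoModelForCausalLM, AutoTokenizer

# Hypothetical helper mirroring the (model_class, tokenizer_class) mapping above.
MODEL_CLASSES = {
    "codegen": (AutoModelForCausalLM, AutoTokenizer),
    "auto": (AutoModelForCausalLM, AutoTokenizer),
}

def load_model_and_tokenizer(model_id: str):
    # Choose the family key by substring match on the model id, defaulting to "auto".
    family = next((k for k in MODEL_CLASSES if k in model_id.lower()), "auto")
    model_class, tokenizer_class = MODEL_CLASSES[family]
    return model_class.from_pretrained(model_id), tokenizer_class.from_pretrained(model_id)

# Example: model, tok = load_model_and_tokenizer("Salesforce/codegen-2B-multi")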
examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py

Lines changed: 1 addition & 0 deletions
@@ -32,6 +32,7 @@
     "opt": (AutoModelForCausalLM, AutoTokenizer),
     "falcon": (AutoModelForCausalLM, AutoTokenizer),
     "chatglm": (AutoModelForCausalLM, AutoTokenizer),
+    "codegen": (AutoModelForCausalLM, AutoTokenizer),
     "auto": (AutoModelForCausalLM, AutoTokenizer),
 }

examples/cpu/inference/python/llm/run.py

Lines changed: 2 additions & 0 deletions
@@ -181,6 +181,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
         )
     elif re.search("OPT", config.architectures[0], re.IGNORECASE):
         qpath = Path(parent_path, "single_instance/run_opt_quantization.py")
+    elif re.search("codegen", config.architectures[0], re.IGNORECASE):
+        qpath = Path(parent_path, "single_instance/run_codegen_quantization.py")

     infer_cmd = ["python", qpath]
     # 1) quantization

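The quantization-script dispatch in run.py keys off config.architectures[0]. For CodeGen checkpoints the reported architecture name is mixed case (expected to be "CodeGenForCausalLM"), which is why the new branch, unlike the original GPTJ check, passes re.IGNORECASE; the check below is an illustrative assumption to verify against the actual hub config.

import re
from transformers import AutoConfig

config = AutoConfig.from_pretrained("Salesforce/codegen-2B-multi")
print(config.architectures)  # expected: ['CodeGenForCausalLM']

# A case-sensitive re.search("codegen", ...) would return None for "CodeGenForCausalLM",
# so the dispatch uses re.IGNORECASE.
assert re.search("codegen", config.architectures[0], re.IGNORECASE) is not None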
examples/cpu/inference/python/llm/single_instance/run_accuracy.py

Lines changed: 2 additions & 1 deletion
@@ -17,6 +17,7 @@
     "opt": (AutoModelForCausalLM, AutoTokenizer),
     "llama": (AutoModelForCausalLM, LlamaTokenizer),
     "falcon": (AutoModelForCausalLM, AutoTokenizer),
+    "codegen": (AutoModelForCausalLM, AutoTokenizer),
     "auto": (AutoModelForCausalLM, AutoTokenizer),
 }

@@ -172,7 +173,7 @@ def _model_call(
         for text in inputs:
             input_ids = text.to(self._device)
             input_bs = inputs.shape[0] * self.num_beams
-            if re.search("GPTJ", self.base_model.config.architectures[0]):
+            if re.search("GPTJ", self.base_model.config.architectures[0]) or re.search("codegen", self.base_model.config.architectures[0], re.IGNORECASE):
                 beam_idx_tmp = torch.zeros(
                     (2048, int(input_bs)), dtype=torch.long
                 ).contiguous()

examples/cpu/inference/python/llm/single_instance/run_generation.py

Lines changed: 1 addition & 0 deletions
@@ -19,6 +19,7 @@
     "llama": (AutoModelForCausalLM, LlamaTokenizer),
     "opt": (AutoModelForCausalLM, AutoTokenizer),
     "falcon": (AutoModelForCausalLM, AutoTokenizer),
+    "codegen": (AutoModelForCausalLM, AutoTokenizer),
     "auto": (AutoModelForCausalLM, AutoTokenizer),
 }

intel_extension_for_pytorch/transformers/generation/beam_search.py

Lines changed: 4 additions & 1 deletion
@@ -176,14 +176,17 @@ def _beam_search(
         or re.search("OPT", self.config.architectures[0], re.IGNORECASE)
         or re.search("falcon", self.config.architectures[0], re.IGNORECASE)
         or re.search("rw", self.config.architectures[0], re.IGNORECASE)
+        or re.search("codegen", self.config.architectures[0], re.IGNORECASE)
     ):
         first_token = False
         input_bs = input_ids.size()[0]
         has_position_id = True
         if model_inputs["past_key_values"] is None:
             first_token = True
         if first_token:
-            if re.search("GPTJ", self.config.architectures[0]):
+            if re.search("GPTJ", self.config.architectures[0]) or re.search(
+                "codegen", self.config.architectures[0], re.IGNORECASE
+            ):
                 beam_idx_tmp = torch.zeros(
                     (2048, int(batch_size * num_beams)), dtype=torch.long
                 ).contiguous()

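On the first (prefill) step, the beam-search path above preallocates a zeros tensor of shape (2048, batch_size * num_beams) as a beam-index placeholder, presumably sized to a maximum of 2048 cached positions for IPEX's indirect-access KV cache; the surrounding cache-tuple layout is IPEX-internal and not reproduced here. A shape-only illustration:

import torch

batch_size, num_beams = 1, 4
# Placeholder mapping each cached position to the beam it belongs to,
# preallocated for up to 2048 positions (matching the constant in the diff above).
beam_idx_tmp = torch.zeros((2048, batch_size * num_beams), dtype=torch.long).contiguous()
print(beam_idx_tmp.shape)  # torch.Size([2048, 4])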
intel_extension_for_pytorch/transformers/generation/greedy_search.py

Lines changed: 4 additions & 1 deletion
@@ -157,13 +157,16 @@ def _greedy_search(
         or re.search("OPT", self.config.architectures[0], re.IGNORECASE)
         or re.search("falcon", self.config.architectures[0], re.IGNORECASE)
         or re.search("rw", self.config.architectures[0], re.IGNORECASE)
+        or re.search("codegen", self.config.architectures[0], re.IGNORECASE)
     ):
         first_token = False
         input_bs = input_ids.size()[0]
         if model_inputs["past_key_values"] is None:
             first_token = True
         if first_token:
-            if re.search("GPTJ", self.config.architectures[0]):
+            if re.search("GPTJ", self.config.architectures[0]) or re.search(
+                "codegen", self.config.architectures[0], re.IGNORECASE
+            ):
                 beam_idx_tmp = torch.zeros(
                     (2048, int(input_bs)), dtype=torch.long
                 ).contiguous()

intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py

Lines changed: 5 additions & 0 deletions
@@ -26,6 +26,11 @@ def __init__(self, module, config, tpp=False, woq=False):
             self.rope_base,
             self.model_backbone,
         )
+
+        if re.search("GPTJ", self.model_backbone, re.IGNORECASE) or re.search(
+            "codegen", self.model_backbone, re.IGNORECASE
+        ):
+            self._IPEXROPE.embed_positions.sin_cos = self.embed_positions
         if re.search("GPTJ", self.model_backbone, re.IGNORECASE) or re.search(
             "LLAMA", self.model_backbone, re.IGNORECASE
         ):

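The new branch in attentions.py shares the GPT-J-style rotary-embedding table with CodeGen by assigning self.embed_positions to the IPEX RoPE module. For reference, a conceptual sketch of the sinusoidal table that GPT-J/CodeGen models build in Hugging Face Transformers (create_sinusoidal_positions); the exact tensor IPEX stores in embed_positions.sin_cos may be laid out differently.

import torch

# Conceptual reference, not the IPEX kernel: precompute a [num_pos, rotary_dim]
# table whose first half holds sin values and second half holds cos values.
def create_sinusoidal_positions(num_pos: int, dim: int) -> torch.Tensor:
    inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
    sinusoid = torch.einsum("i,j->ij", torch.arange(num_pos, dtype=torch.float), inv_freq)
    return torch.cat((torch.sin(sinusoid), torch.cos(sinusoid)), dim=-1)

table = create_sinusoidal_positions(2048, 64)  # e.g. 2048 positions, rotary_dim 64
print(table.shape)  # torch.Size([2048, 64])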
intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py

Lines changed: 9 additions & 0 deletions
@@ -67,5 +67,14 @@ def __init__(self, module, config, tpp=False, woq=False):
                 self.linear_add = _IPEXlinearAddCPU(
                     module.linear_add.linear, tpp=tpp, woq=woq
                 )
+        elif re.search("codegen", self.model_backbone, re.IGNORECASE):
+            if not self.distributed:
+                self.linear_add_add = _IPEXlinearAddAddCPU(
+                    module.linear_add_add.linear, tpp=tpp, woq=woq
+                )
+            # woq_linear_gelu has accuracy issues on codegen, disable it
+            self.linear_gelu = _IPEXlinearNewGeluCPU(
+                module.linear_gelu.linear, tpp=tpp and not woq, woq=False
+            )
         else:
             AssertionError(False, "Do not support the optimization of your model yet")

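The CodeGen branch in decoder.py reuses the GPT-J-style fused linear-plus-"new GELU" block, keeping the weight-only-quantized variant disabled per the in-line comment. For reference, a plain-PyTorch sketch of what such a block computes, using the tanh-approximated GELU that GPT-J/CodeGen models use in Transformers; the IPEX module fuses this on CPU and its internals are not shown here.

import math
import torch
import torch.nn.functional as F

def linear_new_gelu(x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor) -> torch.Tensor:
    # Linear projection followed by the "NewGELU" (tanh-approximate GELU) activation.
    y = F.linear(x, weight, bias)
    return 0.5 * y * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (y + 0.044715 * torch.pow(y, 3.0))))

x = torch.randn(2, 8)
w = torch.randn(16, 8)   # out_features=16, in_features=8
b = torch.zeros(16)
print(linear_new_gelu(x, w, b).shape)  # torch.Size([2, 16])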