Commit 30b70e4

Support dict input for quantization prepare (#1682)
* Enable dict input for ipex quantization prepare
* code format
* add UT
* code format
* code clean up
1 parent c0daaa5 commit 30b70e4
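Taken together, the changes below thread an example_kwarg_inputs path through prepare, calibration, convert, and tracing. A minimal end-to-end sketch of that flow, assuming an illustrative two-input ToyModel and ipex.quantization.default_static_qconfig (both are assumptions, not part of this commit); the keyword-argument calls mirror what the new unit test exercises:

import torch
import intel_extension_for_pytorch as ipex

# Illustrative two-input model (not part of the commit).
class ToyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear1 = torch.nn.Linear(3, 3)
        self.linear2 = torch.nn.Linear(3, 3)

    def forward(self, x1, x2):
        return self.linear1(x1) + self.linear2(x2)

model = ToyModel().eval()
qconfig = ipex.quantization.default_static_qconfig  # assumed static qconfig
kwargs = {"x1": torch.randn(3, 3), "x2": torch.randn(3, 3)}

# New in this commit: example inputs may be supplied as a dict of keyword arguments.
prepared = ipex.quantization.prepare(model, qconfig, example_kwarg_inputs=kwargs)
prepared(**kwargs)  # calibration run, driven by keyword arguments

with torch.no_grad():
    converted = ipex.quantization.convert(prepared)
    traced = torch.jit.trace(converted, example_kwarg_inputs=kwargs)
    traced = torch.jit.freeze(traced)
    out = traced(**kwargs)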

File tree

4 files changed: +184, -21 lines changed


intel_extension_for_pytorch/quantization/_quantize.py

Lines changed: 27 additions & 7 deletions
@@ -18,20 +18,32 @@
 )
 from ._quantize_utils import auto_prepare, auto_convert, copy_prepared_model
 from .. import nn
+from typing import Dict
 
 
-def prepare(model, configure, example_inputs=None, inplace=False, bn_folding=True):
+def prepare(
+    model,
+    configure,
+    example_inputs=None,
+    inplace=False,
+    bn_folding=True,
+    example_kwarg_inputs=None,
+):
     r"""
     Prepare an FP32 torch.nn.Module model to do calibration or to convert to quantized model.
 
     Args:
         model (torch.nn.Module): The FP32 model to be prepared.
         configure (torch.quantization.qconfig.QConfig): The observer settings about activation and weight.
         example_inputs (tuple or torch.Tensor): A tuple of example inputs that
-            will be passed to the function while running to init quantization state.
+            will be passed to the function while running to init quantization state. Only one of this
+            argument or ``example_kwarg_inputs`` should be specified.
         inplace: (bool): It will change the given model in-place if True. The default value is ``False``.
         bn_folding: (bool): whether to perform ``conv_bn`` and ``linear_bn`` folding.
-            The default value is ``True``.
+            The default value is ``True``.
+        example_kwarg_inputs (dict): A dict of example inputs that will be passed to the function while
+            running to init quantization state. Only one of this argument or ``example_inputs`` should be
+            specified.
 
     Returns:
         torch.nn.Module
@@ -52,9 +64,10 @@ def prepare(model, configure, example_inputs=None, inplace=False, bn_folding=Tru
     if isinstance(configure, QConfigMapping):
         configure = configure.global_qconfig
     if not isinstance(configure.activation(), PlaceholderObserver):
-        assert (
-            example_inputs is not None
-        ), "IPEX quantization.prepare: example inputs cannot be None for static quantization"
+        assert example_inputs is not None or example_kwarg_inputs is not None, (
+            "IPEX quantization.prepare: example_inputs and example_kwarg_inputs cannot be none at same time "
+            "for static quantization."
+        )
     # auto model channels_last memory format conversion
     from ..frontend import (
         auto_channels_last,
@@ -81,12 +94,19 @@ def prepare(model, configure, example_inputs=None, inplace=False, bn_folding=Tru
 
     # replace dropout with identity to enable more fusion pattern.
     nn.utils._model_convert.replace_dropout_with_identity(prepare_model)
+    assert (
+        example_inputs is None or example_kwarg_inputs is None
+    ), "IPEX quantization.prepare: example_inputs and example_kwarg_inputs cannot be set at same time."
     # Special case for common case of passing a single Tensor
     if isinstance(example_inputs, (torch.Tensor, dict)):
         example_inputs = (example_inputs,)
     elif not isinstance(example_inputs, tuple) and example_inputs is not None:
         example_inputs = tuple(example_inputs)
-    return auto_prepare(prepare_model, configure, example_inputs)
+    if example_kwarg_inputs is not None:
+        assert isinstance(
+            example_kwarg_inputs, Dict
+        ), "IPEX quantization.prepare: example_kwarg_inputs must be type of Dict."
+    return auto_prepare(prepare_model, configure, example_inputs, example_kwarg_inputs)
 
 
 @functools.lru_cache(None)
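As the assertions in this diff spell out, example_inputs and example_kwarg_inputs are mutually exclusive, and at least one must be supplied for static quantization. Continuing the illustrative ToyModel sketch above, the caller contract looks roughly like this (model, qconfig, and kwargs come from that sketch):

# Exactly one input style is accepted by prepare().
ipex.quantization.prepare(model, qconfig, example_inputs=(torch.randn(3, 3), torch.randn(3, 3)))
ipex.quantization.prepare(model, qconfig, example_kwarg_inputs=kwargs)

# Passing both trips the new assert; passing neither fails the static-quantization check.
try:
    ipex.quantization.prepare(
        model,
        qconfig,
        example_inputs=(torch.randn(3, 3), torch.randn(3, 3)),
        example_kwarg_inputs=kwargs,
    )
except AssertionError as err:
    print(err)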

intel_extension_for_pytorch/quantization/_quantize_utils.py

Lines changed: 15 additions & 4 deletions
@@ -73,6 +73,7 @@ def auto_prepare(
     model: torch.nn.Module,
     configure: QConfig,
     example_inputs: Optional[Tuple[Any]],
+    example_kwarg_inputs: Optional[Dict[Any, Any]],
 ) -> torch.nn.Module:
     def convert_to_interception_proxy(x):
         if isinstance(x, torch.Tensor):
@@ -486,10 +487,20 @@ def load_qconf_summary(self, qconf_summary):
     if not isinstance(configure.activation(), PlaceholderObserver):
         model.__class__ = QuantizationInterceptionModule
         # init model quantization state using example_inputs
-        assert (
-            example_inputs is not None
-        ), "IPEX: example inputs cannot be None for static quantization"
-        model(*example_inputs)
+        assert example_inputs is not None or example_kwarg_inputs is not None, (
+            "IPEX: example_inputs and example_kwarg_inputs cannot be None at same time "
+            "for static quantization."
+        )
+        if example_kwarg_inputs is None:
+            model(*example_inputs)
+        elif example_inputs is None:
+            model(**example_kwarg_inputs)
+        else:
+            raise AssertionError(
+                "IPEX quantization.prepare: example_inputs and example_kwarg_inputs cannot be set at same time "
+                "for static quantization."
+            )
     return model
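Inside auto_prepare, the calibration run now dispatches on which input style was provided. A standalone sketch of that dispatch (the run_calibration helper name is made up for illustration and is not part of the source):

import torch

def run_calibration(model, example_inputs=None, example_kwarg_inputs=None):
    # Mirrors the branch added above: exactly one input style drives the forward pass.
    if example_inputs is not None and example_kwarg_inputs is not None:
        raise AssertionError("example_inputs and example_kwarg_inputs cannot be set at same time")
    if example_kwarg_inputs is not None:
        return model(**example_kwarg_inputs)  # keyword-style forward
    if example_inputs is not None:
        return model(*example_inputs)  # positional-style forward
    raise AssertionError("one of example_inputs or example_kwarg_inputs is required")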

tests/cpu/test_ao_jit_ipex_quantization.py

Lines changed: 85 additions & 0 deletions
@@ -25,6 +25,7 @@
     QConfig,
     PlaceholderObserver,
 )
+from torch.testing._internal.common_utils import run_tests
 
 default_weight_observer = PerChannelMinMaxObserver.with_args(
     dtype=torch.qint8, qscheme=torch.per_channel_symmetric
@@ -748,5 +749,89 @@ def forward(self, x, hx, cx):
         FileCheck().check_not("aten:lstm").check("aten::quantized_lstm").run(graph)
 
 
+class TestDictInput(JitLlgaTestCase):
+    def test_only_dict_input(self):
+        class SubModule(nn.Module):
+            def __init__(self):
+                super(SubModule, self).__init__()
+                self.linear = nn.Linear(3, 3)
+
+            def forward(self, x):
+                x = self.linear(x)
+                return x
+
+        class M(nn.Module):
+            def __init__(self):
+                super(M, self).__init__()
+                self.linear1 = nn.Sequential(nn.Linear(3, 3))
+                self.linear2 = SubModule()
+                self.linear3 = nn.Linear(3, 3)
+
+            def forward(self, x1, x2, x3):
+                x1 = self.linear1(x1)
+                x2 = self.linear2(x2)
+                x3 = self.linear3(x3)
+                return x1 + x2 + x3
+
+        int8_bf16_list = [True, False]
+        for qconfig, int8_bf16 in itertools.product(static_qconfig, int8_bf16_list):
+            # Step1: Test model with tuple(x1, x2, x3) input.
+            m = M().eval()
+            m2 = copy.deepcopy(m).eval()
+            x1 = torch.randn(3, 3)
+            x2 = torch.randn(3, 3)
+            x3 = torch.randn(3, 3)
+            graph = self.checkQuantizeTrace(
+                m, [x1, x2, x3], atol=2e-1, qconfig=qconfig, int8_bf16=int8_bf16
+            )
+            FileCheck().check("aten::linear").run(graph)
+            patterns = [
+                [
+                    "aten::dequantize",
+                    "aten::linear",
+                ],
+                [
+                    "aten::dequantize",
+                    "aten::linear",
+                    "aten::add",
+                ],
+                [
+                    "aten::dequantize",
+                    "aten::linear",
+                    "aten::add",
+                ],
+            ]
+            self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 3)
+            self.checkPatterns(graph, patterns)
+
+            # Step2: Test model with Dict{"x1": x1, "x2": x2, "x3": x3} input.
+            graph = self.checkQuantizeTrace(
+                m2,
+                atol=2e-1,
+                qconfig=qconfig,
+                int8_bf16=int8_bf16,
+                x_kwarg={"x1": x1, "x2": x2, "x3": x3},
+            )
+            FileCheck().check("aten::linear").run(graph)
+            patterns = [
+                [
+                    "aten::dequantize",
+                    "aten::linear",
+                ],
+                [
+                    "aten::dequantize",
+                    "aten::linear",
+                    "aten::add",
+                ],
+                [
+                    "aten::dequantize",
+                    "aten::linear",
+                    "aten::add",
+                ],
+            ]
+            self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 3)
+            self.checkPatterns(graph, patterns)
+
+
 if __name__ == "__main__":
     run_tests()

tests/cpu/test_ao_jit_llga_utils.py

Lines changed: 57 additions & 10 deletions
@@ -123,28 +123,58 @@ def assertFused(self, graph, fused_patterns):
         for pat in fused_patterns:
             self.assertGraphContainsExactly(graph, pat, 0)
 
+    def model_forward_helper(
+        self,
+        model,
+        x=None,
+        x_kwarg=None,
+    ):
+        if x is None and x_kwarg is None:
+            raise AssertionError(
+                "x and x_kwarg cannot be none at same time for model_forward_helper."
+            )
+        if x_kwarg is None:
+            return model(*x)
+        elif x is None:
+            return model(**x_kwarg)
+        else:
+            raise AssertionError(
+                "x and x_kwarg cannot be set at same time for model_forward_helper."
+            )
+
     def checkQuantizeTrace(
         self,
         model,
-        x,
+        x=None,
         atol=1e-3,
         rtol=1e-2,
         x_var=None,
         qconfig=default_static_qconfig,
         int8_bf16=False,
         freeze=True,
+        x_kwarg=None,
     ):
+        if x is None and x_kwarg is None:
+            raise AssertionError(
+                "x and x_kwarg cannot be none at same time for checkQuantizeTrace."
+            )
+        elif x is not None and x_kwarg is not None:
+            raise AssertionError(
+                "x and x_kwarg cannot be set at same time for checkQuantizeTrace."
+            )
+
         graph, traced_model, fp32_model = self.prepareModel(
-            model, x, qconfig, int8_bf16, freeze=freeze
+            model, x, qconfig, int8_bf16, freeze=freeze, x_kwarg=x_kwarg
         )
         with torch.no_grad():
-            y = fp32_model(*x)
+            y = self.model_forward_helper(fp32_model, x, x_kwarg)
             y = y.to(torch.bfloat16) if int8_bf16 else y
-            y_llga = traced_model(*x)
+            y_llga = self.model_forward_helper(traced_model, x, x_kwarg)
             self.assertEqual(y, y_llga, atol=atol, rtol=rtol)
 
         # test Fallback when input shape changes:
         if x_var:
+            assert x_kwarg is None, "x_kwarg input doesn't support use with x_var"
             y_var = fp32_model(*x_var)
             y_var = y_var.to(torch.bfloat16) if int8_bf16 else y_var
             y_var_llga = traced_model(*x_var)
@@ -161,35 +191,52 @@ def prepareModel(
         prepare_inplace=True,
         convert_inplace=True,
         freeze=True,
+        x_kwarg=None,
     ):
         model.eval()
         fp32_model = copy.deepcopy(model)
         with torch.no_grad(), torch._jit_internal._disable_emit_hooks():
             ipex.nn.utils._model_convert.replace_dropout_with_identity(model)
             model = ipex.quantization.prepare(
-                model, qconfig, x, inplace=prepare_inplace
+                model, qconfig, x, inplace=prepare_inplace, example_kwarg_inputs=x_kwarg
            )
             # do calibration
-            y = model(*x)
+            y = self.model_forward_helper(model, x, x_kwarg)
             # jit trace to insert quant/dequant
+
+            def jit_trace_helper(convert_model, x, x_kwarg):
+                if x_kwarg is None:
+                    return torch.jit.trace(convert_model, x)
+                elif x is None:
+                    return torch.jit.trace(convert_model, example_kwarg_inputs=x_kwarg)
+                else:
+                    raise AssertionError(
+                        "Can't set x and x_kwarg at same time for jit trace."
+                    )
+
             if int8_bf16:
                 with torch.cpu.amp.autocast():
                     convert_model = ipex.quantization.convert(
                         model, inplace=convert_inplace
                     )
-                    traced_model = torch.jit.trace(convert_model, x)
+                    traced_model = jit_trace_helper(convert_model, x, x_kwarg)
             else:
                 convert_model = ipex.quantization.convert(
                     model, inplace=convert_inplace
                 )
-                traced_model = torch.jit.trace(convert_model, x)
+                traced_model = jit_trace_helper(convert_model, x, x_kwarg)
             if freeze:
                 traced_model = torch.jit.freeze(traced_model)
 
             # warm up run
-            y0 = traced_model(*x)
+            y0 = self.model_forward_helper(traced_model, x, x_kwarg)
             # get the graph at the second run after freezing
-            graph = traced_model.graph_for(*x)
+            if x_kwarg is None:
+                graph = traced_model.graph_for(*x)
+            elif x is None:
+                graph = traced_model.graph_for(**x_kwarg)
+            else:
+                raise AssertionError("Can't set x and x_kwarg at same time")
             return graph, traced_model, fp32_model
 
     def checkPatterns(self, graph, patterns):
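The test-utility changes above all follow the same pattern: each call site that used to unpack x positionally now chooses between positional and keyword unpacking. A condensed sketch of the trace-and-capture path (trace_and_capture is an illustrative name; torch.jit.trace with example_kwarg_inputs, torch.jit.freeze, and graph_for are the calls the diff itself relies on):

import torch

def trace_and_capture(converted_model, x=None, x_kwarg=None):
    # Trace with whichever example-input style was supplied.
    if x_kwarg is None:
        traced = torch.jit.trace(converted_model, x)
    else:
        traced = torch.jit.trace(converted_model, example_kwarg_inputs=x_kwarg)
    traced = torch.jit.freeze(traced)

    # Warm-up run, then capture the optimized graph on the next run after freezing.
    if x_kwarg is None:
        traced(*x)
        return traced.graph_for(*x), traced
    traced(**x_kwarg)
    return traced.graph_for(**x_kwarg), traced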
