Commit a821c0a

Disable repack (#1563)
* allow disable re-pack by global flag (#1522)
* enable linear fusion without jit repack
* fix bug and add ut for re-pack flag
* fix ut & add ut for linear fusion without repack
* fix linear schema in concat_linear test
* fix format
* enable concat linear on ipex_linear and mkl_linear
* Revert "enable concat linear on ipex_linear and mkl_linear"
  This reverts commit 68dd4561545be81bf9f9e07c065c9f17fefbc46c.
* fix ut
* add comments for why we still repack linear by default
* format change
1 parent 1f1ee89 commit a821c0a

File tree

5 files changed: +228 -78 lines changed


csrc/jit/auto_opt_config.h

Lines changed: 18 additions & 0 deletions
@@ -17,9 +17,26 @@ class TORCH_API AutoOptConfig {
     return jit_fuse_;
   }
 
+  inline void set_jit_repack_for_linear(bool jit_repack_for_linear) {
+    jit_repack_for_linear_ = jit_repack_for_linear;
+  }
+
+  inline bool get_jit_repack_for_linear() {
+    return jit_repack_for_linear_;
+  }
+
  private:
   AutoOptConfig()
       : jit_fuse_(true),
+        // JIT repack (ipex linear -> aten linear -> ipex linear) uses extra
+        // memory because the original graph is always held by design:
+        // https://github.com/pytorch/pytorch/blob/8e2a86c2a54719fd66a3e612fe8b433fbb1d4522/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp#L668
+        // We use this flag to let customers disable repack to save memory.
+        // Repack stays enabled by default for 2 reasons:
+        // (1) the JIT repack stage sees a real input, so the block format
+        // will be the best format; (2) linear + binary cannot be folded if
+        // we do not repack, since that folding is implemented on aten::linear.
+        jit_repack_for_linear_(true),
         calibration_step_(false),
         qscheme_(at::QScheme::PER_TENSOR_AFFINE) {}
 
@@ -28,6 +45,7 @@ class TORCH_API AutoOptConfig {
   AutoOptConfig& operator=(const AutoOptConfig&) = default;
 
   bool jit_fuse_;
+  bool jit_repack_for_linear_;
   // the flag for one iteration of calibration step whether end or not.
   bool calibration_step_;
   at::QScheme qscheme_;
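
The second reason in the comment above refers to patterns like the minimal sketch below (illustrative only, not part of this commit): a Linear followed by a binary add. With repack enabled, the fusion pass can rewrite the frozen ipex linear back to aten::linear, where the linear + binary folding is implemented; with repack disabled, that folding opportunity is given up in exchange for the memory the repack round-trip would otherwise use.

    import torch
    import torch.nn as nn

    class LinearAdd(nn.Module):
        # Hypothetical module used only to illustrate the linear + binary pattern.
        def __init__(self):
            super().__init__()
            self.fc = nn.Linear(768, 768)

        def forward(self, x, y):
            # Linear followed by a binary op: per the comment above, foldable only
            # when the pass sees aten::linear, i.e. when repack is left enabled.
            return self.fc(x) + y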

csrc/jit/fusion_pass.cpp

Lines changed: 5 additions & 2 deletions
@@ -1,5 +1,6 @@
 #include "fusion_pass.h"
 #include <string>
+#include "auto_opt_config.h"
 #include "codegen/onednn/interface.h"
 #include "cpu/kernels/Matmul.h"
 #include "passes/concat_linear.h"
@@ -132,8 +133,10 @@ void IPEXFusionPass(std::shared_ptr<Graph>& graph) {
   // up fusion pass, will further abstract this as a class method.
   auto aten_linear_recorder = ATenLinearRecorder(graph);
   // linear folding
-  graph_rewrite::replaceFrozenIPEXLinearWithAtenLinear(
-      graph, aten_linear_recorder.use_mkl());
+  if (AutoOptConfig::singleton().get_jit_repack_for_linear()) {
+    graph_rewrite::replaceFrozenIPEXLinearWithAtenLinear(
+        graph, aten_linear_recorder.use_mkl());
+  }
   // concat multi-linear with same input
   torch_ipex::jit::FrozenConcatLinear(
       graph, aten_linear_recorder.get_records());
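
For context, a minimal sketch of where this pass runs in a typical IPEX workflow (assuming the usual ipex.optimize + trace + freeze flow; the model and shapes below are placeholders, not part of this diff). The guarded replaceFrozenIPEXLinearWithAtenLinear call happens inside IPEXFusionPass when the frozen graph is first executed, so the repack flag must be set before that first run.

    import torch
    import intel_extension_for_pytorch as ipex

    model = torch.nn.Linear(768, 768).eval()            # stand-in model with a Linear layer
    example_input = torch.randn(8, 768)
    model = ipex.optimize(model, dtype=torch.bfloat16)   # prepacks weights, swaps in ipex linear
    with torch.no_grad(), torch.cpu.amp.autocast():
        traced = torch.jit.trace(model, example_input)
        traced = torch.jit.freeze(traced)
        traced(example_input)   # profiling + IPEXFusionPass run on the frozen graph here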

csrc/jit/passes/graph_rewrite_linear.cpp

Lines changed: 103 additions & 67 deletions
@@ -2,6 +2,7 @@
 #include <ideep.hpp>
 #include "passes/utils.h"
 
+#include "auto_opt_config.h"
 #include "graph_rewrite.h"
 #include "graph_rewrite_utils.h"
 
@@ -93,6 +94,101 @@ void replaceFrozenIPEXLinearWithAtenLinear(
   EliminateDeadCode(graph);
 }
 
+void replaceAtenLinearWithPrepackNode(
+    Node* n,
+    std::unordered_set<Node*>& aten_linear,
+    const bool& use_mkl_sgemm) {
+  WithInsertPoint guard(n);
+  auto graph = n->owningGraph();
+  auto input_size_option =
+      n->inputs().at(0)->type()->cast<TensorType>()->sizes().concrete_sizes();
+  if (!(input_size_option.has_value() &&
+        input_size_option.value().size() >= 2)) {
+    return;
+  }
+  auto input_size = input_size_option.value();
+  int64_t b_size =
+      std::accumulate(
+          input_size.begin(), input_size.end(), 1, std::multiplies<double>()) /
+      input_size[input_size.size() - 1];
+  IValue batch_size_value(b_size);
+  auto batch_size = graph->insertConstant(batch_size_value);
+  auto tt = n->inputs().at(1)->type()->cast<TensorType>();
+  auto weight_size_option = tt->sizes().concrete_sizes();
+  if (!(weight_size_option.has_value() &&
+        weight_size_option.value().size() == 2)) {
+    return;
+  }
+  auto weight_dtype_option = tt->scalarType();
+  bool should_repack = aten_linear.find(n) == aten_linear.end() &&
+      AutoOptConfig::singleton().get_jit_repack_for_linear();
+
+  // We should pack an aten linear into an ipex prepack linear for 2 cases:
+  // (1) repack case: this aten linear was created from an ipex linear;
+  // (2) BF16 case: we believe an IPEX BF16 prepack linear is always better
+  // than an aten BF16 linear.
+  bool should_pack_for_bf16 = weight_dtype_option.has_value() &&
+      weight_dtype_option.value() == at::ScalarType::BFloat16 &&
+      ideep::has_bf16_type_support();
+  bool should_pack = should_repack || should_pack_for_bf16;
+  if (!(should_pack))
+    return;
+
+  auto weight_size = weight_size_option.value();
+
+  // Note that once creating a graph node, make sure it is also inserted into
+  // the graph, for: PyTorch (when disabled TE) has a check on the graph node,
+  // pointing out that every mutable value in the system has a corresponding
+  // element. So if creating a graph node but not inserted, it will not pass
+  // the check since its graph element is not initialized. Details please
+  // refer to
+  // https://github.com/pytorch/pytorch/blob/master/torch/csrc/jit/ir/alias_analysis.cpp#L1956
+  auto use_mkl_sgemm_ =
+      use_mkl_sgemm && weight_dtype_option.value() != at::ScalarType::BFloat16;
+  auto prepack_node = graph->create(
+      use_mkl_sgemm_ ? Symbol::fromQualString("ipex_prepack::mkl_sgemm_prepack")
+                     : Symbol::fromQualString("ipex_prepack::linear_prepack"),
+      1);
+  for (auto i = 1; i < n->inputs().size(); ++i) {
+    Value* v = n->inputs().at(i);
+    prepack_node->addInput(v);
+  }
+  prepack_node->addInput(batch_size);
+  prepack_node->output()->setType(
+      use_mkl_sgemm_
+          ? getCustomClass("__torch__.torch.classes.ipex_prepack.MKLOpContext")
+          : getCustomClass(
+                "__torch__.torch.classes.ipex_prepack.LinearOpContext"));
+  graph->insertNode(prepack_node);
+  auto prepack_linear = graph->insertNode(graph->create(
+      use_mkl_sgemm_ ? Symbol::fromQualString("ipex_prepack::mkl_sgemm_run")
+                     : Symbol::fromQualString("ipex_prepack::linear_run"),
+      1));
+  prepack_linear->addInput(n->inputs().at(0));
+  prepack_linear->addInput(prepack_node->output());
+  prepack_linear->output()->setType(n->output()->type()->cast<TensorType>());
+  auto v = n->outputs().at(0);
+  n->output()->replaceAllUsesWith(prepack_linear->output());
+}
+
+void replaceIpexLinearWithLinearRunNode(Node* n) {
+  WithInsertPoint guard(n);
+  auto graph = n->owningGraph();
+  auto use_mkl_sgemm =
+      n->kind() == Symbol::fromQualString("torch_ipex::ipex_MKLSGEMM");
+  auto get_data_handle_node = n->inputs().at(3)->node();
+  auto linear_ctx = get_data_handle_node->inputs().at(0);
+  auto linear_run = graph->insertNode(graph->create(
+      use_mkl_sgemm ? Symbol::fromQualString("ipex_prepack::mkl_sgemm_run")
                    : Symbol::fromQualString("ipex_prepack::linear_run"),
+      1));
+  linear_run->addInput(n->inputs().at(0));
+  linear_run->addInput(linear_ctx);
+  linear_run->output()->setType(n->output()->type()->cast<TensorType>());
+  n->output()->replaceAllUsesWith(linear_run->output());
+  return;
+}
+
 void insertPrePackedLinearOp(
     Block* b,
     std::unordered_set<Node*>& aten_linear,
@@ -101,75 +197,15 @@ void insertPrePackedLinearOp(
     for (Block* block : n->blocks()) {
       insertPrePackedLinearOp(block, aten_linear, use_mkl_sgemm);
     }
-    if (n->kind() != aten::linear)
-      continue;
-    WithInsertPoint guard(n);
-    auto graph = n->owningGraph();
-    auto input_size_option =
-        n->inputs().at(0)->type()->cast<TensorType>()->sizes().concrete_sizes();
-    if (!(input_size_option.has_value() &&
-          input_size_option.value().size() >= 2)) {
-      continue;
-    }
-    auto input_size = input_size_option.value();
-    int64_t b_size = std::accumulate(
-                         input_size.begin(),
-                         input_size.end(),
-                         1,
-                         std::multiplies<double>()) /
-        input_size[input_size.size() - 1];
-    IValue batch_size_value(b_size);
-    auto batch_size = graph->insertConstant(batch_size_value);
-    auto tt = n->inputs().at(1)->type()->cast<TensorType>();
-    auto weight_size_option = tt->sizes().concrete_sizes();
-    if (!(weight_size_option.has_value() &&
-          weight_size_option.value().size() == 2)) {
-      continue;
-    }
-    auto weight_dtype_option = tt->scalarType();
-    if (!(weight_dtype_option.has_value() &&
-              (weight_dtype_option.value() == at::ScalarType::BFloat16) &&
-              ideep::has_bf16_type_support() ||
-          aten_linear.find(n) == aten_linear.end())) {
+    if (n->kind() == aten::linear) {
+      replaceAtenLinearWithPrepackNode(n, aten_linear, use_mkl_sgemm);
+    } else if (
+        n->kind() == Symbol::fromQualString("torch_ipex::ipex_linear") ||
+        n->kind() == Symbol::fromQualString("torch_ipex::ipex_MKLSGEMM")) {
+      replaceIpexLinearWithLinearRunNode(n);
+    } else {
       continue;
     }
-    auto weight_size = weight_size_option.value();
-
-    // Note that once creating a graph node, make sure it is also inserted into
-    // the graph, for: PyTorch (when disabled TE) has a check on the graph node,
-    // pointing out that every mutable value in the system has a corresponding
-    // element. So if creating a graph node but not inserted, it will not pass
-    // the check since its graph element is not initialized. Details please
-    // refer to
-    // https://github.com/pytorch/pytorch/blob/master/torch/csrc/jit/ir/alias_analysis.cpp#L1956
-    auto use_mkl_sgemm_ = use_mkl_sgemm &&
-        weight_dtype_option.value() != at::ScalarType::BFloat16;
-    auto prepack_node = graph->create(
-        use_mkl_sgemm_
-            ? Symbol::fromQualString("ipex_prepack::mkl_sgemm_prepack")
-            : Symbol::fromQualString("ipex_prepack::linear_prepack"),
-        1);
-    for (auto i = 1; i < n->inputs().size(); ++i) {
-      Value* v = n->inputs().at(i);
-      prepack_node->addInput(v);
-    }
-    prepack_node->addInput(batch_size);
-    prepack_node->output()->setType(
-        use_mkl_sgemm_
-            ? getCustomClass(
-                  "__torch__.torch.classes.ipex_prepack.MKLOpContext")
-            : getCustomClass(
-                  "__torch__.torch.classes.ipex_prepack.LinearOpContext"));
-    graph->insertNode(prepack_node);
-    auto prepack_linear = graph->insertNode(graph->create(
-        use_mkl_sgemm_ ? Symbol::fromQualString("ipex_prepack::mkl_sgemm_run")
-                       : Symbol::fromQualString("ipex_prepack::linear_run"),
-        1));
-    prepack_linear->addInput(n->inputs().at(0));
-    prepack_linear->addInput(prepack_node->output());
-    prepack_linear->output()->setType(n->output()->type()->cast<TensorType>());
-    auto v = n->outputs().at(0);
-    n->output()->replaceAllUsesWith(prepack_linear->output());
   }
   EliminateDeadCode(b);
 }
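
For reference, the batch_size constant handed to the prepack node above is the product of all input dims divided by the last dim, i.e. the number of rows the packed 2-D GEMM will see. A quick hypothetical example:

    import math

    input_size = [8, 32, 768]                        # hypothetical [batch, seq, features] shape
    b_size = math.prod(input_size) // input_size[-1]
    print(b_size)                                    # 256 = 8 * 32 rows for the packed linear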

intel_extension_for_pytorch/csrc/cpu/Module.cpp

Lines changed: 10 additions & 0 deletions
@@ -141,6 +141,16 @@ void InitIpexModuleBindings(py::module m) {
     return AutoOptConfig::singleton().get_jit_fuse();
   });
 
+  m.def("enable_jit_linear_repack", []() {
+    AutoOptConfig::singleton().set_jit_repack_for_linear(true);
+  });
+  m.def("disable_jit_linear_repack", []() {
+    AutoOptConfig::singleton().set_jit_repack_for_linear(false);
+  });
+  m.def("get_jit_linear_repack", []() {
+    return AutoOptConfig::singleton().get_jit_repack_for_linear();
+  });
+
   // BF32
   py::enum_<FP32MathMode>(m, "FP32MathMode")
       .value("FP32", FP32MathMode::FP32)
