
Commit 3094f34

Add ipex::einsum_add (#674)
Enable einsum+add fusion via oneDNN binary post-ops.
1 parent c1fcf7a commit 3094f34
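In user terms, the commit fuses a two-operand torch.einsum whose result feeds an add. A minimal sketch of triggering and checking the fusion, following the flow of the new tests (ProjectionWithBias is an illustrative name, not part of the extension):

import torch
import torch.nn as nn
import intel_extension_for_pytorch as ipex

# Two-operand einsum followed by an add: the graph shape this commit
# rewrites into a single ipex::einsum_binary node.
class ProjectionWithBias(nn.Module):
    def forward(self, x, weight, bias):
        return torch.einsum("bsh,ho->bso", x, weight) + bias

model = ipex.optimize(ProjectionWithBias().eval(), dtype=torch.float32)
x, w, b = torch.randn(2, 3, 768), torch.randn(768, 2304), torch.randn(2304)
with torch.no_grad():
    traced = torch.jit.freeze(torch.jit.trace(model, (x, w, b)))
    traced(x, w, b)  # warm-up runs let the profiling executor apply fusions
    traced(x, w, b)
    graph = traced.graph_for(x, w, b)
print(any(n.kind() == "ipex::einsum_binary" for n in graph.nodes()))  # expect True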

8 files changed: +912 -0 lines changed

intel_extension_for_pytorch/csrc/jit/cpu/kernels/Einsum.cpp

Lines changed: 685 additions & 0 deletions
Large diffs are not rendered by default.
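The 685-line kernel itself is not rendered here. Per the commit message, it lowers a two-operand einsum to oneDNN matmul/batched-matmul calls and folds the trailing add in as a binary post-op. As background, the kind of reduction presumably involved (pure PyTorch, for orientation only):

import torch

# A two-operand einsum such as "bnqd,bnkd->bnqk" is expressible as a
# batched matmul, which is what makes a oneDNN matmul + binary post-op
# lowering possible.
q = torch.randn(2, 4, 128, 64)
k = torch.randn(2, 4, 128, 64)
via_einsum = torch.einsum("bnqd,bnkd->bnqk", q, k)
via_matmul = torch.matmul(q, k.transpose(-1, -2))
print(torch.allclose(via_einsum, via_matmul, atol=1e-6))  # True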
intel_extension_for_pytorch/csrc/jit/cpu/kernels/Einsum.h

Lines changed: 34 additions & 0 deletions

@@ -0,0 +1,34 @@
+#pragma once
+
+#include <ATen/Tensor.h>
+
+#include <c10/core/Scalar.h>
+#include <torch/csrc/jit/runtime/custom_operator.h>
+
+#include "csrc/cpu/ideep/ideep.hpp"
+
+namespace torch {
+namespace jit {
+
+// XXX: PyTorch does not support nested namespaces,
+// and alias analysis does not work for namespaces other than aten,
+// so we fake some op namespaces to work around that.
+namespace ipex {
+static auto einsum_binary = Symbol::fromQualString("ipex::einsum_binary");
+
+} // namespace ipex
+
+} // namespace jit
+} // namespace torch
+
+namespace torch_ipex {
+namespace cpu {
+
+at::Tensor einsum_binary(
+    c10::string_view,
+    const c10::List<at::Tensor>& operands,
+    const at::Tensor& input,
+    const c10::Scalar& alpha);
+
+} // namespace cpu
+} // namespace torch_ipex
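Going by this declaration and the schema registered in register_dnnl_jit_ops.cpp below, the fused op takes the equation, the two einsum operands, the tensor to add, and an alpha that scales it with aten::add semantics (x + alpha * add_arg). A reference sketch of the expected numerics (einsum_binary_reference is an illustrative name):

import torch

# Expected numerics of the fused op, expressed with stock ops.
def einsum_binary_reference(equation, operands, add_arg, alpha=1.0):
    return torch.einsum(equation, *operands) + alpha * add_arg

out = einsum_binary_reference(
    "bsh,ho->bso",
    [torch.randn(2, 3, 768), torch.randn(768, 2304)],
    torch.randn(2304))
print(out.shape)  # torch.Size([2, 3, 2304])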

intel_extension_for_pytorch/csrc/jit/cpu/kernels/Matmul.h

Lines changed: 8 additions & 0 deletions
@@ -25,6 +25,14 @@ static auto bmm_add = Symbol::fromQualString("ipex::bmm_add");
 namespace torch_ipex {
 namespace cpu {
 
+at::Tensor bmm_impl(
+    const at::Tensor& tensor1,
+    const at::Tensor& tensor2,
+    at::Tensor out,
+    const ideep::attr_t& attr,
+    const std::vector<ideep::tensor>& postop_tensors,
+    const float dst_coeff);
+
 at::Tensor dil_matmul_div(
     const at::Tensor& left,
     const at::Tensor& right,
intel_extension_for_pytorch/csrc/jit/cpu/passes/graph_rewrite.h

Lines changed: 1 addition & 0 deletions
@@ -52,6 +52,7 @@ void FuseConcatBnRelu(std::shared_ptr<Graph>& graph);
 
 void insertPrePackedConvTranspose2dOp(std::shared_ptr<Graph>& graph);
 
+void FusedEinsumPost(std::shared_ptr<Graph>& graph);
 } // namespace graph_rewrite
 } // namespace jit
 } // namespace torch
Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
+#include "graph_rewrite.h"
+#include "graph_rewrite_utils.h"
+
+#include <ATen/code_template.h>
+
+namespace torch {
+namespace jit {
+namespace graph_rewrite {
+
+using namespace at::jit;
+
+auto ipex_einsum_filter =
+    [](const Match& match,
+       const std::unordered_map<std::string, Value*>& vmap) {
+      const auto& match_vmap = match.values_map;
+      auto equation =
+          getIValue("equation", match_vmap, vmap).value().toStringView();
+      int num_ops = std::count(equation.begin(), equation.end(), ',') + 1;
+      if (num_ops != 2)
+        return false; // only handle equations with exactly two operands
+      return true;
+    };
+
+void FusedEinsumPost(std::shared_ptr<Graph>& graph) {
+  SubgraphRewriter rewriter_einsum_binary;
+  std::array<std::string, 2> binarys = {"add", "add_"};
+  auto aten_einsum_binary = CodeTemplate(R"(
+     graph(%equation, %inputs, %add_arg, %alpha):
+        %x = aten::einsum(%equation, %inputs)
+        %res = aten::${binary}(%x, %add_arg, %alpha)
+        return (%res))");
+  std::string fused_einsum_binary = R"(
+    graph(%equation, %inputs, %add_arg, %alpha):
+        %res = ipex::einsum_binary(%equation, %inputs, %add_arg, %alpha)
+        return (%res))";
+
+  for (const auto& binary : binarys) {
+    TemplateEnv env;
+    env.s("binary", binary);
+    rewriter_einsum_binary.RegisterRewritePattern(
+        aten_einsum_binary.format(env), fused_einsum_binary);
+  }
+  rewriter_einsum_binary.runOnGraph(graph, ipex_einsum_filter);
+}
+
+} // namespace graph_rewrite
+} // namespace jit
+} // namespace torch
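The filter is the gatekeeper here: the rewriter registers patterns for both aten::add and aten::add_, and the filter then restricts matches to equations with exactly two operands by counting commas. The same check, transcribed to Python:

# An einsum equation has one more operand than it has commas.
def is_two_operand(equation: str) -> bool:
    return equation.count(",") + 1 == 2

print(is_two_operand("bsh,ho->bso"))  # True  -> eligible for fusion
print(is_two_operand("ii->i"))        # False -> one operand, skipped
print(is_two_operand("a,b,c->abc"))   # False -> three operands, skipped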

intel_extension_for_pytorch/csrc/jit/cpu/passes/register_dnnl_jit_ops.cpp

Lines changed: 17 additions & 0 deletions
@@ -7,6 +7,7 @@
 #include "csrc/aten/cpu/ConcatBnRelu.h"
 #include "csrc/jit/cpu/kernels/ConvPacked.h"
 #include "csrc/jit/cpu/kernels/ConvTransposePacked.h"
+#include "csrc/jit/cpu/kernels/Einsum.h"
 #include "csrc/jit/cpu/kernels/Embeddingbag.h"
 #include "csrc/jit/cpu/kernels/Interaction.h"
 #include "csrc/jit/cpu/kernels/LinearPacked.h"
@@ -832,6 +833,22 @@ RegisterOperators op({
           };
         },
         aliasAnalysisFromSchema()),
+    Operator(
+        "ipex::einsum_binary(str equation, Tensor[] tensors, Tensor add_arg, Scalar alpha) -> Tensor",
+        [](const Node* node) -> Operation {
+          return [](Stack* stack) {
+            auto result = einsum_binary(
+                (std::move(peek(stack, 0, 4))).toStringView(),
+                (std::move(peek(stack, 1, 4))).toTensorList(),
+                (std::move(peek(stack, 2, 4))).toTensor(),
+                (std::move(peek(stack, 3, 4))).toScalar());
+
+            drop(stack, 4);
+            pack(stack, std::move(result));
+            return 0;
+          };
+        },
+        aliasAnalysisFromSchema()),
 
 });
 } // namespace jit
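The registered operation follows the TorchScript stack calling convention: peek(stack, i, 4) reads argument i of the top four values, drop pops all four, and pack pushes the result. A toy Python model of that protocol (run_einsum_binary is illustrative, not a real API):

import torch

def run_einsum_binary(stack):
    equation, tensors, add_arg, alpha = stack[-4:]  # peek(stack, i, 4)
    result = torch.einsum(equation, *tensors) + alpha * add_arg
    del stack[-4:]                                  # drop(stack, 4)
    stack.append(result)                            # pack(stack, result)

stack = ["mc,cn->mn", [torch.randn(2, 3), torch.randn(3, 4)], torch.randn(4), 1.0]
run_einsum_binary(stack)
print(stack[0].shape)  # torch.Size([2, 4])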

intel_extension_for_pytorch/csrc/jit/fusion_pass.cpp

Lines changed: 3 additions & 0 deletions
@@ -352,6 +352,9 @@ void IPEXFusionPass(std::shared_ptr<Graph>& graph) {
   // concat multi-linear with same input
   FrozenConcatLinear(graph);
 
+  // ipex einsum
+  graph_rewrite::FusedEinsumPost(graph);
+
   // Fuse the scores calculation (dim + matmul + (add)? + softmax) for
   // Multi-Head-Attention
   graph_rewrite::FuseMHAScoreCalc(graph);

tests/cpu/test_jit.py

Lines changed: 116 additions & 0 deletions
@@ -786,6 +786,27 @@ def forward(self, x):
         y3 += x
         return y3.relu_()
 
+class EinsumAdd(nn.Module):
+    def __init__(self, equation):
+        super(EinsumAdd, self).__init__()
+        self.equation = equation
+    def forward(self, input1, input2, bias):
+        return torch.einsum(self.equation, input1, input2) + bias
+
+class EinsumAddInplace(nn.Module):
+    def __init__(self, equation):
+        super(EinsumAddInplace, self).__init__()
+        self.equation = equation
+    def forward(self, input1, input2, bias):
+        return torch.einsum(self.equation, input1, input2).add_(bias)
+
+class EinsumAddInplaceV1(nn.Module):
+    def __init__(self, equation):
+        super(EinsumAddInplaceV1, self).__init__()
+        self.equation = equation
+    def forward(self, input1, input2, bias):
+        return bias.add_(torch.einsum(self.equation, input1, input2))
+
 class Tester(TestCase):
 
     def _test_output(self, model, x, kind_in_graph=None, kind_not_in_graph=None, prec=None, levels=['O0','O1'], use_channels_last=[True, False]):
@@ -2466,6 +2487,101 @@ def test_bmm_add(self):
         expected = torch.baddbmm(M, batch1, batch2)
         self.assertTrue(torch.allclose(out, expected))
 
+    def test_einsum_add(self):
+        def _test_fp32(model_test, input1, input2, bias, kind_in_graph='ipex::einsum_binary', prec=1e-3):
+            model = copy.deepcopy(model_test)
+            model = model.eval()
+            model = ipex.optimize(model, dtype=torch.float32)
+            with torch.no_grad():
+                res_ref = model(input1, input2, bias)
+                tr_model = torch.jit.trace(model, (input1, input2, bias))
+                tr_model = torch.jit.freeze(tr_model)
+                tr_model(input1, input2, bias)
+                tr_model(input1, input2, bias)
+                trace_graph = tr_model.graph_for(input1, input2, bias)
+                res_jit = tr_model(input1, input2, bias)
+                self.assertEqual(res_ref, res_jit, prec)
+                self.assertTrue(any(n.kind() == kind_in_graph for n in trace_graph.nodes()))
+
+        bias = torch.randn(3, 2304)
+        input1 = torch.randn(2, 3, 768)
+        input2 = torch.randn(768, 2304)
+        model_v1 = EinsumAdd('bsh,ho->bso')
+        _test_fp32(model_v1, input1, input2, bias)
+
+        bias = torch.randn(2304)
+        input1 = torch.randn(4, 3, 768)
+        input2 = torch.randn(768, 2304)
+        model_v1 = EinsumAddInplace('bsh,ho->bso')
+        _test_fp32(model_v1, input1, input2, bias)
+
+        bias = torch.randn(4, 3, 2304)
+        input1 = torch.randn(4, 3, 768)
+        input2 = torch.randn(768, 2304)
+        model_v1 = EinsumAddInplaceV1('bsh,ho->bso')
+        _test_fp32(model_v1, input1, input2, bias, kind_in_graph='aten::einsum')
+
+        bias1 = torch.randn(2, 1, 128, 128)
+        input3 = torch.randn(2, 4, 128, 768)
+        input4 = torch.randn(2, 4, 128, 768)
+        model_v2 = EinsumAdd("bnqd,bnkd->bnqk")
+        _test_fp32(model_v2, input3, input4, bias1)
+
+        bias1 = torch.randn(8, 1, 1, 128)
+        input3 = torch.randn(8, 4, 128, 768)
+        input4 = torch.randn(8, 4, 128, 768)
+        model_v2 = EinsumAdd("bnqd,bnkd->bnqk")
+        _test_fp32(model_v2, input3, input4, bias1)
+
+        bias1 = torch.randn(2, 4, 128, 768)
+        input1 = torch.randn(2, 4, 128, 768)
+        input2 = torch.randn(4, 768, 768)
+        model_v2 = EinsumAdd("balh,ahr->balr")
+        _test_fp32(model_v2, input1, input2, bias1)
+
+        bias1 = torch.randn(768)
+        input1 = torch.randn(128, 1024)
+        input2 = torch.randn(768, 1024)
+        model_v2 = EinsumAdd("mc,nc->mn")
+        _test_fp32(model_v2, input1, input2, bias1)
+
+        bias1 = torch.randn(768)
+        input1 = torch.randn(128, 1024)
+        input2 = torch.randn(1024, 768)
+        model_v2 = EinsumAdd("mc,cn->mn")
+        _test_fp32(model_v2, input1, input2, bias1)
+
+        bias1 = torch.randn(1024)
+        input1 = torch.randn(1024, 1024)
+        input2 = torch.randn(1024, 1024)
+        model_v2 = EinsumAdd("mc,cn->nm")
+        _test_fp32(model_v2, input1, input2, bias1)
+
+        bias1 = torch.randn(768)
+        input1 = torch.randn(2, 128, 1024)
+        input2 = torch.randn(1024, 23, 768)
+        model_v2 = EinsumAdd("bqc,chv->bqhv")
+        _test_fp32(model_v2, input1, input2, bias1)
+
+        bias = torch.randn(768)
+        input1 = torch.randn(2, 128, 16, 64)
+        input2 = torch.randn(16, 64, 768)
+        model = EinsumAdd("bqhc,hco->bqo")
+        _test_fp32(model, input1, input2, bias)
+
+        bias = torch.randn(8)
+        input1 = torch.randn(8)
+        input2 = torch.randn(8)
+        model = EinsumAdd("i,i->")
+        _test_fp32(model, input1, input2, bias)
+
+        # with a zero-sized first dim, the output of torch.einsum("ij,j") is tensor([])
+        bias = torch.randn(1)
+        input1 = torch.randn(0, 3)
+        input2 = torch.randn(3)
+        model = EinsumAdd("ij,j")
+        _test_fp32(model, input1, input2, bias)
+
     def test_ipex_softmax(self):
         self._test_output(
             AtenSoftmaxRepalce(),
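The three module variants pin down when the rewrite may fire: EinsumAdd and EinsumAddInplace both match the registered patterns because the einsum result is the self argument of the add, while EinsumAddInplaceV1 must not fuse, since bias.add_(...) mutates a graph input and the einsum output appears only as the other operand. Restating the three cases (shapes as in the tests above):

import torch

eq = "bsh,ho->bso"
a, b = torch.randn(4, 3, 768), torch.randn(768, 2304)
bias = torch.randn(4, 3, 2304)

out1 = torch.einsum(eq, a, b) + bias      # fuses: einsum feeds aten::add
out2 = torch.einsum(eq, a, b).add_(bias)  # fuses: einsum result is add_'s self
out3 = bias.add_(torch.einsum(eq, a, b))  # stays aten::einsum: bias, a graph
                                          # input, is mutated in place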
