Skip to content

Commit 573a619

Browse files
authored
Transpose matmul kernel (#278)
* vnni kernel comment update * transposed matmul kernel * fix heap-buffer-overflow * dense_matmul => transpose_matmul * add benchmark * add ci * update ci cases * use preamble/postamble * make gcc happy * chore * use default value of struc def * make cpplint happy * more benchmark cases * avoid memleak of benchmark * make cpplint happy * set default value of raw pointers * typo * fix copyright
1 parent fbbc221 commit 573a619

39 files changed

+1499
-37
lines changed

nlp_toolkit/backends/neural_engine/SparseLib/include/interface.hpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,13 @@ class sparse_matmul_desc : public kernel_desc_proxy {
101101
virtual ~sparse_matmul_desc() {}
102102
};
103103

104+
class transpose_matmul_desc : public kernel_desc_proxy {
105+
public:
106+
transpose_matmul_desc() {}
107+
explicit transpose_matmul_desc(const operator_desc& op_desc) : kernel_desc_proxy(op_desc) {}
108+
virtual ~transpose_matmul_desc() {}
109+
};
110+
104111
class postop_desc : public kernel_desc_proxy {
105112
public:
106113
postop_desc() {}
@@ -131,6 +138,14 @@ class sparse_matmul : public kernel_proxy {
131138
explicit sparse_matmul(const kernel_desc_proxy& kdp) : kernel_proxy(kdp) {}
132139
virtual ~sparse_matmul() {}
133140
};
141+
142+
class transpose_matmul : public kernel_proxy {
143+
public:
144+
transpose_matmul() {}
145+
explicit transpose_matmul(const kernel_desc_proxy& kdp) : kernel_proxy(kdp) {}
146+
virtual ~transpose_matmul() {}
147+
};
148+
134149
class postop : public kernel_proxy {
135150
public:
136151
postop() {}
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
// Copyright (c) 2022 Intel Corporation
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#ifndef ENGINE_SPARSELIB_INCLUDE_JIT_DOMAIN_JIT_MATMUL_AVX512F_P2031_P2013_HPP_
16+
#define ENGINE_SPARSELIB_INCLUDE_JIT_DOMAIN_JIT_MATMUL_AVX512F_P2031_P2013_HPP_
17+
18+
#include "jit_generator.hpp"
19+
#include "kernels/matmul_types.hpp"
20+
#include "utils.hpp"
21+
22+
#define GET_OFF(field) offsetof(ssd::matmul_data_t, field)
23+
24+
namespace jd {
25+
/**
26+
 * @brief jit_matmul_avx512f_p2031_p2013_t calculates this kind of matmul: alpha * src0 x src1 + beta * src2 = dst.
27+
 *        alpha * src0(M, K) x src1(K, N) + beta * src2(M, N) = dst(M, N)
28+
*/
29+
class jit_matmul_avx512f_p2031_p2013_t : public jit_generator {
30+
public:
31+
explicit jit_matmul_avx512f_p2031_p2013_t(const ssd::matmul_param_t& param)
32+
: jit_generator(),
33+
param_(param),
34+
TH_(param.m_tile),
35+
TW_(param.n_tile),
36+
ld_src0(param.M * param.batch * dsize_src0),
37+
ld_src1(param.N * param.batch * dsize_src1),
38+
ld_src2(param.N * dsize_src2),
39+
ld_dst(param.N * dsize_dst),
40+
k_iters(param.K / UNROLL_K) {}
41+
virtual ~jit_matmul_avx512f_p2031_p2013_t() {}
42+
43+
private:
44+
ssd::matmul_param_t param_;
45+
46+
void generate() override;
47+
void calc_THxkxTW();
48+
Xbyak::Zmm TH_Vmm(int i = 0); // Register allocator of load weight. 1D shape=(TH)
49+
Xbyak::Zmm TW_Vmm(int i = 0); // Register allocator of load activation. 1D shape=(TW)
50+
Xbyak::Zmm dst_tile_Vmm(int i = 0, int j = 0); // Reg alloc of DST tile. 2D shape=(TH,TW), stride=(TW,1)
51+
52+
const int TH_; // tile height (along m) in terms of #registers
53+
const int TW_; // tile width (along n) in terms of #registers
54+
static constexpr size_t dsize_src0 = sizeof(decltype(*ssd::matmul_data_t::src0));
55+
static constexpr size_t dsize_src1 = sizeof(decltype(*ssd::matmul_data_t::src1));
56+
static constexpr size_t dsize_src2 = sizeof(decltype(*ssd::matmul_data_t::src2));
57+
static constexpr size_t dsize_dst = sizeof(decltype(*ssd::matmul_data_t::dst));
58+
// leading dimension in #bytes
59+
const int ld_src0, ld_src1, ld_src2, ld_dst;
60+
const int k_iters;
61+
62+
const Xbyak::Zmm& vreg_temp = zmm31;
63+
static constexpr int VREG_NUMS = 32;
64+
static constexpr int USED_VREGS = 1;
65+
static constexpr int UNROLL_K = 8;
66+
static constexpr int BYTES_ZMM = 64;
67+
68+
const Xbyak::Reg64& parambase = rdi;
69+
const Xbyak::Reg64& reg_src0 = rsi;
70+
const Xbyak::Reg64& reg_src1 = rdx;
71+
const Xbyak::Reg64& reg_src2 = rcx;
72+
const Xbyak::Reg64& reg_dst = r8;
73+
const Xbyak::Reg64& reg_src0_end = r9;
74+
const Xbyak::Reg64& reg_src1_end = r10;
75+
const Xbyak::Reg64& reg_iterk = r11;
76+
const Xbyak::Reg64& reg_tmp = rbx;
77+
};
78+
} // namespace jd
79+
#endif  // ENGINE_SPARSELIB_INCLUDE_JIT_DOMAIN_JIT_MATMUL_AVX512F_P2031_P2013_HPP_

nlp_toolkit/backends/neural_engine/SparseLib/include/kernel_hashing.hpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,12 +80,16 @@ class hash_t {
8080
hash_combine(seed, op_attrs["append_sum"]);
8181
hash_combine(seed, op_attrs["sub_func"]);
8282
break;
83-
// todo:remove it.
8483
case kernel_kind::postop:
8584
case kernel_kind::eltwiseop:
8685
case kernel_kind::layernorm_ba:
8786
hash_combine(seed, op_attrs["matrix_shape"]);
8887
break;
88+
case kernel_kind::transpose_matmul:
89+
hash_combine(seed, op_attrs["alpha"]);
90+
hash_combine(seed, op_attrs["beta"]);
91+
hash_combine(seed, op_attrs["m_tile"]);
92+
hash_combine(seed, op_attrs["n_tile"]);
8993
default:
9094
break;
9195
}

nlp_toolkit/backends/neural_engine/SparseLib/include/kernels/eltwiseop.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ class eltwiseop_k_t : public kernel_t {
8888
bool eltwiseop_kernel_create(jit_eltwiseop_t** ker_pp, const ssd::eltwiseop_param_t& param);
8989

9090
private:
91-
jit_eltwiseop_t* jit_kers_;
91+
jit_eltwiseop_t* jit_kers_ = nullptr;
9292
int64_t nthr_;
9393
std::vector<ssd::eltwiseop_data_t*> td;
9494
};
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
// Copyright (c) 2022 Intel Corporation
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#ifndef ENGINE_SPARSELIB_INCLUDE_KERNELS_MATMUL_AVX512F_P2031_P2013_HPP_
16+
#define ENGINE_SPARSELIB_INCLUDE_KERNELS_MATMUL_AVX512F_P2031_P2013_HPP_
17+
18+
#include <glog/logging.h>
19+
#include <memory>
20+
#include <vector>
21+
#include <algorithm>
22+
23+
#include "jit_domain/jit_matmul_avx512f_p2031_p2013.hpp"
24+
#include "kernel.hpp"
25+
#include "kernel_desc.hpp"
26+
27+
namespace jd {
28+
// By convention,
29+
// 1. xxxx_kd_t is the descriptor of a specific derived primitive/kernel.
30+
// 2. xxxx_k_t is a specific derived primitive/kernel.
31+
// 3. jit_xxxx_t is JIT assembly implementation of a specific derived
32+
// primitive/kernel. where, "xxxx" represents an algorithm, such as brgemm,
33+
// GEMM and so on.
34+
class matmul_avx512f_p2031_p2013_k_t;
35+
36+
/**
37+
* @brief a derived kernel descriptor. matmul_param_t is its class member.
38+
*/
39+
class matmul_avx512f_p2031_p2013_kd_t : public kernel_desc_t {
40+
public:
41+
explicit matmul_avx512f_p2031_p2013_kd_t(const jd::operator_desc& op_desc)
42+
: kernel_desc_t(kernel_kind::sparse_matmul), op_desc_(op_desc) {}
43+
virtual ~matmul_avx512f_p2031_p2013_kd_t() {}
44+
45+
bool init() override;
46+
47+
// kernel_desc_t::create_primitive() override.
48+
DECLARE_COMMON_PD_T(matmul_avx512f_p2031_p2013_k_t, matmul_avx512f_p2031_p2013_kd_t);
49+
50+
const jd::operator_desc& operator_desc() const override { return op_desc_; }
51+
const ssd::matmul_param_t& jit_param() const { return jit_param_; }
52+
53+
inline std::vector<dim_t> shape() const {
54+
std::vector<dim_t> result(op_desc_.tensor_descs()[ssd::SRC0].shape());
55+
result.push_back(op_desc_.tensor_descs()[ssd::SRC0].shape().back());
56+
return result;
57+
}
58+
59+
private:
60+
bool matmul_params_init(const jd::operator_desc& op_desc);
61+
62+
jd::operator_desc op_desc_;
63+
ssd::matmul_param_t jit_param_;
64+
};
65+
66+
/**
67+
* @brief a derived kernel. kd_t and jit_domain are its class members.
68+
*/
69+
class matmul_avx512f_p2031_p2013_k_t : public kernel_t {
70+
public:
71+
using kd_t = matmul_avx512f_p2031_p2013_kd_t;
72+
explicit matmul_avx512f_p2031_p2013_k_t(const std::shared_ptr<const kd_t>& kd);
73+
virtual ~matmul_avx512f_p2031_p2013_k_t() {
74+
if (jit_ker_ != nullptr) {
75+
delete jit_ker_;
76+
jit_ker_ = nullptr;
77+
}
78+
}
79+
80+
// Delete move constructor and move operator
81+
matmul_avx512f_p2031_p2013_k_t(matmul_avx512f_p2031_p2013_k_t&& other) = delete;
82+
matmul_avx512f_p2031_p2013_k_t& operator=(matmul_avx512f_p2031_p2013_k_t&& other) = delete;
83+
// Delete copy constructor and copy operator
84+
matmul_avx512f_p2031_p2013_k_t(const matmul_avx512f_p2031_p2013_k_t& other) = delete;
85+
matmul_avx512f_p2031_p2013_k_t& operator=(const matmul_avx512f_p2031_p2013_k_t& other) = delete;
86+
87+
bool init() override;
88+
bool execute(const std::vector<const void*>& rt_data) const override;
89+
const std::shared_ptr<const kd_t> derived_kd() const { return std::static_pointer_cast<const kd_t>(kd_); }
90+
91+
private:
92+
bool matmul_kernel_create(jit_matmul_avx512f_p2031_p2013_t** ker_pp, const ssd::matmul_param_t& param);
93+
94+
private:
95+
jit_matmul_avx512f_p2031_p2013_t* jit_ker_ = nullptr;
96+
const std::vector<std::vector<dim_t>> t_shapes_;
97+
const std::vector<dim_t> src0_perm_shape_; // src0 shape after perm2031
98+
const std::vector<dim_t> src1_perm_shape_; // src1 shape after perm2013
99+
const dim_t M_, K_, N_; // dim of matrix multiplication
100+
const dim_t bs0_; // outer batch size dim
101+
const dim_t bs1_;  // inner batch size dim
102+
};
103+
104+
} // namespace jd
105+
106+
#endif // ENGINE_SPARSELIB_INCLUDE_KERNELS_MATMUL_AVX512F_P2031_P2013_HPP_
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
// Copyright (c) 2022 Intel Corporation
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#ifndef ENGINE_SPARSELIB_INCLUDE_KERNELS_MATMUL_TYPES_HPP_
16+
#define ENGINE_SPARSELIB_INCLUDE_KERNELS_MATMUL_TYPES_HPP_
17+
18+
#include <cstdint>
19+
#include <vector>
20+
21+
#include "param_types.hpp"
22+
#include "../utils.hpp"
23+
namespace jd {
24+
namespace ssd {
25+
/**
26+
* @brief tensors index configuration of this kernel.
27+
* TODO(Yi): potential confliction with indices of other op types
28+
*/
29+
static constexpr int SRC0 = 0;
30+
static constexpr int SRC1 = 1;
31+
static constexpr int DST0 = 2;
32+
static constexpr int SRC2 = 3; // for binary add
33+
34+
struct matmul_param_t {
35+
dim_t M;
36+
dim_t N;
37+
dim_t K;
38+
dim_t batch; // leading dim is `batch` times of its num_cols
39+
float alpha = 1.f, beta = 1.f; // alpha * (src0 * src1) + beta * src_binary_add = dst
40+
dim_t m_tile = 8;
41+
dim_t n_tile = 2;
42+
};
43+
44+
struct matmul_data_t {
45+
const float* src0;
46+
const float* src1;
47+
float* dst;
48+
const float* src2;
49+
};
50+
51+
} // namespace ssd
52+
} // namespace jd
53+
#endif // ENGINE_SPARSELIB_INCLUDE_KERNELS_MATMUL_TYPES_HPP_

nlp_toolkit/backends/neural_engine/SparseLib/include/kernels/sparse_data.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
#include <utility>
1919
#include <vector>
2020

21-
#include "kernels/spmm_types.hpp"
2221
#include "param_types.hpp"
2322
#include "utils.hpp"
2423

nlp_toolkit/backends/neural_engine/SparseLib/include/kernels/spmm_types.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ class bsr_data_t;
3030
namespace ssd {
3131
/**
3232
* @brief tensors index configuration of this kernel.
33+
 * TODO(Yi): potential conflict with indices of other op types
3334
*/
3435
static constexpr int WEI = 0;
3536
static constexpr int SRC = 1;
@@ -52,7 +53,7 @@ enum class subfunc_level : uint8_t {
5253
prod, // use sub-function for tile product
5354
dense_and_prod, // use fused sub-function for dense loading & tile product
5455
load_and_prod, // use fused sub-function for dense loading & sparse loading & tile product
55-
k_dims,  // a whole THxKxTW tile generates a constant size of code
56+
k_dims,            // a whole THxKxTW tile generates a constant size of code
5657
subfunc_level_MAX = k_dims
5758
};
5859

nlp_toolkit/backends/neural_engine/SparseLib/include/param_types.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
#include <map>
2121
namespace jd {
2222
// The main kinds of kernel.
23-
enum class kernel_kind : uint8_t { undef, sparse_matmul, postop, eltwiseop, layernorm_ba };
23+
enum class kernel_kind : uint8_t { undef, sparse_matmul, postop, eltwiseop, layernorm_ba, transpose_matmul };
2424

2525
enum class postop_alg : uint8_t { undef, exp, tanh, gelu, relu, quantize, dequantize, linear, int8_lut };
2626

nlp_toolkit/backends/neural_engine/SparseLib/src/cpu_engine.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ const std::vector<impl_list_item_t>* cpu_engine::get_implementation_list(const o
2525
DECLARE_IMPL_LIST(postop);
2626
DECLARE_IMPL_LIST(eltwiseop);
2727
DECLARE_IMPL_LIST(layernorm_ba);
28+
DECLARE_IMPL_LIST(transpose_matmul);
2829

2930
#undef DECLARE_IMPL_LIST
3031

@@ -38,6 +39,7 @@ const std::vector<impl_list_item_t>* cpu_engine::get_implementation_list(const o
3839
CASE(postop);
3940
CASE(eltwiseop);
4041
CASE(layernorm_ba);
42+
CASE(transpose_matmul);
4143
default:
4244
return &cpu_engine::empty_list;
4345
}

0 commit comments

Comments
 (0)