intel
diff --git a/nlp_toolkit/backends/neural_engine/SparseLib/include/jit_domain/jit_spmm_vnni.hpp b/nlp_toolkit/backends/neural_engine/SparseLib/include/jit_domain/jit_spmm_vnni.hpp
Lines changed: 11 additions & 16 deletions
diff --git a/nlp_toolkit/backends/neural_engine/SparseLib/include/kernels/sparse_data.hpp b/nlp_toolkit/backends/neural_engine/SparseLib/include/kernels/sparse_data.hpp
Lines changed: 1 addition & 13 deletions
diff --git a/nlp_toolkit/backends/neural_engine/SparseLib/include/kernels/spmm_types.hpp b/nlp_toolkit/backends/neural_engine/SparseLib/include/kernels/spmm_types.hpp
Lines changed: 8 additions & 8 deletions
diff --git a/nlp_toolkit/backends/neural_engine/SparseLib/include/utils.hpp b/nlp_toolkit/backends/neural_engine/SparseLib/include/utils.hpp
Lines changed: 2 additions & 0 deletions
@@ -32,16 +32,15 @@ namespace jd {
  */
 class jit_spmm_vnni_t : public jit_generator {
  public:
-  explicit jit_spmm_vnni_t(const ssd::flat_param_t& param)
-      : jit_generator(), param_(param), csrp_(param_.sparse_ptr) {}
+  explicit jit_spmm_vnni_t(const ssd::flat_param_t& param);
   virtual ~jit_spmm_vnni_t() {}
 
  public:
   const void* sequence_vals() const { return seq_vals_.data(); }
 
  private:
   ssd::flat_param_t param_;
-  csrp_data_t<int8_t>* csrp_;
+  bsr_data_t<int8_t>* bsr_;
   std::vector<int8_t> seq_vals_;
 
  private:
@@ -54,30 +53,26 @@ class jit_spmm_vnni_t : public jit_generator {
   Xbyak::Zmm dst_tile_Vmm(int i, int j);  // Reg alloc of DST tile. 2D shape=(TH,TW), stride=(TW,1)
   void params_alias(const ssd::flat_param_t& param);
   void read_params();
-  void load_bias(const std::vector<int64_t>& m_indices);
+  void load_bias(int64_t m_start);
   void load_dense(const std::vector<int64_t>& k_indices);
-  void load_sparse();
+  void load_sparse(const int8_t* bsr_data, int64_t kp_lo, int64_t kp_hi);
   void tile_product(int tile_height, int tile_width);
-  void handle_dst_buffer_init(int kb_idx, const std::vector<int64_t>& m_indices);
-  void handle_dst_buffer_epilogue(int kb_idx, const std::vector<int64_t>& m_indices);
+  void handle_dst_buffer_init(int kb_idx, int64_t m_start);
+  void handle_dst_buffer_epilogue(int kb_idx, int64_t m_start);
   void mul_scale(int i);
   void move_out(int i, int j, int row_idx, int bytes = 1);
-  std::unordered_map<int64_t, std::vector<int64_t>> get_idx_balanced(const std::vector<int64_t>& m_indices,
+  std::unordered_map<int64_t, std::vector<int64_t>> get_idx_balanced(int64_t m_start,
                                                                      const std::vector<int64_t>& sparse_indptr,
                                                                      const std::vector<int64_t>& sparse_indices, int lo,
                                                                      int hi);
-  std::unordered_map<int64_t, std::vector<int8_t>> get_val_balanced(const std::vector<int64_t>& m_indices,
+  std::unordered_map<int64_t, std::vector<int8_t>> get_val_balanced(int64_t m_start,
                                                                     const std::vector<int64_t>& sparse_indptr,
                                                                     const std::vector<int64_t>& sparse_indices, int lo,
                                                                     int hi, const std::vector<int8_t>& sparse_inddata);
-  void repeat_THx4xTW_matmal(const std::vector<int64_t>& m_indices,
-                             const std::unordered_map<int64_t, std::vector<int64_t>>& k_indices_map,
-                             const std::unordered_map<int64_t, std::vector<int8_t>>& k_inddata_map);
+  void repeat_THx4xTW_matmal(int64_t imb);
   void clear_dst_tile();
-  void load_intermediate_dst(const std::vector<int64_t>& m_indices);
-  void store_intermediate_dst(const std::vector<int64_t>& m_indices);
-  void save_sequence_vals(const std::vector<int64_t>& m_indices,
-                          const std::unordered_map<int64_t, std::vector<int8_t>>& k_inddata_map, int pos1, int pos2);
+  void load_intermediate_dst(int64_t m_start);
+  void store_intermediate_dst(int64_t m_start);
   void gen_sub_function();
 
  private:
 
@@ -145,23 +145,11 @@ static constexpr int ADJ = 4;  // 4 is that "Multiply groups of 4 adjacent pairs
 
 inline int align_nnz(const int& a_nnz) { return ceil_div(a_nnz, ADJ) * ADJ; }
 
-template <typename T>
-sparse_data_t<T>* reorder_to(int rows, int cols, const void* uncoded_ptr, const format_type& dst_encode_fmt);
-
 template <typename T, dim_t group>
 std::vector<bsr_data_t<T>*>* reorder_to_bsr_amx(dim_t rows, dim_t cols, dim_t micro_rows, const void* uncoded_ptr);
 
 template <typename T>
-uint64_t get_uncoded_nnz(int rows, int cols, const T* uncoded_data, int line_idx = -1);
-
-template <typename T>
-sparse_data_t<T> tocsr(int rows, int cols, const T* uncoded_data);
-
-template <typename T>
-std::pair<std::vector<int64_t>, std::vector<int64_t>> csr_with_permute(int rows, int cols, const T* uncoded_data);
-
-template <typename T>
-bsr_data_t<T> tobsr(dim_t rows, dim_t cols, dim_t blocksize[2], const T* uncoded_data);
+bsr_data_t<T> tobsr(dim_t rows, dim_t cols, dim_t blk_row, dim_t blk_col, const T* uncoded_data);
 
 template <typename T, dim_t group>
 bsr_data_t<T> to_bsr_amx(dim_t rows, dim_t cols, dim_t blk_row, dim_t blk_col, const T* uncoded_data);
 
@@ -25,6 +25,8 @@ template <typename T>
 class csrp_data_t;
 template <typename T>
 class bsc_data_t;
+template <typename T>
+class bsr_data_t;
 namespace ssd {
 /**
  * @brief tensors index configuration of this kernel.
@@ -56,15 +58,15 @@ struct flat_param_t {
   bool append_sum;
   data_type output_type;
   sparse_scheme scheme;
-  // optimization config of JIT machine code.
   std::vector<int64_t> mkn_blocks;
-  std::vector<int64_t> tile_shape;
+  std::vector<int64_t> tile_shape;  // 2D shape of the microkernel tile, in units of zmm registers
   bool sub_func;
-  int64_t start;
-  int64_t end;
+  int64_t im_start;  // start m-index of the dst region to compute
+  int64_t im_end;    // end m-index of the dst region to compute; TODO confirm inclusive vs. exclusive
+  int64_t in_start;  // start n-index of the dst region to compute
+  int64_t in_end;    // end n-index of the dst region to compute; TODO confirm inclusive vs. exclusive
   // sparse weight related
-  csrp_data_t<int8_t>* sparse_ptr;
-  std::vector<int64_t> avg_group;
+  bsr_data_t<int8_t>* sparse_ptr;  // NOTE(review): raw pointer; ownership/lifetime not visible here -- confirm
 };
 
 /**
@@ -76,8 +78,6 @@ struct flat_data_t {
   const void* ptr_bias;      // bias(M, 1).
   void* ptr_dst;             // dst(M, N).
   const void* ptr_scales;
-  int64_t start;
-  int64_t end;
 };
 
 /**
 
@@ -63,6 +63,8 @@ T str_to_num(const std::string& s);
 template <typename T>
 std::vector<T> split_str(const std::string& s, const char& delim = ',');
 
+std::string join_str(const std::vector<std::string>& ss, const std::string& delim = ",");
+
 bool init_amx();
 
 /**