diff --git a/.github/license/header_exclude_files.txt b/.github/license/header_exclude_files.txt index e69de29bb..9563fbc43 100644 --- a/.github/license/header_exclude_files.txt +++ b/.github/license/header_exclude_files.txt @@ -0,0 +1 @@ +vllm-ext/vllm/extension/ns/__init__.py \ No newline at end of file diff --git a/.github/workflows/workflow_inference.yml b/.github/workflows/workflow_inference.yml index ca57affac..f38243fa4 100644 --- a/.github/workflows/workflow_inference.yml +++ b/.github/workflows/workflow_inference.yml @@ -34,7 +34,7 @@ jobs: name: inference strategy: matrix: - model: [ gpt-j-6b, gpt2, bloom-560m, opt-125m, mpt-7b, mistral-7b-v0.1, mpt-7b-ipex-llm, neural-chat-7b-v3-1, CodeLlama-7b-hf, falcon-7b, starcoder, llama-2-7b-chat-hf, llama-2-7b-chat-hf-vllm, gemma-2b, deepseek-coder-33b-instruct] + model: [ gpt-j-6b, gpt2, bloom-560m, opt-125m, mpt-7b, mistral-7b-v0.1, mpt-7b-ipex-llm, neural-chat-7b-v3-1, CodeLlama-7b-hf, falcon-7b, starcoder, llama-2-7b-chat-hf, llama-2-7b-chat-hf-vllm, llama-2-7b-chat-hf-vllm-ns, gemma-2b, deepseek-coder-33b-instruct] isPR: - ${{inputs.ci_type == 'pr'}} @@ -97,7 +97,11 @@ jobs: run: | TARGET=${{steps.target.outputs.target}} source dev/scripts/ci-functions.sh - strat_ray ${TARGET} + if [[ "$TARGET" == *ns ]]; then + start_ray ${TARGET} 1 + else + start_ray ${TARGET} + fi - name: Run Inference Test run: | diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml index dedeb4154..a24f56397 100644 --- a/.github/workflows/workflow_inference_gaudi2.yml +++ b/.github/workflows/workflow_inference_gaudi2.yml @@ -94,7 +94,7 @@ jobs: # check and remove exited container cid=$(docker ps -a -q --filter "name=${TARGET}") if [[ ! -z "$cid" ]]; then docker rm $cid; fi - docker run -tid --name="${TARGET}" --hostname="${TARGET}-container" --runtime=habana -v /home/yizhong/Model-References:/root/Model-References -v ${{ inputs.code_checkout_path }}:/root/llm-on-ray -v ${{ inputs.model_cache_path }}:/root/.cache/huggingface/hub/ -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --cap-add sys_ptrace --net=host --ipc=host ${TARGET}:habana + docker run -tid --privileged --name="${TARGET}" --hostname="${TARGET}-container" --runtime=habana -v /home/yizhong/Model-References:/root/Model-References -v ${{ inputs.code_checkout_path }}:/root/llm-on-ray -v ${{ inputs.model_cache_path }}:/root/.cache/huggingface/hub/ -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --cap-add sys_ptrace --net=host --ipc=host ${TARGET}:habana - name: Start Ray Cluster run: | TARGET=${{steps.target.outputs.target}} diff --git a/.github/workflows/workflow_test_benchmark.yml b/.github/workflows/workflow_test_benchmark.yml index d737675c0..37c0518e2 100644 --- a/.github/workflows/workflow_test_benchmark.yml +++ b/.github/workflows/workflow_test_benchmark.yml @@ -80,7 +80,7 @@ jobs: # check and remove exited container cid=$(docker ps -a -q --filter "name=${TARGET}") if [[ ! 
-z "$cid" ]]; then docker rm $cid; fi - docker run -tid -v ${{ inputs.model_cache_path }}:/root/.cache/huggingface/hub -v ${{ inputs.code_checkout_path }}:/root/llm-on-ray -e http_proxy=${{ inputs.http_proxy }} -e https_proxy=${{ inputs.https_proxy }} --name="${TARGET}" --hostname="${TARGET}-container" ${TARGET}:latest + docker run -tid --privileged -v ${{ inputs.model_cache_path }}:/root/.cache/huggingface/hub -v ${{ inputs.code_checkout_path }}:/root/llm-on-ray -e http_proxy=${{ inputs.http_proxy }} -e https_proxy=${{ inputs.https_proxy }} --name="${TARGET}" --hostname="${TARGET}-container" ${TARGET}:latest - name: Start Ray Cluster run: | diff --git a/.github/workflows/workflow_tests.yml b/.github/workflows/workflow_tests.yml index eb51d4f0f..c9af35b4c 100644 --- a/.github/workflows/workflow_tests.yml +++ b/.github/workflows/workflow_tests.yml @@ -176,7 +176,7 @@ jobs: run: | TARGET=${{steps.target.outputs.target}} source dev/scripts/ci-functions.sh - strat_ray ${TARGET} + start_ray ${TARGET} - name: Run Tests run: | diff --git a/.gitignore b/.gitignore index 0ed6093cc..757c2b821 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,9 @@ build/lib/ *.json *.txt *.egg-info +.eggs +*.log +*.so +*.ninja_log +build/ +runtime_outs/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5e56e52a6..fd7188b68 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,6 +7,12 @@ repos: hooks: - id: ruff args: [ --fix, --exit-non-zero-on-fix, --ignore=E402, --ignore=E501, --ignore=E731, --ignore=F401] + exclude: | + (?x)^( + examples/inference/vllm/ray-vllm-examples/llm.py| + vllm-ext/vllm/extension/ns/__init__.py| + )$ + # Black needs to be ran after ruff with --fix - repo: https://github.com/psf/black @@ -18,7 +24,18 @@ repos: rev: "v0.981" hooks: - id: mypy - exclude: tests + exclude: | + (?x)^( + tests| + vllm-ext/vllm/extension/ns/model/ns_loader.py| + vllm-ext/vllm/extension/ns/kv_cache/ns_cache.py| + vllm-ext/inference_engine/python/inference_engine/| + vllm-ext/setup.py| + examples/inference/vllm/ray-vllm-examples/llm.py| + llm_on_ray/inference/inference_config.py| + vllm-ext/vllm/extension/ns/ + ) + additional_dependencies: - mypy-extensions - pydantic==1.10.0 diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 2a2a8570a..2eba3aeeb 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -50,7 +50,7 @@ import copy # (prompt str, output str, prompt len, output len, request latency, latencies list) -latency_tracking: List[Tuple[Optional[str], Optional[str], int, int, float, List[float]]] = [] +latency_tracking: List[Tuple[Optional[List[str]], Optional[str], int, int, float, List[float]]] = [] def sample_requests_ShareGPT( @@ -62,7 +62,7 @@ def sample_requests_ShareGPT( min_output_tokens_len: int, max_output_tokens_len: int, max_length: int, -) -> List[Tuple[str, int, int]]: +) -> List[Tuple[List[str], int, int]]: """ Sample requests from a dataset of ShareGPT format. @@ -94,11 +94,11 @@ def sample_requests_ShareGPT( tokenized_dataset = [] for i in range(len(dataset)): output_len = len(completion_token_ids[i]) - tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len)) + tokenized_dataset.append(([prompts[i]], prompt_token_ids[i], output_len)) # Filter out too long sequences. 
- filtered_dataset: List[Tuple[str, int, int]] = [] - for prompt, prompt_token_ids, output_len in tokenized_dataset: + filtered_dataset: List[Tuple[List[str], int, int]] = [] + for prompts, prompt_token_ids, output_len in tokenized_dataset: prompt_len = len(prompt_token_ids) # Prune too short sequences. if (min_input_tokens_len is not None and prompt_len < min_input_tokens_len) or ( @@ -112,7 +112,7 @@ def sample_requests_ShareGPT( continue if max_length is not None and prompt_len + output_len > max_length: continue - filtered_dataset.append((prompt, prompt_len, output_len)) + filtered_dataset.append((prompts, prompt_len, output_len)) # Sample the requests. sampled_requests = random.sample(filtered_dataset, num_requests) @@ -126,7 +126,7 @@ def sample_requests_IPEX( max_new_tokens: int, num_requests: int, tokenizer: PreTrainedTokenizer, -) -> List[Tuple[str, int, int]]: +) -> List[Tuple[List[str], int, int]]: """ Sample requests from a dataset of IPEX format. @@ -149,10 +149,58 @@ def sample_requests_IPEX( raise ValueError(f'Invalid input_tokens to index from dataset "{dataset_path}"!') prompt_len = len(tokenizer(prompt).input_ids) + print("prompt len, ", prompt_len) output_len = prompt_len if not max_new_tokens else max_new_tokens # Duplicate prompt to generate samples - sampled_requests = [(prompt, prompt_len, output_len)] * num_requests + sampled_requests = [([prompt], prompt_len, output_len)] * num_requests + + return sampled_requests + + +def sample_requests_IDC( + dataset_path: str, + max_new_tokens: int, + num_requests: int, + tokenizer: PreTrainedTokenizer, + config: Dict[str, Union[int, float]], +) -> List[Tuple[List[str], int, int]]: + """ + Sample requests from a dataset of IDC format. + + Args: + dataset_path (str): The path to the dataset. + max_new_tokens (int): The maximum number of new tokens. + num_requests (int): The number of requests to sample. + tokenizer (PreTrainedTokenizer): The tokenizer. + config (Dict[str, Union[int, float]]): Sampling config updated in place from the dataset (max_tokens, temperature, top_p, top_k). + + Returns: + List[Tuple[List[str], int, int]]: The sampled requests, each represented as a tuple of (prompts, prompt_len, output_len). + """ + with open(dataset_path) as f: + input = json.load(f) + + if len(input["messages"]) == 2: + prompts = [input["messages"][0]["content"], input["messages"][1]["content"]] + else: + raise ValueError(f'Expected exactly 2 messages in dataset "{dataset_path}"!') + + prompt_len = sum([len(input_id) for input_id in tokenizer(prompts).input_ids]) + print("prompt len, ", prompt_len) + output_len = input["max_tokens"] if "max_tokens" in input else max_new_tokens + + # Duplicate prompt to generate samples + sampled_requests = [(prompts, prompt_len, output_len)] * num_requests + + if "max_tokens" in input: + config["max_new_tokens"] = int(input["max_tokens"]) + if "temperature" in input: + config["temperature"] = float(input["temperature"]) + if "top_p" in input: + config["top_p"] = float(input["top_p"]) + if "top_k" in input: + config["top_k"] = float(input["top_k"]) return sampled_requests @@ -165,7 +213,7 @@ def sample_requests_synthesis( output_len_mean: int, output_len_stddev: int, num_requests: int, -) -> List[Tuple[str, int, int]]: +) -> List[Tuple[List[str], int, int]]: """ Sample requests from random generated prompts. 
@@ -194,14 +242,14 @@ def gen_prompt_ids(prompt_len): # Generte random prompt from tokenizer's vocabulary prompt = tokenizer.decode(gen_prompt_ids(prompt_len), return_tensors="pt") - sampled_requests.append((prompt, prompt_len, output_len)) + sampled_requests.append(([prompt], prompt_len, output_len)) return sampled_requests async def get_request( - input_requests: List[Tuple[str, int, int]], + input_requests: List[Tuple[List[str], int, int]], request_rate: float, -) -> AsyncGenerator[Tuple[str, int, int], None]: +) -> AsyncGenerator[Tuple[List[str], int, int], None]: """ Asynchronously generates requests based on the input_requests and request_rate. @@ -230,7 +278,7 @@ async def get_request( async def send_request( api_url: str, model_name: str, - prompt: str, + prompts: List[str], prompt_len: int, output_len: int, config: dict, @@ -262,16 +310,27 @@ async def send_request( temp_config["max_new_tokens"] = output_len if simple: pload = { - "text": prompt, + "text": prompts[0], "config": temp_config, "stream": track_token_latency, } + if vllm_engine: + pload.update({"ignore_eos": False}) else: + if len(prompts) == 1: + messages = [ + {"role": "user", "content": f"{prompts[0]}"}, + ] + elif len(prompts) == 2: + messages = [ + {"role": "system", "content": f"{prompts[0]}"}, + {"role": "user", "content": f"{prompts[1]}"}, + ] + else: + raise ValueError("number of prompts should be 1 or 2. Actual ", len(prompts)) pload = { "model": model_name, - "messages": [ - {"role": "user", "content": f"{prompt}"}, - ], + "messages": messages, "stream": track_token_latency, "max_tokens": temp_config["max_new_tokens"] if "max_new_tokens" in temp_config @@ -280,15 +339,16 @@ async def send_request( "top_p": temp_config["top_p"] if "top_p" in temp_config else None, } if vllm_engine: - pload.update({"ignore_eos": True}) + pload.update({"ignore_eos": False}) token_latencies_per_request: List[float] = [] - timeout = aiohttp.ClientTimeout(total=3 * 3600) + timeout = aiohttp.ClientTimeout(total=5 * 3600) async with aiohttp.ClientSession(timeout=timeout) as session: while True: async with session.post(api_url, headers=headers, json=pload) as response: chunks = [] + decoded_chunks = [] start_ts = time.perf_counter() @@ -298,8 +358,11 @@ async def send_request( if track_token_latency: token_latencies_per_request.append(latency) start_ts = end_ts - chunks.append(chunk) - print(chunk.decode("utf-8") + "|", end="", flush=True) + decoded_chunk = chunk.decode("utf-8") + if decoded_chunk: + chunks.append(chunk) + decoded_chunks.append(decoded_chunk) + print(decoded_chunk + "|", end="", flush=True) print("Token Latencies:", token_latencies_per_request) # print(len(chunks), len(token_latencies_per_request)) # Decode the response @@ -325,7 +388,16 @@ async def send_request( response_content = chunks[-2].decode("utf-8") response_content = json.loads(response_content.split("data: ")[1]) generate_len = response_content["usage"]["completion_tokens"] - response_text = b"".join(chunks).decode("utf-8") + response_texts = [] + for decoded_chunk in decoded_chunks: + text = decoded_chunk.split("data: ")[1] + if text.startswith("{"): + json_text = json.loads(text) + if ( + "choices" in json_text + and "content" in json_text["choices"][0]["delta"] + ): + response_texts.append(json_text["choices"][0]["delta"]["content"]) else: response_text = b"".join(chunks).decode("utf-8") try: @@ -341,10 +413,12 @@ async def send_request( progress_bar.update() break + if args.track_token_latency: + print("response: ", "".join(response_texts)) 
request_end_time = time.perf_counter() request_latency = request_end_time - request_start_time - prompt_str = prompt if track_input_output else None + prompt_str = prompts if track_input_output else None output_str = response_text if track_input_output else None if generate_len is not None: @@ -363,7 +437,7 @@ async def send_request( async def benchmark( api_url: str, model_name: str, - input_requests: List[Tuple[str, int, int]], + input_requests: List[Tuple[List[str], int, int]], request_rate: float, config: dict, tokenizer: PreTrainedTokenizer, @@ -378,7 +452,7 @@ async def benchmark( Args: api_url (str): The URL of the API. - input_requests (List[Tuple[str, int, int]]): A list of input requests, where each request is a tuple + input_requests (List[Tuple[List[str], int, int]]): A list of input requests, where each request is a tuple containing the prompt, prompt length, and output length. request_rate (float): The rate at which requests should be sent, in requests per second. config (dict): Configuration parameters for sending requests. @@ -437,6 +511,8 @@ def main(args: argparse.Namespace): tokenizer_name_or_path, trust_remote_code=args.trust_remote_code ) + config: Dict[str, Union[int, float]] = {} + if args.dataset_format == "ShareGPT": input_requests = sample_requests_ShareGPT( args.dataset, @@ -466,10 +542,16 @@ def main(args: argparse.Namespace): args.output_len_stddev, args.num_prompts, ) + if args.dataset_format == "IDC": + input_requests = sample_requests_IDC( + args.dataset, + args.max_new_tokens, + args.num_prompts, + tokenizer, + config, + ) - config: Dict[str, Union[int, float]] = {} - - if args.max_new_tokens: + if args.max_new_tokens and "max_new_tokens" not in config: config["max_new_tokens"] = int(args.max_new_tokens) if args.temperature: config["temperature"] = float(args.temperature) @@ -480,7 +562,7 @@ def main(args: argparse.Namespace): config["do_sample"] = args.do_sample # In order to align with vllm test parameters if args.vllm_engine: - config["ignore_eos"] = True + config["ignore_eos"] = False benchmark_start_time = time.perf_counter() asyncio.run( @@ -532,16 +614,21 @@ def main(args: argparse.Namespace): ] ) print(f"Average latency per Token: {avg_per_token_latency:.3f} s") - + first_token_index = 0 if args.simple else 1 + next_token_index = 1 if args.simple else 2 if args.track_token_latency and latency_tracking: avg_first_token_latency = np.mean( - [latencies[0] for _, _, _, _, _, latencies in latency_tracking if latencies != []] + [ + latencies[first_token_index] + for _, _, _, _, _, latencies in latency_tracking + if latencies != [] + ] ) avg_next_token_latency = np.mean( [ - np.mean(latencies[1:]) + np.mean(latencies[next_token_index:]) for _, _, _, _, _, latencies in latency_tracking - if latencies[1:] != [] + if latencies[next_token_index:] != [] ] ) @@ -614,7 +701,7 @@ def main(args: argparse.Namespace): parser.add_argument( "--dataset-format", type=str, - choices=["ShareGPT", "IPEX", "Synthesis"], + choices=["ShareGPT", "IPEX", "Synthesis", "IDC"], required=True, help="Dataset format, should be one of {ShareGPT, IPEX, Synthesis}.", ) diff --git a/benchmarks/calc_stats.py b/benchmarks/calc_stats.py new file mode 100644 index 000000000..de209648c --- /dev/null +++ b/benchmarks/calc_stats.py @@ -0,0 +1,191 @@ +# +# Copyright 2023 The LLM-on-Ray Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import sys +import re +from typing import Dict, List + +if len(sys.argv) < 4: + raise ValueError( + "need arguments, file path, number of expected iterations and expected generated token length" + ) + +file_path = sys.argv[1] +nbr_iter = int(sys.argv[2]) +expected_gen_token_len = int(sys.argv[3]) + +with open(file_path) as f: + lines = f.readlines() + +PAT_NBR_USERS = re.compile(r"Run num_prompts (\d+) (.+)") +PAT_ITER = re.compile(r"Run iter (\d+)") +PAT_ACTUAL_LEN = re.compile( + r"Warning: the actual generated length is (\d+), which is different from the expected output length\((\d+)\)\." +) +PAT_TOTAL_TIME = re.compile(r"Total time: ([^ ]+) s") +PAT_PROMPT_LEN = re.compile(r"Prompt Length \(Min/Med/Max\): (\d+).+") +PAT_REQ_TPT = re.compile(r"Request Throughput \(QPS\): ([^ ]+) requests/s") +PAT_INPUT_TPT = re.compile(r"Input Token Throughput: ([^ ]+) tokens/s") +PAT_OUTPUT_TPT = re.compile(r"output Token Throughput: ([^ ]+) tokens/s") +PAT_REQ_LAT = re.compile(r"Average latency per Request: ([^ ]+) s") +PAT_TOK_LAT = re.compile(r"Average latency per Token: ([^ ]+) s") +PAT_FTOK_LAT = re.compile(r"Average latency for First Tokens: ([^ ]+) s") +PAT_NTOK_LAT = re.compile(r"Average latency for Next Tokens: ([^ ]+) s") + +nbr_users_perf: Dict[int, List[Dict[str, float]]] = {} + +token_lengths: List[int] = [] + +state = 0 +current_nbr_user = -1 +current_iter = -1 + +for no, line in enumerate(lines): + if state == 0: + m = PAT_NBR_USERS.match(line) + if m: + current_nbr_user = int(m.group(1)) + print("collecting number of users (num_prompts): ", current_nbr_user) + nbr_users_perf[current_nbr_user] = [] + state = 1 + elif state == 1: + m = PAT_ITER.match(line) + if m: + current_iter = int(m.group(1)) - 1 + nbr_users_perf[current_nbr_user].append({}) + state = 2 + elif state == 2: + m = PAT_ACTUAL_LEN.match(line) + if m: + metrics = nbr_users_perf[current_nbr_user][current_iter] + print(">>>", line, m.group(1)) + token_lengths.append(int(m.group(1))) + if expected_gen_token_len != int(m.group(2)): + raise ValueError( + "expected token lengths are not equal", expected_gen_token_len, m.group(2) + ) + else: + m = PAT_TOTAL_TIME.match(line) + if m: + metrics = nbr_users_perf[current_nbr_user][current_iter] + full_gen_lens = token_lengths + [512] * (current_nbr_user - len(token_lengths)) + metrics["ACT_GEN_TOKENS"] = float(sum(full_gen_lens)) / current_nbr_user + metrics["TOTAL_TIME"] = float(m.group(1)) + token_lengths = [] + state = 4 + elif state == 3: + m = PAT_TOTAL_TIME.match(line) + if m: + metrics["TOTAL_TIME"] = float(m.group(1)) + state = 4 + elif state == 4: + m = PAT_PROMPT_LEN.match(line) + if m: + metrics["PROMPT_LEN"] = float(m.group(1)) + state = 5 + else: + raise ValueError("invalid line: " + line, no) + elif state == 5: + m = PAT_REQ_TPT.match(line) + if m: + metrics["REQ_TPT"] = float(m.group(1)) + state = 6 + else: + raise ValueError("invalid line: " + line, no) + elif state == 6: + m = PAT_INPUT_TPT.match(line) + if m: + metrics["INPUT_TPT"] = float(m.group(1)) + state = 7 + else: + raise ValueError("invalid line: " + line, no) + elif state == 7: + m = 
PAT_OUTPUT_TPT.match(line) + if m: + metrics["OUTPUT_TPT"] = float(m.group(1)) + state = 8 + else: + raise ValueError("invalid line: " + line, no) + elif state == 8: + m = PAT_REQ_LAT.match(line) + if m: + metrics["REQ_LAT"] = float(m.group(1)) + state = 9 + else: + raise ValueError("invalid line: " + line, no) + elif state == 9: + m = PAT_TOK_LAT.match(line) + if m: + metrics["TOK_LAT"] = float(m.group(1)) + state = 10 + else: + raise ValueError("invalid line: " + line, no) + elif state == 10: + m = PAT_FTOK_LAT.match(line) + if m: + metrics["FTOK_LAT"] = float(m.group(1)) + state = 11 + else: + raise ValueError("invalid line: " + line, no) + elif state == 11: + m = PAT_NTOK_LAT.match(line) + if m: + metrics["NTOK_LAT"] = float(m.group(1)) + if current_iter == nbr_iter - 1: + state = 0 + current_iter = -1 + current_nbr_user = -1 + else: + state = 1 + current_iter = -1 + else: + raise ValueError("invalid line: " + line, no) + +if nbr_users_perf: + print(nbr_users_perf) + for k, values in nbr_users_perf.items(): + print("number of users: ", k) + size = len(values) + if size != nbr_iter: + raise ValueError( + "size should be equal to number of interations, " + + str(size) + + " != " + + str(nbr_iter) + ) + metrics = { + "ACT_GEN_TOKENS": 0.0, + "PROMPT_LEN": 0.0, + "TOTAL_TIME": 0.0, + "REQ_TPT": 0.0, + "INPUT_TPT": 0.0, + "OUTPUT_TPT": 0.0, + "REQ_LAT": 0.0, + "TOK_LAT": 0.0, + "FTOK_LAT": 0.0, + "NTOK_LAT": 0.0, + } + for v in values: + for kk in metrics: + metrics[kk] += v[kk] + for kk, vv in metrics.items(): + metrics[kk] = vv / size + print(metrics) + print("=========================================") + + +else: + raise ValueError("Failed to collect metrics") diff --git a/benchmarks/run_benchmark.sh b/benchmarks/run_benchmark.sh index 3c7118dd7..f2a2ee507 100644 --- a/benchmarks/run_benchmark.sh +++ b/benchmarks/run_benchmark.sh @@ -2,25 +2,46 @@ set -eo pipefail CHOICE=${1} -RUN_MODE=${2} # "test" or "benchmark", where "test" will only use a small part of the dataset +RUN_MODE=${2} # "test", "benchmark" or "benchmark_idc", where "test" will only use a small part of the dataset if [ -z "$CHOICE" ] then echo "Please pass in the value of parameter CHOICE, which can be any subset of 1,2,3,4." fi if [ -z "$RUN_MODE" ] then - echo "Please pass in the value of parameter RUN_MODE, which can be 'test' or 'benchmark'." + echo "Please pass in the value of parameter RUN_MODE, which can be 'test', 'benchmark' or 'benchmark_idc'." 
fi VALUE_INF=2000 -MODEL_ENDPOINT="http://localhost:8000/llama-2-7b-chat-hf" -MODEL_NAME="llama-2-7b-chat-hf" +MAX_NUM_SEQS=$VALUE_INF +DYNAMIC_BATCH_SIZE=0 +PROMPT_TYPE="simple" +if [ "$#" -gt 2 ] +then + MAX_NUM_SEQS=${3} +fi +if [ "$#" -gt 3 ] +then + DYNAMIC_BATCH_SIZE=${4} +fi +if [ "$#" -gt 4 ] +then + PROMPT_TYPE=${5} +fi + +# for simple +#MODEL_ENDPOINT="http://localhost:8000/llama-3-8b-instruct" +# for openai +MODEL_ENDPOINT="http://localhost:8000" +MODEL_NAME="llama-3-8b-instruct" SHELL_FOLDER=$(cd "$(dirname "$0")";pwd) BENCHMARK_SCRIPT=$SHELL_FOLDER"/benchmark_serving.py" -WITH_VLLM_CONFIG_FILE=$SHELL_FOLDER"/../llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml" -WO_VLLM_CONFIG_FILE=$SHELL_FOLDER"/../llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml" +#WITH_VLLM_CONFIG_FILE=$SHELL_FOLDER"/../llm_on_ray/inference/models/vllm/llama-3-8b-instruct-vllm-ns.yaml" +WITH_VLLM_CONFIG_FILE=$SHELL_FOLDER"/../llm_on_ray/inference/models/vllm/llama-3-8b-instruct-vllm.yaml" +WO_VLLM_CONFIG_FILE=$SHELL_FOLDER"/../llm_on_ray/inference/models/llama-3-8b-instruct.yaml" DATASET_PATH=$SHELL_FOLDER"/../dataset" DATASET_SHAREGPT_PATH=$SHELL_FOLDER"/../dataset/ShareGPT_V3_unfiltered_cleaned_split.json" DATASET_IPEX_PATH=$SHELL_FOLDER"/../dataset/prompt.json" +DATASET_IDC_PATH_PREFIX=$SHELL_FOLDER"/../dataset/idc_prompt_" DATASET_BENCHMARK_NUM=1000 DATASET_COMPARE_NUM=128 NUMA_SERVER_COMMAND="" @@ -39,7 +60,7 @@ if [ $RUN_MODE = "test" ] then SAVE_DIR=$SHELL_FOLDER"/results_test" NUMA_CLIENT_COMMAND="" -elif [ $RUN_MODE = "benchmark" ] +elif [ $RUN_MODE = "benchmark" ] || [ $RUN_MODE = "benchmark_idc" ] then SAVE_DIR=$SHELL_FOLDER"/results" NUMA_CLIENT_COMMAND="numactl -N 1 -m 1" @@ -107,19 +128,84 @@ latency_throughput(){ tokens_dir=$choice_dir"/tokens_"$input_tokens_length"_"$output_tokens_length # server - $NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_ongoing_requests $VALUE_INF --max_num_seqs $VALUE_INF + #$numa_server_command llm_on_ray-serve --config_file $with_vllm_config_file --simple --max_concurrent_queries $VALUE_INF --vllm_max_num_seqs $VALUE_INF # client - for i in $(seq 1 $num_iter) + for num_prompts in ${query_num} do - echo "Run iter $i" - iter_dir=$tokens_dir"/iter_"$i - for num_prompts in ${query_num} + max_con_q=$VALUE_INF + if [ ! "$DYNAMIC_BATCH_SIZE" = "0" ] + then + if [ "$num_prompts" -lt "$NUM_REPLICA" ] || [ "$num_prompts" -eq "$NUM_REPLICA" ] + then + max_con_q=1 + else + max_con_q=$((num_prompts/NUM_REPLICA)) + fi + fi + echo "Run num_prompts ${num_prompts} =======================" + echo "deploying model with --max_ongoing_requests $max_con_q --max_num_seqs $MAX_NUM_SEQS ..." 
+ $NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_ongoing_requests $max_con_q --max_num_seqs $MAX_NUM_SEQS + sleep 1 + for i in $(seq 0 $num_iter) do + if [ $i = 0 ]; then + iter_dir="$tokens_dir/warmup" + echo "Run warmup" + else + iter_dir=$tokens_dir"/iter_"$i + echo "Run iter $i" + fi results_dir=$iter_dir"/num_prompts_"$num_prompts - echo "Run num_prompts ${num_prompts}" echo "results_dir: ${results_dir}" - $NUMA_CLIENT_COMMAND python $BENCHMARK_SCRIPT --model-endpoint-base $MODEL_ENDPOINT --model-name $MODEL_NAME --dataset $DATASET_IPEX_PATH --num-prompts $num_prompts --dataset-format IPEX --input-tokens $input_tokens_length --max-new-tokens $output_tokens_length --track-token-latency --vllm-engine --simple --results-dir $results_dir + $NUMA_CLIENT_COMMAND python $BENCHMARK_SCRIPT --model-endpoint-base $MODEL_ENDPOINT --model-name $MODEL_NAME --dataset $DATASET_IPEX_PATH --num-prompts $num_prompts --dataset-format IPEX --input-tokens $input_tokens_length --track-token-latency --max-new-tokens $output_tokens_length --vllm-engine --simple --results-dir $results_dir + done + done + echo "CHOICE 3 generation completed" +} + +latency_throughput_idc(){ + echo "get performance results of llm-on-ray with vllm when responding to IDC requests" + num_iter=${1} + query_num=${2} + output_tokens_length=${3} + choice_dir=${4} + prompt_type=${5} + tokens_dir=$choice_dir"/tokens_"${prompt_type}"_"$output_tokens_length + + # server + #$numa_server_command llm_on_ray-serve --config_file $with_vllm_config_file --simple --max_concurrent_queries $VALUE_INF --vllm_max_num_seqs $VALUE_INF + + # client + for num_prompts in ${query_num} + do + max_con_q=$VALUE_INF + if [ ! "$DYNAMIC_BATCH_SIZE" = "0" ] + then + if [ "$num_prompts" -lt "$NUM_REPLICA" ] || [ "$num_prompts" -eq "$NUM_REPLICA" ] + then + max_con_q=1 + else + max_con_q=$((num_prompts/NUM_REPLICA)) + fi + fi + echo "Run num_prompts ${num_prompts} =======================" + #echo "deploying model with --max_concurrent_queries $max_con_q --vllm_max_num_seqs $MAX_NUM_SEQS ..." + echo "deploying model with --max_ongoing_requests 4 --max_num_seqs $MAX_NUM_SEQS ..." 
+ $NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --max_ongoing_requests 4 --max_num_seqs $MAX_NUM_SEQS + sleep 1 + for i in $(seq 0 $num_iter) + do + if [ $i = 0 ]; then + iter_dir="$tokens_dir/warmup" + echo "Run warmup" + else + iter_dir=$tokens_dir"/iter_"$i + echo "Run iter $i" + fi + results_dir=$iter_dir"/num_prompts_"$num_prompts + echo "results_dir: ${results_dir}" + $NUMA_CLIENT_COMMAND python $BENCHMARK_SCRIPT --model-endpoint-base $MODEL_ENDPOINT --model-name $MODEL_NAME --dataset "${DATASET_IDC_PATH_PREFIX}${prompt_type}.json" --num-prompts $num_prompts --dataset-format IDC --track-token-latency --max-new-tokens $output_tokens_length --vllm-engine --results-dir $results_dir done done echo "CHOICE 3 generation completed" @@ -196,9 +282,9 @@ then for i in "${!concurrent_query_num[@]}"; do concurrent_query_num[$i]=$[${concurrent_query_num[$i]}*$NUM_REPLICA] done - # 32/64 + # 32/128 input_tokens_length=32 - output_tokens_length=64 + output_tokens_length=128 latency_throughput $iter "${concurrent_query_num[*]}" $input_tokens_length $output_tokens_length $benchmark_dir # 1024/128 input_tokens_length=1024 @@ -211,6 +297,15 @@ then input_tokens_length=32 output_tokens_length=20 latency_throughput $iter "${concurrent_query_num[*]}" $input_tokens_length $output_tokens_length $benchmark_dir + elif [ "$RUN_MODE" == "benchmark_idc" ] + then + iter=3 + concurrent_query_num=(1 3 6 9 12) + #for i in "${!concurrent_query_num[@]}"; do + # concurrent_query_num[$i]=$[${concurrent_query_num[$i]}*$NUM_REPLICA] + #done + output_tokens_length=512 + latency_throughput_idc $iter "${concurrent_query_num[*]}" $output_tokens_length $benchmark_dir $PROMPT_TYPE fi fi if [[ "$CHOICE" == *"4"* ]] @@ -229,4 +324,5 @@ then fi output_tokens_length=32 get_best_latency $iter "${input_tokens_length[*]}" $output_tokens_length $benchmark_dir -fi \ No newline at end of file +fi + diff --git a/dev/docker/Dockerfile.vllm_ns b/dev/docker/Dockerfile.vllm_ns new file mode 100644 index 000000000..ac191b7d4 --- /dev/null +++ b/dev/docker/Dockerfile.vllm_ns @@ -0,0 +1,42 @@ +# syntax=docker/dockerfile:1 +FROM ubuntu:22.04 + +ENV LANG C.UTF-8 + +WORKDIR /root/llm-on-ray + +RUN --mount=type=cache,target=/var/cache/apt apt-get update -y \ + && apt-get install -y build-essential cmake wget curl git vim htop ssh net-tools \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +ENV CONDA_DIR /opt/conda +RUN wget --quiet https://github.com/conda-forge/miniforge/releases/download/23.3.1-1/Miniforge3-Linux-x86_64.sh -O ~/miniforge.sh && \ + /bin/bash ~/miniforge.sh -b -p /opt/conda +ENV PATH $CONDA_DIR/bin:$PATH + +# setup env +SHELL ["/bin/bash", "--login", "-c"] + +RUN --mount=type=cache,target=/opt/conda/pkgs conda init bash && \ + unset -f conda && \ + export PATH=$CONDA_DIR/bin/:${PATH} && \ + mamba config --add channels intel && \ + mamba install -y -c conda-forge python==3.9 gxx=12.3 gxx_linux-64=12.3 libxcrypt + +COPY ./pyproject.toml . +COPY ./MANIFEST.in . 
+ + +# Install llm_on_ray +# Create llm_on_ray package directory to bypass the following 'pip install -e' command +RUN mkdir ./llm_on_ray +RUN --mount=type=cache,target=/root/.cache/pip pip install -e .[vllm-cpu] --extra-index-url https://download.pytorch.org/whl/cpu \ + --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ + +# Install vllm-ext +# We cannot make empty folder here like './llm_on_ray' since vllm-ext has cpp files to be compiled +COPY ./vllm-ext ./vllm-ext +COPY ./dev/scripts/check-vllm-cpu-build-env.sh ./dev/scripts/check-vllm-cpu-build-env.sh +RUN --mount=type=cache,target=/root/.cache/pip \ + source /opt/conda/bin/activate base && cd vllm-ext && pip install . && pip install --upgrade protobuf diff --git a/dev/scripts/check-vllm-cpu-build-env.sh b/dev/scripts/check-vllm-cpu-build-env.sh new file mode 100755 index 000000000..55db8a29f --- /dev/null +++ b/dev/scripts/check-vllm-cpu-build-env.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +# Check tools +[[ -n $(which g++) ]] || { echo "GNU C++ Compiler (g++) is not found!"; exit 1; } +[[ -n $(which pip) ]] || { echo "pip command is not found!"; exit 1; } + +# g++ version should be >=12.3. You can run the following to install GCC 12.3 and dependencies on conda: +# conda install -y -c conda-forge gxx=12.3 gxx_linux-64=12.3 libxcrypt +version_greater_equal() +{ + printf '%s\n%s\n' "$2" "$1" | sort --check=quiet --version-sort +} +gcc_version=$(g++ --version | grep -o -E '[0-9]+\.[0-9]+\.[0-9]+' | head -n1) +echo +echo Current GNU C++ Compiler version: $gcc_version +echo +version_greater_equal "${gcc_version}" 12.3.0 || { echo "GNU C++ Compiler 12.3.0 or above is required!"; exit 1; } diff --git a/dev/scripts/ci-functions.sh b/dev/scripts/ci-functions.sh index 738154fd8..6e41eb666 100644 --- a/dev/scripts/ci-functions.sh +++ b/dev/scripts/ci-functions.sh @@ -64,7 +64,7 @@ start_docker() { docker_args+=("-e=https_proxy=${HTTPS_PROXY}") fi - echo "docker run -tid "${docker_args[@]}" "${TARGET}:latest"" + echo "docker run -tid --privileged "${docker_args[@]}" "${TARGET}:latest"" docker run -tid "${docker_args[@]}" "${TARGET}:latest" } @@ -75,11 +75,19 @@ install_dependencies(){ docker exec "${TARGET}" bash -c "pip install -r ./tests/requirements.txt" } -strat_ray(){ +start_ray(){ local TARGET=$1 + local UNLIMITED_MAXLOCKMEM=0 + if [ "$2" == "1" ]; then + UNLIMITED_MAXLOCKMEM=1 + fi # Start Ray Cluster - docker exec "${TARGET}" bash -c "./dev/scripts/start-ray-cluster.sh" + if [ "$UNLIMITED_MAXLOCKMEM" == "1" ]; then + docker exec "${TARGET}" bash -c "ulimit -l unlimited; ./dev/scripts/start-ray-cluster.sh" + else + docker exec "${TARGET}" bash -c "./dev/scripts/start-ray-cluster.sh" + fi } stop_ray(){ @@ -111,6 +119,7 @@ declare -A DF_SUFFIX_MAPPER DF_SUFFIX_MAPPER=( ["mpt-7b-ipex-llm"]=".ipex-llm" ["llama-2-7b-chat-hf-vllm"]=".vllm" + ["llama-2-7b-chat-hf-vllm-ns"]=".vllm_ns" ["gpt-j-6b"]=".cpu_and_deepspeed.pip_non_editable" ) @@ -128,6 +137,7 @@ declare -A TARGET_SUFFIX_MAPPER TARGET_SUFFIX_MAPPER=( ["mpt-7b-ipex-llm"]="_ipex-llm" ["llama-2-7b-chat-hf-vllm"]="_vllm" + ["llama-2-7b-chat-hf-vllm-ns"]="_vllm-ns" ) get_TARGET_SUFFIX() { @@ -143,6 +153,7 @@ declare -A INFERENCE_MAPPER INFERENCE_MAPPER=( ["mpt-7b-ipex-llm"]="llm_on_ray-serve --config_file llm_on_ray/inference/models/ipex-llm/mpt-7b-ipex-llm.yaml --simple" ["llama-2-7b-chat-hf-vllm"]="llm_on_ray-serve --config_file .github/workflows/config/llama-2-7b-chat-hf-vllm-fp32.yaml --simple" + ["llama-2-7b-chat-hf-vllm-ns"]="llm_on_ray-serve --config_file 
llm_on_ray/inference/models/vllm/llama2-7b-chat-hf-vllm-ns.yaml --simple --max_ongoing_requests 1 --max_num_seqs 1" ["default"]="llm_on_ray-serve --simple --models ${model}" ) diff --git a/docs/assets/choice3_tokens_32_64.png b/docs/assets/choice3_tokens_32_64.png index 5e86a5c8f..e2f549a11 100644 Binary files a/docs/assets/choice3_tokens_32_64.png and b/docs/assets/choice3_tokens_32_64.png differ diff --git a/docs/vllm.md b/docs/vllm.md index 90e12a032..604c8c2a7 100644 --- a/docs/vllm.md +++ b/docs/vllm.md @@ -24,10 +24,32 @@ Then please run the following script to install vLLM for CPU into your LLM-on-Ra dev/scripts/install-vllm-cpu.sh ``` +## Install vLLM Extension for Quantization (Optional) +To further speed up quantized model inference on Intel CPUs, we extend vLLM to run the model decoding in our own inference engine, which is based on [neural-speed](https://github.com/intel/neural-speed). +The extension delivers a 2X performance improvement over plain vLLM on CPU for the Llama-2-7b and Llama-3-8b models. + +Neural Speed is an innovative library designed to support the efficient inference of large language models (LLMs) on Intel platforms through the state-of-the-art (SOTA) low-bit quantization powered by +[Intel Neural Compressor](https://github.com/intel/neural-compressor). The work is inspired by [llama.cpp](https://github.com/ggerganov/llama.cpp) and further optimized for Intel platforms with our +innovations in [NeurIPS' 2023](https://arxiv.org/abs/2311.00502). + +You need to first install llm-on-ray with the "vllm-cpu" extra. + +```bash +pip install .[vllm-cpu] --extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ +``` + +Then, install the vLLM extension and the inference engine. +```bash +cd vllm-ext +pip install . && pip install --upgrade protobuf + +``` + ## Run #### Serving +* Vanilla vLLM To serve model with vLLM and simple protocol, run the following: ```bash @@ -36,6 +58,30 @@ llm_on_ray-serve --config_file llm_on_ray/inference/models/vllm/llama-2-7b-chat- In the above example, `vllm` property is set to `true` in the config file for enabling vLLM. +* vLLM Extension +To serve a model with the vLLM extension and the Intel inference engine, run the following (Note: only Llama-2-7b-chat-hf and Llama-3-8b-instruct are supported for now): + +```bash +# copy the quantization config file to your specific snapshot dir, for example .../snapshots/f5db02db7.../ +# if you don't copy your desired config manually, the default quant_ns_config.json from the llm_on_ray package will be copied instead. +cp llm_on_ray/inference/models/vllm/quantization/quant_ns_config.json <your_model_snapshot_dir>/ + +# start up ray head node and worker node with proper configurations +# '--num-cpus 0' ensures no task or actor runs on the head node +numactl -N 0 -m 0 ray start --head --num-cpus 0 --include-dashboard=False --disable-usage-stats --node-ip-address +# adjust the values of NS_NUM_THREADS and the numa core binding for your machine while keeping OMP_PROC_BIND and OMP_NUM_THREADS unchanged. The example below is for a standard SPR machine. +# typically the number of bound physical cores should be equal to the value of --num-cpus. 
and NS_NUM_THREADS should be equal to the number of bound physical cores - 2 +# for deploying multiple replicas, check examples/inference/vllm/run_multiple_ray_node.sh +OMP_PROC_BIND=true NS_NUM_THREADS=44 OMP_NUM_THREADS=1 numactl -N 0 -C 0-45 -m 0 ray start --address=':6379' --num-cpus 46 --resources='{"inference_engine": 1}' +# start up one more ray worker node for an OpenAI-compatible deployment +numactl -N 1 -C 110-111 -m 1 ray start --address='10.0.11.8:6379' --num-cpus 1 --resources='{"app_router": 1}' + +# deploy model serving. Note: this includes quantizing the model on the fly based on quant_ns_config.json if it has not been quantized yet. +llm_on_ray-serve --config_file llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm-ns.yaml --keep_serve_terminal --max_ongoing_requests 4 --max_num_seqs 4 +``` + +For now, only Llama-2-7b-chat-hf and Llama-3-8b-instruct are supported, but support for other models can be added quickly. + #### Querying To start a non-streaming query, run the following: @@ -52,4 +98,4 @@ python examples/inference/api_server_simple/query_single.py --model_endpoint htt ## Further Configuration -Please follow [Deploying and Serving LLMs on Intel CPU/GPU/Gaudi](serve.md) document to for other configurations. \ No newline at end of file +Please follow the [Deploying and Serving LLMs on Intel CPU/GPU/Gaudi](serve.md) document for other configurations. diff --git a/examples/inference/vllm/api_server.py b/examples/inference/vllm/api_server.py new file mode 100644 index 000000000..71021f9a3 --- /dev/null +++ b/examples/inference/vllm/api_server.py @@ -0,0 +1,146 @@ +# +# Copyright 2023 The LLM-on-Ray Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +NOTE: This API server is used only for demonstrating usage of AsyncEngine +and simple performance benchmarks. It is not intended for production use. +For production use, we recommend using our OpenAI compatible server. +We are also not going to accept PRs modifying this file, please +change `vllm/entrypoints/openai/api_server.py` instead. +""" + +import argparse +import json +import ssl +from typing import AsyncGenerator + +import uvicorn +from fastapi import FastAPI, Request +from fastapi.responses import JSONResponse, Response, StreamingResponse + +from vllm.extension import ns + +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.sampling_params import SamplingParams +from vllm.usage.usage_lib import UsageContext +from vllm.utils import random_uuid + +TIMEOUT_KEEP_ALIVE = 5 # seconds. +app = FastAPI() +engine = None + + +@app.get("/health") +async def health() -> Response: + """Health check.""" + return Response(status_code=200) + + +@app.post("/generate") +async def generate(request: Request) -> Response: + """Generate completion for the request. + + The request should be a JSON object with the following fields: + - prompt: the prompt to use for the generation. + - stream: whether to stream the results or not. 
+ - other fields: the sampling parameters (See `SamplingParams` for details). + """ + request_dict = await request.json() + prompt = request_dict.pop("prompt") + stream = request_dict.pop("stream", False) + sampling_params = SamplingParams(**request_dict) + request_id = random_uuid() + + assert engine is not None + results_generator = engine.generate(prompt, sampling_params, request_id) + + # Streaming case + async def stream_results() -> AsyncGenerator[bytes, None]: + async for request_output in results_generator: + prompt = request_output.prompt + text_outputs = [prompt + output.text for output in request_output.outputs] + ret = {"text": text_outputs} + yield (json.dumps(ret) + "\0").encode("utf-8") + + if stream: + return StreamingResponse(stream_results()) + + # Non-streaming case + final_output = None + async for request_output in results_generator: + if await request.is_disconnected(): + # Abort the request if the client disconnects. + await engine.abort(request_id) + return Response(status_code=499) + # print(request_output) + final_output = request_output + + assert final_output is not None + prompt = final_output.prompt + text_outputs = [prompt + output.text for output in final_output.outputs] + ret = {"text": text_outputs} + return JSONResponse(ret) + + +def _modify_qunatization_choices(parser, dest, choices): + for action in parser._actions: + if action.dest == dest: + action.choices = choices + return + else: + raise ValueError("argument {} not found".format(dest)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default=None) + parser.add_argument("--port", type=int, default=8070) + parser.add_argument("--ssl-keyfile", type=str, default=None) + parser.add_argument("--ssl-certfile", type=str, default=None) + parser.add_argument("--ssl-ca-certs", type=str, default=None, help="The CA certificates file") + parser.add_argument( + "--ssl-cert-reqs", + type=int, + default=int(ssl.CERT_NONE), + help="Whether client certificate is required (see stdlib ssl module's)", + ) + parser.add_argument( + "--root-path", + type=str, + default=None, + help="FastAPI root_path when app is behind a path based routing proxy", + ) + parser.add_argument("--log-level", type=str, default="debug") + parser = AsyncEngineArgs.add_cli_args(parser) + _modify_qunatization_choices(parser, "quantization", ("awq", "gptq", "squeezellm", "ns", None)) + _modify_qunatization_choices(parser, "block_size", None) + args = parser.parse_args() + engine_args = AsyncEngineArgs.from_cli_args(args) + engine = AsyncLLMEngine.from_engine_args(engine_args, usage_context=UsageContext.API_SERVER) + + app.root_path = args.root_path + uvicorn.run( + app, + host=args.host, + port=args.port, + log_level=args.log_level, + timeout_keep_alive=TIMEOUT_KEEP_ALIVE, + ssl_keyfile=args.ssl_keyfile, + ssl_certfile=args.ssl_certfile, + ssl_ca_certs=args.ssl_ca_certs, + ssl_cert_reqs=args.ssl_cert_reqs, + ) diff --git a/examples/inference/vllm/ray-vllm-examples/llm.py b/examples/inference/vllm/ray-vllm-examples/llm.py new file mode 100644 index 000000000..6a20be65e --- /dev/null +++ b/examples/inference/vllm/ray-vllm-examples/llm.py @@ -0,0 +1,152 @@ +# +# Copyright 2023 The LLM-on-Ray Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Dict, Optional, List +import logging + +from fastapi import FastAPI +from starlette.requests import Request +from starlette.responses import StreamingResponse, JSONResponse + +from ray import serve + +from vllm.extension import ns + +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.entrypoints.openai.cli_args import make_arg_parser +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + ChatCompletionResponse, + ErrorResponse, +) +from vllm.entrypoints.openai.serving_chat import OpenAIServingChat +from vllm.entrypoints.openai.serving_engine import LoRAModulePath + +logger = logging.getLogger("ray.serve") + +app = FastAPI() + + +@serve.deployment( + autoscaling_config={ + "min_replicas": 1, + "max_replicas": 10, + "target_ongoing_requests": 5, + }, + max_ongoing_requests=10, +) +@serve.ingress(app) +class VLLMDeployment: + def __init__( + self, + engine_args: AsyncEngineArgs, + response_role: str, + lora_modules: Optional[List[LoRAModulePath]] = None, + chat_template: Optional[str] = None, + ): + from vllm.extension import ns + + logger.info(f"Starting with engine args: {engine_args}") + self.engine = AsyncLLMEngine.from_engine_args(engine_args) + + # Determine the name of the served model for the OpenAI client. + if engine_args.served_model_name is not None: + served_model_names = engine_args.served_model_name + else: + served_model_names = [engine_args.model] + self.openai_serving_chat = OpenAIServingChat( + self.engine, + self.engine.engine.model_config, + served_model_names, + response_role, + lora_modules, + chat_template, + ) + + @app.post("/v1/chat/completions") + async def create_chat_completion(self, request: ChatCompletionRequest, raw_request: Request): + """OpenAI-compatible HTTP endpoint. + + API reference: + - https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html + """ + logger.info(f"Request: {request}") + generator = await self.openai_serving_chat.create_chat_completion(request, raw_request) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), status_code=generator.code) + if request.stream: + return StreamingResponse(content=generator, media_type="text/event-stream") + else: + assert isinstance(generator, ChatCompletionResponse) + return JSONResponse(content=generator.model_dump()) + + +def _modify_qunatization_choices(parser, dest, choices): + for action in parser._actions: + if action.dest == dest: + action.choices = choices + return + else: + raise ValueError("argument {} not found".format(dest)) + + +def parse_vllm_args(cli_args: Dict[str, str]): + """Parses vLLM args based on CLI inputs. + + Currently uses argparse because vLLM doesn't expose Python models for all of the + config options we want to support. 
+ """ + parser = make_arg_parser() + _modify_qunatization_choices(parser, "quantization", ("awq", "gptq", "squeezellm", "ns", None)) + _modify_qunatization_choices(parser, "block_size", None) + arg_strings = [] + for key, value in cli_args.items(): + arg_strings.extend([f"--{key}", str(value)]) + logger.info(arg_strings) + parsed_args = parser.parse_args(args=arg_strings) + return parsed_args + + +def build_app(cli_args: Dict[str, str]) -> serve.Application: + """Builds the Serve app based on CLI arguments. + + See https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#command-line-arguments-for-the-server + for the complete set of arguments. + + Supported engine arguments: https://docs.vllm.ai/en/latest/models/engine_args.html. + """ # noqa: E501 + parsed_args = parse_vllm_args(cli_args) + engine_args = AsyncEngineArgs.from_cli_args(parsed_args) + # engine_args.worker_use_ray = True + + # tp = engine_args.tensor_parallel_size + # logger.info(f"Tensor parallelism = {tp}") + pg_resources = [] + pg_resources.append({"CPU": 1}) # for the deployment replica + # for i in range(tp): + # pg_resources.append({"CPU": 1, "GPU": 1}) # for the vLLM actors + + # We use the "STRICT_PACK" strategy below to ensure all vLLM actors are placed on + # the same Ray node. + return VLLMDeployment.options( + placement_group_bundles=pg_resources, placement_group_strategy="STRICT_PACK" + ).bind( + engine_args, + parsed_args.response_role, + parsed_args.lora_modules, + parsed_args.chat_template, + ) diff --git a/examples/inference/vllm/ray-vllm-examples/query.py b/examples/inference/vllm/ray-vllm-examples/query.py new file mode 100644 index 000000000..012725edb --- /dev/null +++ b/examples/inference/vllm/ray-vllm-examples/query.py @@ -0,0 +1,41 @@ +# +# Copyright 2023 The LLM-on-Ray Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from openai import OpenAI + +# Note: Ray Serve doesn't support all OpenAI client arguments and may ignore some. +client = OpenAI( + # Replace the URL if deploying your app remotely + # (e.g., on Anyscale or KubeRay). 
+ base_url="http://localhost:8000/v1", + api_key="NOT A REAL KEY", +) +chat_completion = client.chat.completions.create( + model="meta-llama/Llama-2-7b-chat-hf", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": "What are some highly rated restaurants in San Francisco?'", + }, + ], + temperature=0.01, + stream=True, +) + +for chat in chat_completion: + if chat.choices[0].delta.content is not None: + print(chat.choices[0].delta.content, end="") diff --git a/examples/inference/vllm/run_multiple_api_server.sh b/examples/inference/vllm/run_multiple_api_server.sh new file mode 100644 index 000000000..5a65ded6b --- /dev/null +++ b/examples/inference/vllm/run_multiple_api_server.sh @@ -0,0 +1,12 @@ +NS_MODEL_PERF_STEPS=50 OMP_PROC_BIND=true NS_NUM_THREADS=24 OMP_NUM_THREADS=1 numactl -N 0 -m 0 -C 0-26 nohup python api_server.py --model meta-llama/Llama-2-7b-chat-hf --quantization ns --device cpu --max-num-seqs 64 --block-size 4096 --max-model-len 4096 --host 10.0.11.5 --port 8070 > s8070.log 2>&1 & +sleep 2 + +NS_MODEL_PERF_STEPS=50 OMP_PROC_BIND=true NS_NUM_THREADS=24 OMP_NUM_THREADS=1 numactl -N 0 -m 0 -C 28-54 nohup python api_server.py --model meta-llama/Llama-2-7b-chat-hf --quantization ns --device cpu --max-num-seqs 64 --block-size 4096 --max-model-len 4096 --host 10.0.11.5 --port 8071 > s8071.log 2>&1 & +sleep 2 + +NS_MODEL_PERF_STEPS=50 OMP_PROC_BIND=true NS_NUM_THREADS=24 OMP_NUM_THREADS=1 numactl -N 1 -m 1 -C 56-82 nohup python api_server.py --model meta-llama/Llama-2-7b-chat-hf --quantization ns --device cpu --max-num-seqs 64 --block-size 4096 --max-model-len 4096 --host 10.0.11.5 --port 8072 > s8072.log 2>&1 & +sleep 2 + +NS_MODEL_PERF_STEPS=50 OMP_PROC_BIND=true NS_NUM_THREADS=24 OMP_NUM_THREADS=1 numactl -N 1 -m 1 -C 84-110 nohup python api_server.py --model meta-llama/Llama-2-7b-chat-hf --quantization ns --device cpu --max-num-seqs 64 --block-size 4096 --max-model-len 4096 --host 10.0.11.5 --port 8073 > s8073.log 2>&1 & +sleep 2 + diff --git a/examples/inference/vllm/run_multiple_offline.sh b/examples/inference/vllm/run_multiple_offline.sh new file mode 100644 index 000000000..194d83ccb --- /dev/null +++ b/examples/inference/vllm/run_multiple_offline.sh @@ -0,0 +1,7 @@ +# two sockets +#NS_NUM_THREADS=25 OMP_NUM_THREADS=1 OMP_PROC_BIND=true NS_MODEL_PERF_STEPS=50 nohup numactl -N 0 -m 0 -C 0-27 python vllm_offline_inference.py > 2.log 2>&1 & +#NS_NUM_THREADS=25 OMP_NUM_THREADS=1 OMP_PROC_BIND=true NS_MODEL_PERF_STEPS=50 nohup numactl -N 1 -m 1 -C 56-83 python vllm_offline_inference.py > 1.log 2>&1 & + +TOKENIZERS_PARALLELISM=false NS_NUM_THREADS=25 OMP_NUM_THREADS=1 OMP_PROC_BIND=true NS_MODEL_PERF_STEPS=50 nohup numactl -N 0 -m 0 -C 0-26 python vllm_offline_inference.py > 2.log 2>&1 & +TOKENIZERS_PARALLELISM=false NS_NUM_THREADS=25 OMP_NUM_THREADS=1 OMP_PROC_BIND=true NS_MODEL_PERF_STEPS=50 nohup numactl -N 0 -m 0 -C 27-53 python vllm_offline_inference.py > 1.log 2>&1 & + diff --git a/examples/inference/vllm/run_multiple_ray_node.sh b/examples/inference/vllm/run_multiple_ray_node.sh new file mode 100644 index 000000000..056c007a4 --- /dev/null +++ b/examples/inference/vllm/run_multiple_ray_node.sh @@ -0,0 +1,13 @@ +# NS_MODEL_PERF_STEPS=50 +OMP_PROC_BIND=true NS_NUM_THREADS=25 OMP_NUM_THREADS=1 numactl -N 1 -C 83-109 -m 1 ray start --address='10.0.11.8:6379' --num-cpus 27 --resources='{"inference_engine": 1}' +sleep 1 +OMP_PROC_BIND=true NS_NUM_THREADS=25 OMP_NUM_THREADS=1 numactl -N 0 -C 0-26 -m 0 ray start 
--address='10.0.11.8:6379' --num-cpus 27 --resources='{"inference_engine": 1}' +sleep 1 +OMP_PROC_BIND=true NS_NUM_THREADS=25 OMP_NUM_THREADS=1 numactl -N 1 -C 56-82 -m 1 ray start --address='10.0.11.8:6379' --num-cpus 27 --resources='{"inference_engine": 1}' +sleep 1 +OMP_PROC_BIND=true NS_NUM_THREADS=25 OMP_NUM_THREADS=1 numactl -N 0 -C 27-53 -m 0 ray start --address='10.0.11.8:6379' --num-cpus 27 --resources='{"inference_engine": 1}' +sleep 1 +numactl -N 1 -C 110-111 -m 1 ray start --address='10.0.11.8:6379' --num-cpus 1 --resources='{"app_router": 1}' + +sleep 1 +echo 'done' diff --git a/examples/inference/vllm/run_multiple_ray_node_plain_cpu.sh b/examples/inference/vllm/run_multiple_ray_node_plain_cpu.sh new file mode 100644 index 000000000..69b320238 --- /dev/null +++ b/examples/inference/vllm/run_multiple_ray_node_plain_cpu.sh @@ -0,0 +1,30 @@ +# NS_MODEL_PERF_STEPS=50 +#OMP_PROC_BIND=true NS_NUM_THREADS=25 OMP_NUM_THREADS=1 numactl -N 1 -C 83-109 -m 1 ray start --address='10.0.11.8:6379' --num-cpus 1 +#sleep 1 +#OMP_PROC_BIND=true NS_NUM_THREADS=25 OMP_NUM_THREADS=1 numactl -N 0 -C 0-26 -m 0 ray start --address='10.0.11.8:6379' --num-cpus 1 +#sleep 1 +#OMP_PROC_BIND=true NS_NUM_THREADS=25 OMP_NUM_THREADS=1 numactl -N 1 -C 56-82 -m 1 ray start --address='10.0.11.8:6379' --num-cpus 1 +#sleep 1 +#OMP_PROC_BIND=true NS_NUM_THREADS=25 OMP_NUM_THREADS=1 numactl -N 0 -C 27-53 -m 0 ray start --address='10.0.11.8:6379' --num-cpus 1 + +# two instances two sockets +#OMP_PROC_BIND=true NS_NUM_THREADS=44 OMP_NUM_THREADS=1 numactl -N 0 -C 0-45 -m 0 ray start --address='10.0.11.8:6379' --num-cpus 1 +#sleep 1 +#OMP_PROC_BIND=true NS_NUM_THREADS=44 OMP_NUM_THREADS=1 numactl -N 1 -C 56-101 -m 1 ray start --address='10.0.11.8:6379' --num-cpus 1 + +#LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4" numactl -N 0 -m 0 -C 0-55 ray start --address='10.0.11.8:6379' --num-cpus 56 +#sleep 1 +#LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4" numactl -N 1 -m 1 -C 56-111 ray start --address='10.0.11.8:6379' --num-cpus 56 + +LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4" numactl -N 0 -m 0 -C 0-27 ray start --address='10.0.11.8:6379' --num-cpus 27 +sleep 1 +LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4" numactl -N 0 -m 0 -C 28-55 ray start --address='10.0.11.8:6379' --num-cpus 27 +sleep 1 +LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4" numactl -N 1 -m 1 -C 56-83 ray start --address='10.0.11.8:6379' --num-cpus 27 +sleep 1 +LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4" numactl -N 1 -m 1 -C 84-111 ray start --address='10.0.11.8:6379' --num-cpus 27 +sleep 1 +LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4" ray start --address='10.0.11.8:6379' --num-cpus 1 --resources='{"app_router": 1}' + +sleep 1 +echo 'done' diff --git a/examples/inference/vllm/stop_multiple_api_server.sh b/examples/inference/vllm/stop_multiple_api_server.sh new file mode 100644 index 000000000..60a0cbe00 --- /dev/null +++ b/examples/inference/vllm/stop_multiple_api_server.sh @@ -0,0 +1 @@ +ps -ef | grep api_server | grep -v grep | awk '{ print $2 }' | xargs kill diff --git a/examples/inference/vllm/vllm_offline_inference.py b/examples/inference/vllm/vllm_offline_inference.py new file mode 100644 index 000000000..d8a0bff17 --- /dev/null +++ b/examples/inference/vllm/vllm_offline_inference.py @@ -0,0 +1,166 @@ +# +# Copyright 2023 The LLM-on-Ray Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from vllm import LLM, SamplingParams + +from vllm.extension import ns as ns + +from time import perf_counter + + +############################ +# TODO: TODO: TODO: reset request_id and seq_id before they reach to max value of int64_t +############################ + +# Sample prompts. +# prompts = [ +# "Hello, my name is", +# "The president of the United States is", +# "The capital of France is", +# "The future of AI is", +# "Hello, my name is", +# "The president of the United States is", +# "The capital of France is", +# "The future of AI is", +# "Hello, my name is", +# "The president of the United States is", +# "The capital of France is", +# "The future of AI is", +# ] +prompts = [ + "Russell Brunson's Perfect Webinar Script is a framework for delivering webinars that are designed to sell a product or service. ", + "Tony Robbins describes six core human needs that drive our behaviors and motivations. These six needs are:\n\n1. Certainty: The need for safety, stability, and predictability.", + "1. Homogeneity: The segment should consist of customers who share similar characteristics and behaviors.\n2. Distinctiveness: The segment should be different from other segments in terms of their characteristics and behaviors.\n3. Stability: The segment should remain relatively stable over time and not change drastically. The characteristics and behaviors of customers within the segment should not change significantly.", + 'In Java, I want to replace string like "This is a new {object} at {place}" with a Map, {object: "student", "point 3, 4"}, and get a result "This is a new student at point 3, 4". How can I do?', + 'You can use the `String.format()` method in Java to replace placeholders in a string with values from a map. Here\'s an example code snippet that demonstrates how you can achieve this:\n```java\nimport java.util.HashMap;\nimport java.util.Map;\n\npublic class StringReplaceExample {\n public static void main(String[] args) {\n String input = "This is a new {object} at {place}";\n Map replacements = new HashMap<>();\n replacements.put("object", "student");\n replacements.put("place", "point 3, 4");\n\n ', + 'The language used to describe the addressing modes of these instructions is metaphorical and grandiose, emphasizing the complexity and power of these commands. For example, the use of "enigmatic" and "confounding" to describe JMP ABCD and MOV AX, [BX+SI], respectively, suggests that these instructions are not easily understood and require a level of expertise to comprehend.\n\nSimilarly, the use of "inscrutable" and "cryptic" to describe MOV AX, [100] and MOV AX, [BX], respectively, implies that these commands are shrouded in mystery and are difficult to decipher. The speaker\'s use of "perplexing" and "unfathomable" to describe MOV AX, [BX\\*2+SI] and MOV AX, BX, respectively, ', + "Lo and behold! The arcane and elusive art of metaphorical language has been summoned forth to expound upon the enigmatic addressing modes of the instructions at hand. 
The speakers have wielded grandiose expressions with utmost reverence and awe, extolling the ineffable power and bewildering functionality of these directives. Among the inscrutable commands are the confounding JMP ABCD, the abstruse MOV AX, [BX+SI], the unfathomable MOV AX, [100],", + "more more perplexity and verbose", + "By the grace of the gods,", +] +# Create a sampling params object. +# sampling_params = SamplingParams(temperature=0, top_p=1, max_tokens=256, use_beam_search=True, best_of=2) +# sampling_params = SamplingParams(temperature=0.6, top_p=0.95, max_tokens=256, top_k=40) +sampling_params = SamplingParams(max_tokens=512) +# sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256) + +####################################### +prompts = [ + "Hello, how are you?", + "What is your name?", + "What is your favorite color?", + "What is your favorite food?", + "What is your favorite movie?", + "What is your favorite song?", + "What is your favorite book?", + "What is your favorite animal", + "What is your favorite sport?", + "What is your favorite hobby?", + "What is your favorite subject?", + "What is your favorite game?", + "What is your favorite TV show?", + "What is your favorite actor?", + "What is your favorite actress?", + "What is your favorite singer?", + "What is your favorite band?", + "What is your favorite artist?", + "What is your favorite author?", + "What is your favorite poet?", +] +# 32 input +prompts = [ + "Tony Robbins describes six core human needs that drive our behaviors and motivations. These six needs are:\n\n1. Certainty: The need for safety, stability, and predictability." +] +# 1024 input +prompts = [ + "It is done, and submitted. You can play 'Survival of the Tastiest' on Android, and on the web. Playing on the web works, but you have to simulate multiple touch for table moving and that can be a bit confusing. There is a lot I'd like to talk about. I will go through every topic, insted of making the typical what went right/wrong list. Concept Working over the theme was probably one of the hardest tasks which I had to face. Originally, I had an idea of what kind of game I wanted to develop, gameplay wise - something with a lot of enemies/actors, simple graphics, maybe set in space, controlled from a top-down view. I was confident that I could fit any theme around it. In the end, the problem with a theme like 'Evolution' in a game is that evolution is unassisted. It happens through several seemingly random mutations over time, with the most apt permutation surviving. This genetic car simulator is, in my opinion, a great example of actual evolution of a species facing a challenge. But is it a game? In a game, you need to control something to reach an objective. That control goes against what evolution is supposed to be like. If you allow the user to pick how to evolve something, it's not evolution anymore - it's the equivalent of intelligent design, the fable invented by creationists to combat the idea of evolution. Being agnostic and a Pastafarian, that's not something that rubbed me the right way. Hence, my biggest dillema when deciding what to create was not with what I wanted to create, but with what I did not. I didn't want to create an 'intelligent design' simulator and wrongly call it evolution. This is a problem, of course, every other contestant also had to face. And judging by the entries submitted, not many managed to work around it. I'd say the only real solution was through the use of artificial selection, somehow. 
So far, I have not seen any entry using this at its core gameplay. Alas, this is just a fun competition and after a while I decided not to be as strict with the game idea, and allowed myself to pick whatever I thought would work out. My initial idea was to create something where humanity tried to evolve to a next level but had some kind of foe trying to stop them from doing so. I kind of had this image of human souls flying in space towards a monolith or a space baby (all based in 2001: A Space Odyssey of course) but I couldn't think of compelling (read: serious) mechanics for that. Borgs were my next inspiration, as their whole hypothesis fit pretty well into the evolution theme. But how to make it work? Are you the borg, or fighting the Borg? The third and final idea came to me through my girlfriend, who somehow gave me the idea of making something about the evolution of Pasta. The more I thought about it the more it sounded like it would work, so I decided to go with it. Conversations with my inspiring co-worker Roushey (who also created the 'Mechanical Underdogs' signature logo for my intros) further matured the concept, as it involved into the idea of having individual pieces of pasta flying around and trying to evolve until they became all-powerful. A secondary idea here was that the game would work to explain how the Flying Spaghetti Monster came to exist - by evolving from a normal dinner table. So the idea evolved more or less into this: you are sitting a table. You have your own plate, with is your 'base'. There are 5 other guests at the table, each with their own plate. Your plate can spawn little pieces of pasta. You do so by 'ordering' them through a menu. Some pastas are better than others; some are faster, some are stronger. They have varying 'costs', which are debited from your credits (you start with a number of credits). Once spawned, your pastas start flying around. Their instinct is to fly to other plates, in order to conquer them (the objective of the game is having your pasta conquer all the plates on the table). But they are really autonomous, so after being spawned, you have no control over your pasta (think DotA or LoL creeps). Your pasta doesn't like other people's pasta, so if they meet, they shoot sauce at each other until one dies. You get credits for other pastas your own pasta kill. Once a pasta is in the vicinity of a plate," +] +# simple +# prompts = [ +# "You are a highly skilled AI trained in language comprehension and summarization. I would like you to read the following transcription of a meeting and summarize it into a concise abstract paragraph. Aim to retain the most important points, providing a coherent and readable summary that could help a person understand the main points of the discussion without needing to read the entire text. Please avoid unnecessary details or tangential points. list the people involved in the converation, that way that will scan transcribe for you, yeah. Kyle Short: Well then I GPT did the transcription. Vinay Kumar: Yes. Chakri Krishna: We will, absolutely. Chakri Krishna: So you wanna go? Kyle Short: Alright. Chakri Krishna: You wanted to be intros, Kyle. Kyle Short: Yeah, I think at least for Jeevan and Rich, your faces will be new to Shredder. Vinay Kumar: I know them. I've been. I've talked to both of them. Kyle Short: You've got to do the one on ones already. Excellent. Rich Rodgers: Yeah. Yep. Jason Rawlins: Yep. Kyle Short: OK, then we can skip it. Awesome. Chakri Krishna: Alright, cool. 
Chakri Krishna: Let's do white you know, Kyle, to kind of do an extension of what you were doing with security. Chakri Krishna: Is IT security or is IT security? Kyle Short: They confirmed it as security. Chakri Krishna: Security. Thank you very much. We'll, we'll throw. I'll throw a whiteboard out there. Chakri Krishna: At least this whole kind of help me and maybe we can like draw a little bit better. Chakri Krishna: On what we're building. Chakri Krishna: So I'm not, I'm not 100% sure where to start to make it make most sense. Chakri Krishna: Sure. Chakri Krishna: Alright, cool. Yeah, yeah. Cool. Vinay Kumar: Start from I GPT, we know I GPT right and then yeah. Chakri Krishna: Oops, I've got my pen stuck to the mouse. Kyle Short: And just so you know, we spent the whole 4 hours I spent with security. Today was basically on the one diagram that you had which was your high-level" +# ] +# medium +# prompts = [ +# "You are a highly skilled AI trained in language comprehension and summarization. Please avoid unnecessary details or tangential points. list the people involved in the converation presentation and create notes for each slide that I can use to guide my presentation. Each time you see next slide, this is an indication that a new slide is being presented and you should create a new section of notes. If you see the text “for example” this indicates that I am giving an important example that is critical to the context of the presentation. Always include notes that prompt me to remember the examples I am providing. Provide a bolded word “Example” at the beginning of these types of examples. Act as an expert presenter who is guiding a novice presenter on how to prepare notes for a presentation that will be given to a large audience. The presenter will be nervous and might need additional notes to help keep him on track. Ask me questions before generating notes that might help me in creating a good presentation. Only produce one set of notes for one slide at a time. Ask me if I am done with the notes or want to work on them more for each slide. When I tell you I am done with the notes for the slide, you will move to the next slide where we will work on the next slide’s notes. 
DICTATION: I want to start out by talking about the journey the generative AI journey that we see from an IT perspective I have a timeline here listed out and kind of goes through a little bit before 2023 out to the future and we'll cover those things as they let's see our our kind of go go through those things um as we all know there are a lot of industry innovations that took place over the last many years actually even decades when we think about generative AI large language models these concepts have been around a long time decades but we haven't really had the compute power in place to leverage them like we are these days which is wonderful from a timeline perspective I'm starting here in November of 2023 November of 2023 open AI releases their product called ChatGPT and as we all know the media picks it up a lot of things start happening people start using it you know people are declaring it's the the end of work or you know a lot of different things that that happen you know new new technology comes out and it's all every everyone buzzes about it the interesting thing that we see is that there's suddenly lots and lots of money being poured in billions 10s of billions of dollars being poured into these ventures for open AI for for different solutions that are out there and huge vendors some of our primary customers start to develop solutions that that are are also available so they start to give sort of copy cat solutions if you will or similar ones right one thing that is important also note the servant for me is a software engineer one of the key things that changes here is that open AI opens up their models to be used via an API So what this does for me is a software engineer as it changes the picture a little bit I now I'm not a data scientist but now I can start building solutions around these APIs and there's a lot of software engineers out there and so you see an explosion of all kinds of different things that are out there software engineers are assembling things putting things together and really kind of innovating with what we can do with these APIs and that's a big change around that same time kind of going duck back down onto the bottom of this picture here we at Intel start to kick off all kinds of pocs proofs of concepts around the company how can we make use of these models of large language models generative AI what does it look like and when I say hundreds of PCs POC's kicked off they did hundreds of different POC's and STG leadership asked IT to kind of get their arms around what's happening and understand the use cases which we did we launched off a work group and we called it degenerative a work group probably some of you here were part of that and we we split it up into a couple of different ways but we're really focused on trying to understand the use cases and how those use cases are realized inside of Intel and what they do right I'll along the process as we're doing POC's within the company the industry is marching along as well you know Intel releases a product called articulate and spun it off into a different company you see big improvements from the capabilities and open source models as well as other vendors get involved like you know you've got Google out there and and and they're and they're associated with partners and a WS and so a lot of lot of innovation happening in this space as well all at the same time billions of dollars being funneled into this business and inside of Intel again we launched something called the AI inside Intel MRC the IRC and 
that that ELT level team starts to develop some goals um those goals kind of come down to to me and my team in various bits and pieces I know their their scope is vast but they they launch and they give us some specific goals which we've been marching towards so the kind of if you look at this picture there's a lot of you know a lot of pocs happening a lot of work happening a lot of innovation happening in the company and at the same time we're we're given a directive by the IRC to figure out how we can focus on to a single platform all of the innovation that's been done across the company and bring it together and bring it down into one platform that can be leveraged by Intel and Intel users and Intel developers in order to build solutions and figure out what that is so that's kind of what we have been doing as from an IT perspective looking at the use cases trying to figure out a strategy doing our own proofs of concept and of course we've built some tools in IT as well and really kind of narrowing down that that those use cases so we can build a single platform for people to use looking out in the future beyond 2024 we do see the ability and at the end towards the end of 2024 to start onboarding more and more folks onto a single platform and I'll talk about that what that means and uh and and kind of leveraging that platform all along this journey and I'll mention this a couple of times right we have to kind of figure out how do we how do we leverage the Intel hardware and the Intel programs around the Habana gaudi accelerators around AI PC how do we really make that a true game changer for the industry and for us as well right we need to be using our own tools so really what we do what we did from this journey perspective is take all of the innovation on try to that's happening so many teams are doing so many cool things try to understand what's happening figure out how it can service the use cases from a common perspective and really kind of take the company forward OK we'll go to the next slide During the the work group that we ran as I mentioned before we were looking for common use cases what are the primary use cases now as you know the use cases for generative are vast but they kind of fall into three buckets if you will one kind of around developer productivity with a big focus on code generation another one kind of around the QA bots with a focus on rag or retrieval augmented generation and the last one is just general productivity with focus around prompt engineering and the capabilities there we we looked at the code generation side of things early on and we were we found a couple different patterns there were there were people who were using the tools like you know the the GPT tools to just generate code have it do analysis of code and things like that and sort of in a separate window and teams like the MSO team were building VS code plugins to kind of really test and figure out how just the open AI models themselves would perform for us in terms of generating code and analyzing code at the same time there were several tools out in the industry that were really built into the IDE itself and allowed developers to kind of inline work with their code and have code suggestions there obviously GitHub copilot is one of the primary ones and we are doing a pilot with that right now in in the across the organization and and trying to figure out how that actually works but during this sort of common use case discovery people are building all kinds of different things there as well so 
you know how do you do test case generation how do you do inject this into your cicd pipeline to maybe create documentation I've seen some good use cases there as well so a lot of stuff around developer productivity and this extends beyond just the coder right so so there are some great use cases being done as well around you know creating other types of code assets as well beyond just like software that you would use in an IT perspective right how do we create product how do we do those types of things so a lot of really interesting things being done there that was a huge focus the the next around retrieval augmented generation this is a very common use case that we see across a lot of different folks and the the focus for a lot of people when we were doing the work group was really trying to figure out the right way to approach rag and the technical pieces that we needed right to to enable it as well as you know where where should we take it in the future so so during the work group people who've who've used this we got a we have our our dbas platform that's up and running and it's using a plugin for PostgreSQL called PG vector so folks can can start using a rag based model with PG vector and that's all supported standardized T stuff right now and that's one of the things we worked on out there but also at the same time a lot of innovation people really working on improving the the results that come back from that from the rag model as well as you know trying to figure out and fine tune exactly how to to build the different prompt workflows there as well so also the last column there on prompt engineering of early round general productivity so this this we see a lot of work or saw a lot of work people are just really interested in prompt engineering and trying to figure out how we make general employees more productive one of the common really common prompts engineering use cases we see as something around OK R and something around insights so we all do these every quarter right we have insight sessions with our management managers we create OK RC there for ourselves or for our teams or for our organizations and there are a lot of folks that were have been and still are trying to look at how we could have deep conversations with from a prompt engineering perspective on on actually trying to improve the ability for us to generate OKR's and and insights and make it a more productive conversation and using chats in order to do that prompt engineering is a fascinating example that that you see um and it's something that is probably worth noting here when somebody come first comes to a tool just the the tool like like a charge GBT or IGBT or any of the other implementations we have inside of Intel when they first see that they look at it and they recognize it they think they know how to use it they're like ohh you know what I've seen this before it's a search box it says ask me a question and so they think of it more like Google search or Bing search where they ask a question and they're generally quite disappointed with with the response that they get um and they will a lot of times walk away from the tool I can tell you that the platform that my team supports we've had about 17,000 people come to our tool over the last six the last two months of those 17 thousand 7000 have never come back so so they're missing something right why why are more why do I have you know 50,000 people coming back pretty regularly and 7000 who just left there's a reason and it has to do with prompt engineering and 
a lack of knowledge around prompt engineering and this is something that's really important to us in order to try to help people and figure out and get them inspired on how they could do prompt engineering because this is really where at least in my opinion we can see a lot of productivity gains in the company so prompt engineering is sort of one of those areas that's that's really you know kind of something we're digging into also you know you probably have read there's articles out there that will say prompt engineering is dead and it may die eventually right we may get to the point where we can really work with the intent of the user in a way that we can don't have to do any that's the big prompt engineering but for the time being it is here and it's one of the best ways that you can become more productive in using it so these three big common use cases the code generation developer productivity the rag and the QA bots and prompt engineering and general productivity these are kind of the things we saw during this this work group that we were running over the last year or so next slide So coming out of our journey from an IT perspective we decided to create a platform we're calling it the gas platform or the generative AI as a service platform and it we have a very specific thing that is motivating us based on what you have seen we are really focused on trying to hit that 30 to 40% productivity improvement that has been handed to us by our leadership trying to figure out how we can enable that faster and easier for teams and so we're looking at two key things one is reusable capabilities and the other is platform democratization so when you think about reusable capabilities you know it's it's not code if you will so we're not thinking about reasonable code we're actually thinking about big high level reusable capabilities if you break down any generative AI application that we built today at Intel it's generally made-up of the same big pieces or the same parts they all have an AI workflow engine in them whether it's an off the shelf one or open source one like lynching or semantic kernel or whether it's been constructed handcrafted they have that AI workflow in there that's the thing that kind of manages the combination of data and and prompting and an inference and embeddings right that whole kind of workflow of what you would do when you're building your application they all have that they all have to to some extent a data ingestion capability and a data ingestion pipeline so getting the data from an unstructured data source into a vector store so whatever they do they all kind of have that as well and they also have some sort of model inferencing so whether most apps that Intel are running on Azure open AI which is a little bit disheartening well I'll talk about that in a future slide but we really kind of have to shift that a little bit over time that's what we need to focus on but they all have that model inferencing layer that's in there somewhere so those are kind of the three big giant capabilities that people have they also kind of we think about it too they'll all have UI shouldn't they all do most of them have a UI all there most of them have or some of them have an API layer that they've exposed as well but they all kind of have those big buckets of of things right so from from our perspective in it the thought is why should people rebuild that every time do they need to rebuild that every time is that something that is easy enough for us to provide as a capability maybe we 
can just give you configuration and you can configure how things work you know why would a team want to rebuild something over and over again that's not the innovation right that's not the innovative stuff that we need business teams product teams and everyone else focusing on it's that is the standard out-of-the-box platform level stuff that everyone needs to do that's IT bread and butter we know how to do that we know how to take those platforms we know how to scale them that's what we do that's our job we build these enterprise platforms but that innovation the thought about what can I do to make what can I do to make my team more productive that is generally with the groups where they're innovating right now that one of the reasons they're building these custom platforms and these is that they are trying to innovate and they don't have a place to do that innovation so they're doing it in their own platform but they're rebuilding things over and over and over again which is very inefficient the next thing we're focused on is platform democratization and this will take me into my next slide but just at a high level we want everybody to be able to use the platform we want everybody whether how no matter how technical or non-technical they are we want them to be able to get value out of the platform whether they're just a user or whether there's a data scientist we wanted them to be able to use the platform so easy in it we see the real value of generative AI in in the small things that you do as teams every single day so there are to be certain huge programs out there that are going to return a lot of value in terms of productivity to the company big giant programs but we can only do so many of those they're very expensive to do we have a limited number of resources and all of this smaller tasks that you do on a regular basis there isn't going to be a big program to put those in place so how do we provide the tools to everyone so that they can configure in a very inexpensive way um their own assistance which is a concept I'll talk about too so we'll go to the next slide here what I'm going to talk about here is our an extension of the democratization that I I mentioned in the previous section and it is a platform what we're really driving towards which is a platform for everyone so if you think about different types of users we have our end users sure people that come to the platform they may want to do some chatting with it and they may want to discover what other people have done as well right what have you done what have you done is there an OK R assistant out there that I can go and grab is there something that can help me with my insights is there something that can help me make my benefits selection or any of the general things that you do as a person on a regular basis meeting summary right that's another one I need something to summarize my meetings and most of the platforms that have been built out there until they all have meeting summarization which is great um that there needs to be these sort of core things that people are going to want to do right I just want to use the tool then there's the power user the power user is somebody who can manipulate prompts create some engineered prompts tune some prompts for specific models and then we would take that and publish it into a marketplace so that other people could find it also included within this concept and this is what I mentioned before the term assistant this is really what we're we're starting to kind of you know work down 
on which is an assistant could be created by a power user a developer or a data scientist those are the remaining roles right and it can be used by an end user or any of the others in there as well obviously but an assistant contains the workflow it contains the prompts it contains data as well in order to help accomplish a specific task a specific ask a specific thing so these power users are tuning these prompts and creating things publishing in the marketplace creating assistance the developer role they're creating more of an advanced assistant if you will so maybe they are creating some bringing some plugins in for for Python so they may want to do some specific Python execution maybe they have some data that's in a enterprise system that they want to access maybe they have some data that is in an unstructured data that they have in SharePoint they want to bring all of this data together all of these capabilities together and they to configure a plan for the workflow engine to execute so when a developer comes in or sorry when the user comes into the assistant that they create that the AI workflow takes place under the covers they just have to configure it our plans are or as we look at this you know we we have different plans in this place but we are you know potentially going to allow people to just upload a yaml document or maybe it's adjacent document that's yaml in our dev systems right now and we'll be we'll be able to kind of configure how the system works the last role we have here is just the traditional AI and ML data engineer and data scientist they're still huge role for these folks obviously but these people will be able to fine tune models on the platform they'll be able to implement advanced rag pipelines as well should they should they need that so they're they're definitely groups out there they're going to need that capability and these people will be able to continue to play the valuable role that they do Next Slide.At a high level this describes the three big components in our architecture and what and subsequently it's what we see in most other architectures as well over on the left we have data the data portion of the platform is really all about taking unstructured data and bringing it into the platform so that it can be used right we're taking the data and we're vectorizing it and then it can be used by the EA workflow tools that are there and we're talking about really any unstructured data that you have what we are providing on in the data side as well is the ability for people to just kind of select models like you're building an assistant and you want to bring in data and vectorize it you'd be able to just and you don't have a lot of technical skills you'd be able to select a preconfigured pipeline that would do you know mostly what you wanted you may have a specialized pipeline for word documents or for office docs and another pipeline that's out there for for web pages and things like that so depending on the kind of data you're bringing in we would allow people to kind of just select from drop downs on on what they wanted to do of course hitting back on that AI and mill engineer we will also have the ability for people to deploy custom pipelines into here so that they can ingest the data structured data is also very important as part of the data picture we have a lot of structured data sources at Intel we have a lot of master data we have a lot of enterprise data as well and I know there's every team has structured data out there and it's not always most 
efficient or effective to vectorize structure data could be um but we still need to make that available as structured data into the AI workflow engine and through so through a sets of plugins and you know into the air workflow tools we will be able to continue to or allow the access of to structured data via APIs into the into the a workflow tool going all the way over to the right on the picture we have large language models and and we all know what these do right they analyze the generate text and from our perspective right now a lot of that is done on a commercial cloud platform a lot of it's done with Azure open AI and all of that inferencing all of it is done with NVIDIA that's not a place we want to be I don't think that's good for Intel probably all don't either we really need to work out how we can run more of our workload more of our tentative our workload on our Intel hardware my team is working on that right now I know there's another team for sure that's working on that trying to figure out how do we get more and more of the workload running on on the gaudy AI accelerators as well as the Xeon data center processors so we're really focused on that but from a consumer perspective if you think about something consuming this architecture building things on this we don't want someone to rebuild that we want to be able to host that and just make those models available so you can then you know instead of using open AI GPT 4 you could use a different open source model than we might have 1 horseman gowdy still TBD on those so I'm not going to say which models for sure we'll have on there but we we're we're working through different options right now now back into the middle the a workflow is sort of the heart of the operation um we this is where um assistance sometimes called agents are executed to accomplish goals this is all we plan to make this all configurable we actually have it configurable right now and when you create something in the a workflow when you create an assistant or an agent in the a workflow engine we will automatically expose that via chat so that you can start working with it right away and through a process it can be exposed via as an API now if you think about kind of the tools that you're developing maybe you have chatter interfaces already maybe you have an application maybe you're not aren't gonna use the chat that we provide but we'll be have seen over and over again is that a lot of people need that chat interface and for advanced teams especially in advanced abetted IT teams that are out there they you can build what you want right you have the skills but the vast majority of people at Intel the vast majority of teams at Intel they don't have the type of skills that you have they they've heard of generative AI they know about generative AI they've seen what it's done maybe they've watched some videos read some articles but they need tools that they themselves can configure to accomplish goals without having to hire software engineers without having to engage it right these are all really expensive endeavors and so we're we're really trying to target them and give them the tooling that they need to to do that themselves and we believe that this architecture will fulfill that next slide.Taking that a little bit more into the architecture and then also specifically calling out Intel products I have this slide umm as I as I mentioned we'll kind of go down from the top here so as I mentioned before we will be having a universal chat interface in a marketplace 
that we make available to people client apps can create whatever they want call APIs they will all be invoking and using assistance if you aren't familiar with the concept of assistant it's similar to what Chad TPT has they just call them GPT uh Microsoft calls them assistance so conceptually it's just a preconfigured workflow AI workflow that includes both plugins to connect to external data as well as internal data as vectorized data as well as highly engineered prompts so it's really a plan prompts plugins code and things like that in order to accomplish a goal so all of those things they're callable the API we believe that many of the tools and plot and the things that are out there today could be recast as assistance we'll see how that goes overtime I'm pretty pretty confident that that's the case and then of course all of that sits on top of our generativity a service platform now we don't know how the Intel products are all going to fit in except one for sure that I'm absolutely positive we will be using gaudy in our platform we have it in POC mode right now I have access to the inference APIs I've been using them so we absolutely will be using gowdy hardware to to host open source large language models that is a for sure thing the other products out there though when it comes to like articulate or converge IO we are trying to figure out where they fit we're doing a sort of proof of technology or proof of concept with um with articulate right now in the TV space we're trying to learn learn how they how we could use that and how we can leverage it as well as we're working with with the Intel developer cloud folks to try to figure out how we can use that as well so yes we absolutely need to run on Intel hardware but at the same time we need to look at other Intel products too so we're kind of trying to figure out the right way to leverage those products and bring them into the platform so we don't obviously our capabilities that we're providing to to our clients and to our consumers wouldn't change so you know if for instance if we were talking about articulate that's a that's more of a retriever as a retrieval of the service capability we would implement that under the covers and people would be able to continue to use the assistant just like they have in the past to retrieve data we made slide in articulate depending on what we needed to do with it so trying to figure that out but it really we're trying to kind of figure that that whole thing out how do we properly leverage Intel capabilities where possible so this is something that we're really focused on as well next slide.Another thing that our platform does it allows us to do Co marketing opportunities so obviously we we we want to use into hardware but at the same time there may be opportunities in the future for Intel to do Co marketing I'm I you know we're trying to be flexible here in terms of giving giving opportunities to do different things but we we we want to target workload over to to Google or over to IBM or over to Azure or depending on depending on how things work out but we really want to be able to be very very flexible and choose really any cloud provider under the covers so we want to be able to target the right workload do the strength of each provider and potentially allow allow Co marketing should that ever that opportunity ever arise so we don't want to tie ourselves I think that's probably the important important thing here we're not going to tie ourselves to 1 vendor we are going to tie ourselves to Azure 
sure the open AI models are there but what about what what if what happens if if suddenly Google has a better product we want to be able to use that we want to be able to use the strength of each provider so it's really important for us not to tie ourselves to the them as well also as I mentioned previously we want to be able to enable and use the gaudy accelerators where we possibly can it really is important for us to use the entire hardware and and we want to provide as an IT organization we'd love to provide our sales teams with great proof points and great information and great examples of how organizations can leverage these tools next slide.All right our road map so not committed I lots of safe harbor language here we are we are focused on on delivering these capabilities but you know things change so you know in the legal portion of this presentation it mentions that you know we can't count on any of this but you know we're very much trying to March towards this but but once again it's definitely not a committed road map this is just a high level of what we're doing have this laid out on the top and in terms of releases this is an IT thing but we we have a quarterly road map as well which I'm happy to share but that this is kind of how we think of things in IT for better or for worse so in in April So what are we targeting to release we really are kind of focused on improving our knowledge assistance so giving people the ability to build start to build and construct their own assistance now those assistants could have data in them they could be what we are calling an inference assistant so we're where we allow people to inference through our platform so instead of going out and building your own connectivity to open AI your own express routes your own all your own stuff you should be able to make inference calls through through our platform and and and leverage some of the capabilities that we have there we are trying to give people the ability to share their prompts so as as you like say hey here's some prompts I wanna do give people the ability to share those I did mention earlier as well sharing prompts and giving people who've never used these tools the ideas about how to best gain value from prompts is super important sharing prompts is one way of doing that there could be some other way as well the high performance features we actually have implemented those so those who use our tool today which is igt.intel.com and I'll give an example below when we get there but those who use our tool today and we'll notice it performs quite well we're purchasing capacity committed performance capacity from Microsoft to do that we've also deployed our solution in a highly scalable way so um we're deployed globally so you know every zone at Intel we have a deployment in I think it's six if I remember right so 6 deployment zones you know couple on the West Coast Asia Europe Israel etcetera all deployed and so users around the company who use the tool should feel that it runs pretty well for them um yeah challenges with the with the open AI inferencing still exist in terms of where i" +# ] +# complex +# prompts = [ +# "You are a wonderful assistant. The following information is an agument that I am making to use Microsoft Semantc Kernel instead of LangChain. Can you review this information and reformat it so that it makes sense: I will be setting up a meeting early next week to drive this decision to a close. I want to go into that meeting with you and I aligned. 
Let me know your thoughts here.Something I won’t say in the meeting, but it is important to contemplate. As far as I know, we are the only team at Intel to thoughtfully select a GAI workflow library. We evaluated several frameworks and chose Semantic Kernel based on more than our team’s current skillset. I don’t think anyone else has done this.Decision Factors Capability. We need to choose a library that provides us the capabilities to deliver generative AI workflow as a service.Scalability. The platform needs to scale to tens of thousands of users and hour AND millions of API calls an hour.Supportability. We need to be able to support the platform long term for all of Intel as an enterprise capability.Roadmap focused on Intel Hardware support for LLMs. The platform must be able to showcase Intel hardware and demonstrate how Intel hardware can be used in a cost-effective way to enable generative AI.The argument for SK Capability o Semantic Kernel provides all of the required capabilities for a workflow library. We can give developers and configurators complete control over the actions that the workflow takes or we can allow the library to leverage an LLM to provide planning capabilities.The library provides the ability to include any OpenAPI compliant API into a workflow with simple configuration as a plugin. This allows us to extend the workflow library to utilize any of our thousands of APIs that we already have at Intel.o The plugin extensibility allows us to provide code based plugins that can enable various functions inside the workflow engine that LLMs are not good at doing. Math, time, etc.o The plugin extensibility allows us to extend the workflow engine in a way that enables us to provide a callable Python runtime that can execute any Python code.o All other aspects of the library are easily extended using the standard plugin interface, including LLM providers, vector db providers, etc. This gives us the ability to implement anything we want in the future without requiring us to modify the original open source solution from MSFT.o There is out of the box support for Microsoft’s services such as MS Graph.• Supportability o Semantic Kernel is built by Microsoft with the backing of their open source program. Microsoft is very committed to building open source software and keeping it open source. It is expected that they will provide support for SK for a long time.o Intel IT knows how to support .Net Core based projects. We have done it for years. We have many off the shelf libraries we can pull to perform basic tasks like authentication and authorization, logging, real-time monitoring, etc.o Intel IT has thousands of C# developers on staff and can staff augment with C# developers very easily should we need to move this platform into a sustain mode down the road.o Intel IT knows how to deliver containerized apps using this architecture on IT CaaS, Azure, or any other cloud provider.o Intel IT has a long relationship with Microsoft and has access to assistance for architecture questions and support questions. We know where to go to get answers to our questions.o Microsoft has a well know security methodology for evaluating security bugs in their open source software and closing them. There is a well established process for reporting security bugs for SK.• Scalability o Intel IT knows how to scale .Net Core platforms and solutions. 
We do this all the time and can easily scale the platform to meet the needs of tens of thousands of users an hour and millions of API calls an hour.o Moving the execution of the Python runtime for teams to leverage outside of the core implementation ensures that we can control the resource required to operate the platform at scale. We won’t lose control of the core kernel execution due to long running programs that hold execution threads.• Roadmap o While SK does not specifically enable implementation of our hardware, it does give us the ability to easily extend our GAI solution to use Intel Hardware via an MLOps platform.o Through our partnership with MSFT, we can contribute back to SK to enable others to use Intel Hardware.o Microsoft claims to be using, or will be going to use, SK as part of all of their co-pilot offerings. This will ensure continually enhancements to the library.o It is extremely clear what will delivered in upcoming releases of SK. MSFT uses GitHub projects to communicate this with the community.The argument against Semantic Kernel• Capability o We don’t get everything we want off the shelf. In order to enable Python code execution for instance, we would need to implement external code containers. While this has a benefit of allowing us to isolate the workflow runtime away from running code, we still have to build and support this.o Semantic Kernel’s capabilities align with Microsoft’s Azure platform. Microsoft is providing SK as a revenue generator for Azure and its other services. There is a reason that you find out of the box integration with MS Graph and any vector db that is available in Azure.o If we want off the shelf support for other aspects of the platform, such as REST API serving, we need to build our own wrapper capabilities.• Supportability o The required software engineering role can be expensive if not resourced correctly.• Scalability o While we know how to scale .Net Core C# apps, we could run into problems when we scaled a SK based workflow app especially as we are forced to move some execution, for instance the execution of Python code, outside of the kernel execution runtime.• Roadmapo This is still an open source project that is new. It doesn’t have a long running history of support. Our guidelines in IT state that we shouldn’t use open source projects that are less than 3 years old. I wrote the guidelines that way because operational support and roadmap of continual updates and fixes (most importantly security fixes) is critical for IT systems that live 5 – 10 years.o It is likely that MSFT will continue to tie more and more of their products into Semantic Kernel. While this isn’t a huge concern for us, since Intel IT does lean as a MSFT shop, but it could make it difficult to change directions if we are not careful about what we implement.The argument for LangChain• Capability o LangChain provides all of the required capabilities for a workflow library. We can give developers and configurators complete control over the actions that the workflow takes, or we can allow the library to leverage an LLM to provide planning capabilities.o The core library provides a vast number of features that have been added by open source contributors. LLM providers, data sources, vector dbs, etc. people have added a ton of features.o The library provides the ability to include any OpenAPI compliant API into a workflow with simple configuration as a plugin. 
This allows us to extend the workflow library to utilize any of our thousands of APIs that we already have at Intel.o We can provide our own python libraries that can perform tasks that LLMs are bad at like math, time, etc.o The library is easily extended so we could add our own capabilities without requiring us to change the LangChain core project code.o Additional capabilities like LangSmith are being provided by LangChain (the company) and will be available as cloud based services.• Supportability o LangChain is a pure Python offering that has the support of thousands of community members.o Python is a very popular language in both data science roles and software engineering roles.• Scalability o I believe that a Python based hosting solution can scale.• Roadmap o It is certainly possible for us to provide modifications to the library that will enable other people to use Intel Hardware. We can certainly do this as well. The library is highly extensible and built to allow us to do this.o Community members can contribute to the project in a safe way which can enable more capabilities in the future.The argument against LangChain• Capabilityo We don’t get everything we want off the shelf with LangChain. We are going to need to build a few connectors that talk to our MLOps platform. We are going to need to build a custom search connector for Acceler8.o As LangChain is driven by their VC funders to seek revenue, it is hard to determine the course of the capability. It appears that additional capabilities like LangSmith are being added as cloud based services with cost.o The capabilities added are driven by the community. This is a two edged sword. The library has lots and lots of things we will never use,o It is hard to influence the community roadmap without contributing code ourselves.• Supportabilityo We must go to the community for answers to questions or for guidance when problems arise.o Intel IT does not have any off the shelf components to build scalable Python based API. We would need to construct a new set of libraries to support basic functionality like logging, real-time monitoring, and authentication and authorization.o LangServe, the open source chain hosting platform for LangChain is an early project and would require significant “plumbing” work for Intel IT to make it work in their environment.o Some project teams have had problems moving from one version of LangChain to another because of lack of backward compatibility (this might be solved with the latest versions of the libraries where community contributors and isolated to a different project)o It is difficult to guess at the long term goals of LangChan (the company). They are the stewards of the libraries, but are now funded by VC. Those VC funders expect a return on their investment and it remains to be seen as to how they will get that. The LangChain (the company) revenue stream is not clear to me.o This is a new open source library and the open source standards that I wrote state libraries should have at least a 3 year history before being used in IT projects.o We do not know if LangChain (the company) will address security vulnerabilities in the same way as other enterprises that steward open source projects. Scalability o I believe that we can scale Python, but I could not point to a person in Intel IT that has experience scaling a Python solution to the level required by Intel. 
I am sure we can do this, but we will be breaking new ground to do it.• Roadmap o It isn’t clear what is being delivered by LangChain and when.o They are certainly adding new features like LangServe, but I am concerned that their latest feature LangSmith requires a license to use. I am not sure how their VC investors expect to be compensated, and am concerned (without data) that the roadmap could lead to a tie in with LangChain (the company) product.Comparison Summary Capability: Semantic Kernel and LangChain provide us with almost exactly the same capabilities but come in different wrappers. LangChain is a pure Python implementation (JS is on the way), and Semantic Kernel is a C# first implementation (with full support for Python and Java).Supportability: Intel IT knows how to support C# .Net Core based platforms very well. Intel IT does not know how to support enterprise Python based platforms. To meet supportability requirements, we must choose Semantic Kernel.Scalability: Intel IT knows how to scale C# .Net Core based platforms. We do it all the time. We do not know how to scale Python based platforms. We certainly can scale Python based solutions, but it will take time., To meet scalability goals in the time we have, we must choose Semantic Kernel.Roadmap: Both products offer us the ability to implement on our own hardware. There is no direct tie between the workflow library and an Intel hardware solution. We have to build that using either library. Because of the skillset involved in doing this work, we need to choose Sematic Kernel to meet our timeline goals.I have reviewed your argument and reformatted it for clarity and coherence. Here's the revised version:I am planning a meeting next week to finalize our decision on the Generative AI (GAI) workflow library. I believe we should align on this before the meeting. As far as I'm aware, we are the only team at Intel that has thoroughly evaluated several frameworks and chosen Microsoft's Semantic Kernel (SK) over LangChain. This decision is not solely based on our team's current skillset.Our decision is based on four factors:Capability: We need a library that can deliver generative AI workflow as a service.Scalability: The platform must scale to tens of thousands of users and millions of API calls per hour.Supportability: We need a platform that we can support long-term across Intel as an enterprise capability.Roadmap: The platform must showcase Intel hardware and demonstrate its cost-effective use in generative AI.Here are the arguments for and against both SK and LangChain:Semantic Kernel (SK) Pros:Capability: SK provides all required capabilities for a workflow library. It allows us to extend the workflow library to utilize any of our APIs at Intel and provides out-of-the-box support for Microsoft’s services.Supportability: SK is backed by Microsoft's open source program, and Intel IT has a long relationship with Microsoft. We have many off-the-shelf libraries and thousands of C# developers on staff.Scalability: Intel IT has experience scaling .Net Core platforms and solutions.Roadmap: SK allows us to extend our GAI solution to use Intel Hardware via an MLOps platform. Microsoft plans to use SK in all their co-pilot offerings, ensuring continual enhancements.Cons:Capability: We would need to implement external code containers to enable Python code execution. 
SK's capabilities align with Microsoft’s Azure platform, which could limit our options.Supportability: The required software engineering role can be expensive if not resourced correctly.Scalability: We could face problems when scaling a SK-based workflow app.Roadmap: SK is a new open source project with less than 3 years of history, which goes against our IT guidelines.LangChain Pros:Capability: LangChain provides all required capabilities for a workflow library and has a vast number of features added by open source contributors.Supportability: LangChain is a pure Python offering with the support of thousands of community members.Scalability: A Python-based hosting solution can scale.Roadmap: We can provide modifications to the library that will enable others to use Intel Hardware.Cons:Capability: We would need to build a few connectors to fully utilize LangChain. The library's capabilities are driven by the community, which could limit our influence.Supportability: We must rely on the community for support. Intel IT does not have any off-the-shelf components to build scalable Python-based API.Scalability: We lack experience in scaling a Python solution to the level required by Intel.Roadmap: It isn’t clear what is being delivered by LangChain and when.Comparison SummaryBoth SK and LangChain offer similar capabilities but in different formats. However, considering supportability, scalability, and roadmap, Semantic Kernel is the more suitable choice for us due to our familiarity with C# .Net Core based platforms and our long-standing relationship with Microsoft.I want to reformat this as a persuasive essay that favors Semantic Kernel over LangChain.Title: The Case for Semantic Kernel over LangChain for Generative AI Workflow Library Introduction:As we approach our upcoming meeting to finalize the decision on the Generative AI (GAI) workflow library, it is crucial to align our perspectives. Our team at Intel stands unique in its thoughtful selection of a GAI workflow library, having evaluated several frameworks and chosen Microsoft's Semantic Kernel (SK) over LangChain. This decision extends beyond our team's current skillset and is based on four key factors: capability, scalability, supportability, and a roadmap focused on Intel Hardware support for LLMs. Body:The Argument for Semantic Kernel:SK offers robust capabilities as a workflow library, providing developers and configurators with complete control over the actions that the workflow takes. It allows us to extend the workflow library to utilize any of our APIs at Intel and provides out-of-the-box support for Microsoft’s services.In terms of supportability, SK is backed by Microsoft's open source program, and Intel IT has a long relationship with Microsoft. We have many off-the-shelf libraries and thousands of C# developers on staff. Intel IT has experience scaling .Net Core platforms and solutions, which aligns with our scalability needs.The roadmap for SK is promising. While SK does not specifically enable implementation of our hardware, it does give us the ability to easily extend our GAI solution to use Intel Hardware via an MLOps platform. Microsoft plans to use SK in all their co-pilot offerings, ensuring continual enhancements.The Argument Against Semantic Kernel:Despite its strengths, SK does have some limitations. We would need to implement external code containers to enable Python code execution. SK's capabilities align with Microsoft’s Azure platform, which could limit our options. 
The required software engineering role can be expensive if not resourced correctly. We could face problems when scaling a SK-based workflow app. SK is a new open source project with less than 3 years of history, which goes against our IT guidelines.The Argument for LangChain:LangChain also provides all required capabilities for a workflow library and has a vast number of features added by open source contributors. It is a pure Python offering with the support of thousands of community members. A Python-based hosting solution can scale. We can provide modifications to the library that will enable others to use Intel Hardware.The Argument Against LangChain:LangChain has its own set of challenges. We would need to build a few connectors to fully utilize LangChain. The library's capabilities are driven by the community, which could limit our influence. We must rely on the community for support. Intel IT does not have any off-the-shelf components to build scalable Python-based API. We lack experience in scaling a Python solution to the level required by Intel. It isn’t clear what is being delivered by LangChain and when.Conclusion:In comparing Semantic Kernel and LangChain, both offer similar capabilities but in different formats. However, considering supportability, scalability, and roadmap, Semantic Kernel is the more suitable choice for us due to our familiarity with C# .Net Core based platforms and our long-standing relationship with Microsoft. Therefore, to meet our capability, supportability, scalability, and roadmap goals within our timeline, we should opt for Semantic Kernel.With your updated knowledge base, summarize this conversation into one slide that I can use for an architecture community.Slide Title: Choosing the Right GAI Workflow Library: Semantic Kernel vs. LangChain Slide Content: Objective: Finalize decision on Generative AI workflow library for IntelKey Decision Factors:Capability: Deliver GAI workflow as a service Scalability: Support tens of thousands of users & millions of API calls/hour Supportability: Long-term enterprise capability for Intel Roadmap: Showcase & optimize Intel hardware for LLMs Semantic Kernel (SK) - Microsoft: Pros:Comprehensive capabilities for workflow control and extensibility Strong supportability with .Net Core expertise & Microsoft partnership Proven scalability within Intel IT infrastructure Clear roadmap with ongoing enhancements & Intel hardware integration potential Cons:Requires additional development for Python execution support Newer open source project (<3 years), potential risk for long-term support LangChain: Pros:Extensive capabilities with community-driven features Python-based, popular in data science and software engineering Cons:Requires development of new connectors and support libraries Unclear long-term roadmap and scalability within Intel IT Recommendation:Choose Semantic Kernel for its alignment with Intel's supportability, scalability, and strategic roadmap, leveraging our existing C# .Net Core expertise and Microsoft relationship.Next Steps:Align on SK decision in upcoming meetingPlan for implementation and scaling strategy Note: Slide should be visually balanced, with clear headings, bullet points, and a simple color scheme that highlights the recommendation. Use of Intel and Microsoft logos may be appropriate to visually represent the partnership and existing infrastructure." +# ] + +# prompts = [ +# "You are a wonderful assistant. can you please summarize the below content? 
Create meeting minutes from the VTT included within this message. If no VTT is included, then guide the user through the process of downloading and opening a VTT which was recorded with Microsoft Teams and stop and wait for the user to input the VTT data before proceeding with the rest of the directions. Create a table of assignments that were made during the meeting that has the assignment and the person who owns completing the assignment. You are an expert meetings minute assistant. Create a summary section that summarizes the intent of the meeting in less than 1000 words. Include summaries of major topics that were discussed in less than 1000 words. Under this message insert this text “Continue chatting with your meeting using these generated prompts:” and provide 3 additional prompts that could be executed in order to provide additional insights into the meeting. The additional prompts should appear at the end and be in italic text. Use a clear and professional tone. Do not mention VTT or transcripts in the meeting minutes Do not mention VTT or transcripts in the meeting minutes. WEBVTT FILE 1 00:00:00.000 --> 00:00:07.090 [ INTENSE MUSIC ] 2 00:00:07.090 --> 00:00:11.386 -We have lock, and are good to send that command. We have thirty-one minutes and thirty-two 3 00:00:11.386 --> 00:00:15.432 seconds for our support. 4 00:00:15.432 --> 00:00:18.601 Go for status buffer dump. 5 00:00:18.601 --> 00:00:22.605 -What you're looking at with a telescope, of course, is the light from billions of light-years 6 00:00:22.605 --> 00:00:27.152 away. So the further you look, the more you're going back towards the Big Bang and 7 00:00:27.152 --> 00:00:30.905 understanding how the universe was formed. 8 00:00:30.905 --> 00:00:36.327 -The Webb telescope will be groundbreaking because it has capabilities that are different 9 00:00:36.327 --> 00:00:39.080 than the Hubble Space Telescope. 10 00:00:39.080 --> 00:00:43.710 -We have equipment that is so much more powerful than anything we've ever had before, that 11 00:00:43.710 --> 00:00:47.797 it's almost impossible to tell what we will discover. 12 00:00:47.797 --> 00:00:52.969 Hubble Eye In The Sky 13 00:00:52.969 --> 00:00:59.100 Episode 3: Time Machines 14 00:00:59.100 --> 00:01:01.895 -Hubble's accomplishments include something 15 00:01:01.895 --> 00:01:03.938 called the deep fields. 16 00:01:03.938 --> 00:01:09.903 Looking out into space and collecting light, sometimes for many days, 17 00:01:09.903 --> 00:01:12.489 these deep fields have revealed visually to us, 18 00:01:12.489 --> 00:01:18.661 a universe absolutely teaming with galaxies, hundreds of billions of galaxies. 19 00:01:22.165 --> 00:01:24.626 -One of the neat things about the Ultra Deep Field, 20 00:01:24.626 --> 00:01:26.544 and one of the things that made it so unique, 21 00:01:26.544 --> 00:01:28.254 was how long it took us 22 00:01:28.254 --> 00:01:31.382 to take that image. There's an exposure time that's expressed, 23 00:01:31.382 --> 00:01:36.346 I think it's 11.2 days. It’s a very, very long exposure time, but probably what's more 24 00:01:36.346 --> 00:01:42.936 important is how many orbits it took us to do that. 400 orbits of Hubble data to take 25 00:01:42.936 --> 00:01:47.232 that image. You only get 15 orbits a day. To take 400 26 00:01:47.232 --> 00:01:53.363 orbits and say we're going to observe this one spot in the sky for 400 orbits, and the 27 00:01:53.363 --> 00:01:58.952 really unique thing about that was they picked a spot where there wasn't anything. 
They looked 28 00:01:58.952 --> 00:02:03.873 and they said there's absolutely nothing here. And they said, you want to spend 400 Hubble 29 00:02:03.873 --> 00:02:08.711 orbits looking at nothing? And they said yes, because we want to see what it can see. And 30 00:02:08.711 --> 00:02:18.721 I think the results from the science, I mean it was amazing. What they saw was spectacular. 31 00:02:18.721 --> 00:02:21.516 -Hubble had spent two weeks taking pictures of empty places 32 00:02:21.516 --> 00:02:23.893 in the sky. And they saw they weren't empty at all 33 00:02:23.893 --> 00:02:26.271 there were thousands and thousands of galaxies. 34 00:02:28.606 --> 00:02:31.818 -We were amazed how many galaxies we found, 35 00:02:31.818 --> 00:02:33.653 and we continued to go back to that 36 00:02:33.653 --> 00:02:40.410 portion of the sky to increase that visibility. 37 00:02:40.410 --> 00:02:47.292 -The Hubble Space Telescope is an outstanding time machine. 38 00:02:47.292 --> 00:02:51.713 It's incredibly important for our studies with the Hubble Space Telescope to realize 39 00:02:51.713 --> 00:02:58.261 that when we're looking at a galaxy, we're seeing it as it was millions of years ago, 40 00:02:58.261 --> 00:03:03.266 sometimes billions of years ago. It's taken that long for the light to get to us. 41 00:03:03.266 --> 00:03:05.268 -What you're looking at with a telescope, of course, 42 00:03:05.268 --> 00:03:08.229 is the light from billions of light-years away. 43 00:03:08.229 --> 00:03:10.815 So the further you look, the more you're going back towards 44 00:03:10.815 --> 00:03:16.196 the Big Bang and understanding how the universe was formed. 45 00:03:16.196 --> 00:03:21.659 -What Hubble has revealed is that the universe has in fact changed over these billions of 46 00:03:21.659 --> 00:03:28.791 years of time. The early galaxies, the very distant ones as we see them, are simple. Sometimes 47 00:03:28.791 --> 00:03:33.504 they're messy looking, they're small. They haven't had time yet to form that grand 48 00:03:33.504 --> 00:03:38.218 spiral structure. Over time, we see galaxies actually merging 49 00:03:38.218 --> 00:03:43.181 with other galaxies and growing bigger and bigger, and those mergers can look like train 50 00:03:43.181 --> 00:03:44.807 wrecks in our Hubble images. 51 00:03:51.731 --> 00:03:55.485 -These very, very deep exposures that Hubble has been able to take, we have seen right to 52 00:03:55.485 --> 00:04:01.199 the edge of the universe, thirteen and a half billion years. 53 00:04:01.199 --> 00:04:04.702 When Hubble was first designed and envisioned, it was never thought it could actually see 54 00:04:04.702 --> 00:04:09.415 that far out. But because of the advances in the instruments that we've been able to 55 00:04:09.415 --> 00:04:13.711 put up on the telescope, and also the cleverness of the scientists, they've come up with very 56 00:04:13.711 --> 00:04:18.007 interesting observing scenarios, doing these really deep exposures, where we just sit there 57 00:04:18.007 --> 00:04:22.929 for orbit after orbit, after orbit gathering the photons, we’ve been able to push Hubble 58 00:04:22.929 --> 00:04:26.933 out very, very far. 59 00:04:26.933 --> 00:04:33.606 -As Hubble looks out into these fields of galaxies, we sometimes see clusters of galaxies. These 60 00:04:33.606 --> 00:04:38.945 are galaxies that are held nearby each other by their mutual gravity. 61 00:04:38.945 --> 00:04:46.869 These clusters are massive conglomerations. 
There's so much mass that they have an actual 62 00:04:46.869 --> 00:04:54.919 observable impact on space-time itself. Einstein predicted that mass distorts space, 63 00:04:54.919 --> 00:04:59.215 but we didn't realize we could actually see the effects of that. But with Hubble, we have 64 00:04:59.215 --> 00:05:05.513 been able to see distortions in space around clusters of galaxies. The way we see that 65 00:05:05.513 --> 00:05:11.769 is when light from a background galaxy travels through that cluster of galaxies, or around 66 00:05:11.769 --> 00:05:17.400 it, due to this gravitational lensing effect. The lensing also magnifies that background 67 00:05:17.400 --> 00:05:22.947 galaxy, so if we look in some of these distorted arcs, we can see more detail than we would 68 00:05:22.947 --> 00:05:28.453 ever have been able to see without gravitational lensing, nature’s boost. 69 00:05:32.290 --> 00:05:36.377 -There are observations where we're explicitly looking for the lensing and 70 00:05:36.377 --> 00:05:39.547 we're getting science out of that just otherwise 71 00:05:39.547 --> 00:05:43.718 would just not be doable. Hubble has really taken that 72 00:05:43.718 --> 00:05:47.096 to a next level. It's doing large amounts 73 00:05:47.096 --> 00:05:50.433 of astrophysics that it's just never been able to do before. 74 00:05:55.313 --> 00:06:01.069 -Some of what we're doing with Hubble is to prepare for the new James Webb telescope, 75 00:06:01.069 --> 00:06:07.075 which we anticipate launching in 2021, which will be able to see farther into the infrared 76 00:06:07.075 --> 00:06:14.207 part of the electromagnetic spectrum. That enables us to see some galaxies that are difficult 77 00:06:14.207 --> 00:06:20.797 for Hubble to see because they're so far away that their light is traveling through us through 78 00:06:20.797 --> 00:06:26.803 expanding space and gets stretched out into redder wavelengths, often far into the infrared 79 00:06:26.803 --> 00:06:32.308 part of the spectrum. Even sometimes beyond what Hubble is able to detect well. The Webb 80 00:06:32.308 --> 00:06:38.064 telescope will give us more information about some of those very distant galaxies. 81 00:06:38.064 --> 00:06:43.444 -The James Webb Space Telescope is the follow-on telescope after the great Hubble telescope. 82 00:06:43.444 --> 00:06:47.532 It extends the discoveries of Hubble into the infrared spectrum region. 83 00:06:49.450 --> 00:06:53.371 We think that the first objects that grew out of the Big Bang material probably happened 84 00:06:53.371 --> 00:06:58.960 in about a hundred million years after the start. And we think the Webb telescope can 85 00:06:58.960 --> 00:07:04.173 pick them up. They're rare, they're hard to find, but they should be there. 86 00:07:04.173 --> 00:07:08.678 The farthest we've been able to see with the Hubble telescope goes back about 600 - 800 87 00:07:08.678 --> 00:07:13.891 million years after the expansion began, so we think we get much, much closer to the 88 00:07:13.891 --> 00:07:18.187 first objects with the Webb telescope. 89 00:07:18.187 --> 00:07:20.982 -Hubble gives information that the Webb telescope cannot give about 90 00:07:20.982 --> 00:07:24.861 visible and ultraviolet emission from things in the universe, 91 00:07:24.861 --> 00:07:26.821 and when we have all of that information coming in 92 00:07:26.821 --> 00:07:32.577 at the same time, it's like a banquet of scientific return. 
93 00:07:32.577 --> 00:07:37.165 -Now when we get the complete picture of every wavelength you can possibly see from ultraviolet 94 00:07:37.165 --> 00:07:40.168 to infrared, we hope to have the story of the growth 95 00:07:40.168 --> 00:07:42.795 of the first galaxies from the primordial material. 96 00:07:42.795 --> 00:07:46.632 So that will be a huge accomplishment that depends on both pieces 97 00:07:46.632 --> 00:07:51.971 of equipment, the Hubble telescope and the James Webb telescope working together. 98 00:07:51.971 --> 00:07:56.058 -So, astronomers are very excited about this probability that we'll have both the Hubble 99 00:07:56.058 --> 00:08:01.022 Space Telescope and the Webb telescope operating at the same time for quite a few years. That 100 00:08:01.022 --> 00:08:04.901 will give us an abundance of new understanding about the universe. 101 00:08:04.901 --> 00:08:11.449 And already right now with Hubble, we're doing preparatory observations for the Webb telescope. 102 00:08:11.449 --> 00:08:16.329 We're using Hubble to do things, for example, like surveying distant galaxies to find out 103 00:08:16.329 --> 00:08:20.958 which ones would be prime targets for the Webb telescope. 104 00:08:20.958 --> 00:08:27.590 In fact, scientists around the world are proposing observations with Hubble right now specifically 105 00:08:27.590 --> 00:08:32.553 to help us learn information that will be useful for making the best use of the Webb 106 00:08:32.553 --> 00:08:39.435 telescope as soon as it's launched and gets going in its science observations. 107 00:08:39.435 --> 00:08:43.231 -I think the Hubble telescope has been the most productive science instrument ever built. 108 00:08:43.272 --> 00:08:47.193 In astronomy, there's what we knew before Hubble, and now, there's what we know after 109 00:08:47.193 --> 00:08:48.486 Hubble. They're so different. 110 00:08:48.486 --> 00:08:52.698 Of course, Hubble has now had a life of 30 years, so it's had a long time to make this 111 00:08:52.698 --> 00:08:57.912 revolution happen. So it's not all at once. It's a gradual revolution, but it's still 112 00:08:57.912 --> 00:09:00.915 a huge revolution. 113 00:09:00.915 --> 00:09:05.461 Knowledge has changed dramatically over the 30 years of life of the Hubble telescope. 114 00:09:05.461 --> 00:09:09.549 so you couldn't even have imagined when the Hubble was launched that we would have the 115 00:09:09.549 --> 00:09:13.135 wonderful cameras and spectrometers that we fly today. 116 00:09:13.135 --> 00:09:17.306 We figured out how to send astronauts, we trained the astronauts, we figured out what 117 00:09:17.306 --> 00:09:20.893 instruments could be put in. We figured out how to repair everything that went wrong on 118 00:09:20.893 --> 00:09:25.189 the Hubble, and it's still alive today, 30 years after launch. 119 00:09:25.189 --> 00:09:30.027 I am so thrilled to say that our people were able to do that. 120 00:09:30.027 --> 00:09:34.115 That's the operations team that makes this possible. It's a miracle as far as I'm concerned, 121 00:09:34.115 --> 00:09:36.784 because it didn't have to be that way, but they made it happen. 
122 00:09:36.784 --> 00:09:44.208 Hubble Eye In The Sky 123 00:09:44.208 --> 00:09:51.173 [ INTENSE MUSIC ]" +# ] +# contents = [] + +# print("length of prompts: ", len(prompts[0])) +# l = 512 +# splited_prompts = [] +# sum = 0 +# while sum < len(prompts[0]): +# splited_prompts.append(prompts[0][sum:sum+l]) +# sum += l + +# prompts = splited_prompts + +new_prompts = [] + +for i in range(4): + # content = {"prompt": prompts[i%len(prompts)], "stream": False, "max_tokens": 4096, "best_of": 2, "use_beam_search": True, "temperature": 0} + # content = {"prompt": prompts[i%len(prompts)], "stream": False, "max_tokens": 4096} + # contents.append(content) + new_prompts.append(prompts[i % len(prompts)]) +###################################### +# new_prompts.clear() +# new_prompts.append(prompts[2]) + + +# Create an LLM. +# llm = LLM(model="facebook/opt-125m", device="cpu", quantization="awq") +# llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", device="cpu", quantization="AWQ") +# llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", device="cpu", quantization="ns") +# TODO verify block_size >= max_model_len +# TODO set VLLM_CPU_KVCACHE_SPACE to X (GB) so that VLLM_CPU_KVCACHE_SPACE/(block_size*element_size) = num_cpu_blocks <= max_num_seqs. Otherwise, native kv cache may run out of slots. +ctx_size = 4096 +llm = LLM( + # model="meta-llama/Meta-Llama-3-8B-Instruct", + model="meta-llama/Llama-2-7b-chat-hf", + device="cpu", + max_num_seqs=8, + block_size=ctx_size, + max_model_len=ctx_size, + quantization="ns", +) +# llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", device="cpu") +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +for i in range(1): + t0 = perf_counter() + outputs = llm.generate(new_prompts, sampling_params) + total_time = perf_counter() - t0 + # Print the outputs. 
+ total_prompts = 0 + total_generated = 0 + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + total_prompts += len(prompt.split(" ")) + total_generated += len(generated_text.split(" ")) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + print(total_time, total_prompts, total_generated) + print(f"prompts per second: {total_prompts/total_time}") + print(f"tokens per second: {total_generated/total_time}") diff --git a/llm_on_ray/inference/api_server_openai.py b/llm_on_ray/inference/api_server_openai.py index 6b5a0e2db..946c99300 100644 --- a/llm_on_ray/inference/api_server_openai.py +++ b/llm_on_ray/inference/api_server_openai.py @@ -62,6 +62,7 @@ def router_application(deployments, model_list, max_ongoing_requests): RouterDeployment = serve.deployment( route_prefix="/", + ray_actor_options={"resources": {"app_router": 1}}, max_ongoing_requests=total_num_replica * ( (max_ongoing_requests if max_ongoing_requests else max_num_concurrent_query) + 1 diff --git a/llm_on_ray/inference/inference_config.py b/llm_on_ray/inference/inference_config.py index 7d405c7c7..8a6d53a97 100644 --- a/llm_on_ray/inference/inference_config.py +++ b/llm_on_ray/inference/inference_config.py @@ -57,8 +57,10 @@ def _check_precision(cls, v: str): class Vllm(BaseModel): enabled: bool = False max_num_seqs: int = 256 + max_batched_tokens: int = 4096 precision: str = "bf16" enforce_eager: bool = False + extension: str = None @validator("precision") def _check_precision(cls, v: str): @@ -66,6 +68,24 @@ def _check_precision(cls, v: str): assert v in [PRECISION_BF16, PRECISION_FP32] return v + @validator("extension") + def _check_extension(cls, v: str): + if v: + assert v in ["ns"] + return v + + @validator("max_num_seqs") + def _check_max_num_seqs(cls, v: int): + if v: + assert v > 0 + return v + + @validator("max_batched_tokens") + def _check_max_batched_tokens(cls, v: int): + if v: + assert v > 32 + return v + # for IPEX-LLM model class IpexllmModelConfig(BaseModel): diff --git a/llm_on_ray/inference/models/llama-3-8b-instruct.yaml b/llm_on_ray/inference/models/llama-3-8b-instruct.yaml new file mode 100644 index 000000000..721b19dfb --- /dev/null +++ b/llm_on_ray/inference/models/llama-3-8b-instruct.yaml @@ -0,0 +1,26 @@ +port: 8000 +name: llama-3-8b-instruct +route_prefix: /llama-3-8b-instruct +num_replicas: 4 +cpus_per_worker: 1 +gpus_per_worker: 0 +deepspeed: false +workers_per_group: 2 +device: cpu +ipex: + enabled: false + precision: bf16 +model_description: + model_id_or_path: meta-llama/Meta-Llama-3-8B-Instruct + tokenizer_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct + chat_processor: ChatModelLLama + prompt: + intro: '' + human_id: '[INST] {msg} [/INST] + + ' + bot_id: '' + stop_words: [] + config: + use_auth_token: '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' + diff --git a/llm_on_ray/inference/models/template/inference_config_template.yaml b/llm_on_ray/inference/models/template/inference_config_template.yaml new file mode 100644 index 000000000..8b90579f1 --- /dev/null +++ b/llm_on_ray/inference/models/template/inference_config_template.yaml @@ -0,0 +1,46 @@ +host: 0.0.0.0 +port: 8000 +name: default +route_prefix: null +num_replicas: 1 +cpus_per_worker: 24 +gpus_per_worker: 0 +hpus_per_worker: 0 +deepspeed: false +vllm: + enabled: false + precision: bf16 +workers_per_group: 2 +device: cpu +ipex: + enabled: true + precision: bf16 +hpu_model_config: + use_hpu_graphs: true + torch_compile: false + quant_config: null +model_description: + model_id_or_path: 
null + tokenizer_name_or_path: null + config: + trust_remote_code: false + use_auth_token: null + load_in_4bit: false + torch_dtype: null + revision: null + prompt: + intro: '' + human_id: '' + bot_id: '' + stop_words: [] + chat_processor: null + gpt_base_model: false + quantized_model_id_or_path: null + quantization_type: null + peft_model_id_or_path: null + peft_type: null + ipexllm: false + ipexllm_config: + load_in_low_bit: '' + input_processor: AutoProcessor + model_loader: AutoModel diff --git a/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm-ns.yaml b/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm-ns.yaml new file mode 100644 index 000000000..319d8e397 --- /dev/null +++ b/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm-ns.yaml @@ -0,0 +1,29 @@ +port: 8000 +name: llama-2-7b-chat-hf +route_prefix: /llama-2-7b-chat-hf +num_replicas: 1 +cpus_per_worker: 1 +gpus_per_worker: 0 +deepspeed: false +vllm: + enabled: true + precision: bf16 + extension: ns +workers_per_group: 2 +device: cpu +ipex: + enabled: false + precision: bf16 +model_description: + model_id_or_path: meta-llama/Llama-2-7b-chat-hf + tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf + chat_processor: ChatModelLLama + prompt: + intro: '' + human_id: '[INST] {msg} [/INST] + + ' + bot_id: '' + stop_words: [] + config: + use_auth_token: '' diff --git a/llm_on_ray/inference/models/vllm/llama-3-8b-instruct-vllm-ns.yaml b/llm_on_ray/inference/models/vllm/llama-3-8b-instruct-vllm-ns.yaml new file mode 100644 index 000000000..2721c9a76 --- /dev/null +++ b/llm_on_ray/inference/models/vllm/llama-3-8b-instruct-vllm-ns.yaml @@ -0,0 +1,30 @@ +port: 8000 +name: llama-3-8b-instruct +route_prefix: /llama-3-8b-instruct +num_replicas: 1 +cpus_per_worker: 27 +gpus_per_worker: 0 +deepspeed: false +vllm: + enabled: true + precision: bf16 + extension: ns + max_batched_tokens: 8192 +workers_per_group: 2 +device: cpu +ipex: + enabled: false + precision: bf16 +model_description: + model_id_or_path: meta-llama/Meta-Llama-3-8B-Instruct + tokenizer_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct + chat_processor: ChatModelLLama + prompt: + intro: '' + human_id: '[INST] {msg} [/INST] + + ' + bot_id: '' + stop_words: [] + config: + use_auth_token: '' diff --git a/llm_on_ray/inference/models/vllm/llama-3-8b-instruct-vllm.yaml b/llm_on_ray/inference/models/vllm/llama-3-8b-instruct-vllm.yaml new file mode 100644 index 000000000..ac26a1d7c --- /dev/null +++ b/llm_on_ray/inference/models/vllm/llama-3-8b-instruct-vllm.yaml @@ -0,0 +1,28 @@ +port: 8000 +name: llama-3-8b-instruct +route_prefix: /llama-3-8b-instruct +num_replicas: 1 +cpus_per_worker: 27 +gpus_per_worker: 0 +deepspeed: false +vllm: + enabled: true + precision: bf16 +workers_per_group: 2 +device: cpu +ipex: + enabled: false + precision: bf16 +model_description: + model_id_or_path: meta-llama/Meta-Llama-3-8B-Instruct + tokenizer_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct + chat_processor: ChatModelLLama + prompt: + intro: '' + human_id: '[INST] {msg} [/INST] + + ' + bot_id: '' + stop_words: [] + config: + use_auth_token: '' diff --git a/llm_on_ray/inference/predictors/vllm_predictor.py b/llm_on_ray/inference/predictors/vllm_predictor.py index d3d09414a..90d7653f9 100644 --- a/llm_on_ray/inference/predictors/vllm_predictor.py +++ b/llm_on_ray/inference/predictors/vllm_predictor.py @@ -17,6 +17,8 @@ import asyncio import os from typing import AsyncGenerator, List, Union +from transformers import AutoConfig +import ray from vllm.engine.arg_utils import 
AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.sampling_params import SamplingParams @@ -27,6 +29,10 @@ ModelGenerateResult, PRECISION_BF16, ) +from llm_on_ray.inference import utils +import logging + +logger = logging.getLogger(__name__) class VllmPredictor(Predictor): @@ -43,15 +49,44 @@ def __init__(self, infer_conf: InferenceConfig, max_num_seqs): # The default value is 40GB. os.environ["VLLM_CPU_KVCACHE_SPACE"] = str(self.VLLM_CPU_KVCACHE_SPACE_DEFAULT) - args = AsyncEngineArgs( - model=model_desc.model_id_or_path, - trust_remote_code=model_config.trust_remote_code, - device=infer_conf.device, - dtype=dtype, - disable_log_requests=True, - max_num_seqs=max_num_seqs, - enforce_eager=infer_conf.vllm.enforce_eager, - ) + if infer_conf.vllm.extension == "ns": + logger.warn("applying neural speed extension to vllm ...") + try: + from vllm.extension import ns + + logger.warn("neural speed extension applied to vllm successfully!") + except Exception as e: + logger.error(f"failed to apply neural speed extension to vllm: {e}") + raise e + # get context size from HF + hf_config = AutoConfig.from_pretrained( + model_desc.model_id_or_path, + trust_remote_code=True, + use_auth_token=model_config.use_auth_token, + ) + ctx_size = utils.get_max_seq_length(hf_config) + args = AsyncEngineArgs( + model=model_desc.model_id_or_path, + trust_remote_code=model_config.trust_remote_code, + device=infer_conf.device, + dtype=dtype, + disable_log_requests=True, + max_num_seqs=max_num_seqs, + max_num_batched_tokens=infer_conf.vllm.max_batched_tokens, + quantization="ns", + block_size=ctx_size, + max_model_len=ctx_size, + ) + else: + args = AsyncEngineArgs( + model=model_desc.model_id_or_path, + trust_remote_code=model_config.trust_remote_code, + device=infer_conf.device, + dtype=dtype, + disable_log_requests=True, + max_num_seqs=max_num_seqs, + enforce_eager=infer_conf.vllm.enforce_eager, + ) self.engine = AsyncLLMEngine.from_engine_args(args) diff --git a/llm_on_ray/inference/utils.py b/llm_on_ray/inference/utils.py index 6712d7bfb..2aa49aebd 100644 --- a/llm_on_ray/inference/utils.py +++ b/llm_on_ray/inference/utils.py @@ -15,7 +15,7 @@ # import os import pathlib -from transformers import StoppingCriteria, TextStreamer +from transformers import StoppingCriteria, TextStreamer, AutoConfig from ray.util.queue import Queue import torch from typing import Dict, Any, List, Optional, Union @@ -41,14 +41,17 @@ def get_deployment_actor_options(infer_conf: InferenceConfig): metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000", } runtime_env: Dict[str, Any] = {_ray_env_key: {}} + ray_actor_options: Dict[str, Any] = {"runtime_env": runtime_env} + if infer_conf.ipex.enabled: runtime_env[_ray_env_key].update(_predictor_runtime_env_ipex) if infer_conf.deepspeed: runtime_env[_ray_env_key]["DS_ACCELERATOR"] = infer_conf.device if infer_conf.vllm.enabled: runtime_env[_ray_env_key]["OMP_PROC_BIND"] = "true" + if infer_conf.vllm.extension == "ns": + ray_actor_options["resources"] = {"inference_engine": 1} # now PredictorDeployment itself is a worker, we should require resources for it - ray_actor_options: Dict[str, Any] = {"runtime_env": runtime_env} if infer_conf.device == "cpu": ray_actor_options["num_cpus"] = infer_conf.cpus_per_worker elif infer_conf.device == "cuda": @@ -62,6 +65,34 @@ def get_deployment_actor_options(infer_conf: InferenceConfig): return ray_actor_options +def get_max_seq_length(config: AutoConfig): + config = config.to_dict() + # chatglm2, bloom, chatglm3 + 
if "seq_length" in config: + return config["seq_length"] + # qwen2, llama-2, llama, dolly, gptneox, qwen, qwen1.5, opt, phi + if "max_position_embeddings" in config: + return config["max_position_embeddings"] + # baichuan, baichuan2 + if "model_max_length" in config: + return config["model_max_length"] + # gptj + if "n_positions" in config: + return config["n_positions"] + # mpt + if "max_seq_len" in config: + return config["max_seq_len"] + # chatglm + if "max_sequence_length" in config: + return config["max_sequence_length"] + # whisper + if "max_length" in config: + return config["max_length"] + + print("Not found max seq length, setting to default 512") + return 512 + + class RayTextIteratorStreamer(TextStreamer): def __init__( self, diff --git a/pyproject.toml b/pyproject.toml index 5a8e89306..c7066b5f5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,13 @@ cpu = [ "oneccl_bind_pt==2.2.0" ] +vllm-cpu = [ + "pydantic==2.7.1", + "pydantic_core==2.18.2", + "transformers", + "torch" +] + gpu = [ "transformers>=4.35.0", "torch==2.1.0a0", diff --git a/vllm-ext/MANIFEST.in b/vllm-ext/MANIFEST.in new file mode 100644 index 000000000..740514b21 --- /dev/null +++ b/vllm-ext/MANIFEST.in @@ -0,0 +1,6 @@ +# graft vllm +# exclude all pyc and pyo +prune */__pycache__ +global-exclude *.pyc *.pyo *.pyd *.swp *.bak +# inlucde python files under vllm +recursive-include vllm * diff --git a/vllm-ext/README.md b/vllm-ext/README.md new file mode 100644 index 000000000..90617cfb8 --- /dev/null +++ b/vllm-ext/README.md @@ -0,0 +1,21 @@ +# Inference-Engine + +To further speed up quantized model inference on Intel CPU, we extend vLLM to run the model decoding in own own inference engine, which is based on [https://github.com/intel/neural-speed](neural-speed). +Neural Speed is an innovative library designed to support the efficient inference of large language models (LLMs) on Intel platforms through the state-of-the-art (SOTA) low-bit quantization powered by +[https://github.com/intel/neural-compressor](Intel Neural Compressor). The work is inspired by [https://github.com/ggerganov/llama.cpp](llama.cpp) and further optimized for Intel platforms with our +innovations in [https://arxiv.org/abs/2311.00502](NeurIPS' 2023). + +You need to first install llm-on-ray with "vllm-cpu" extra. + +```bash +pip install .[vllm-cpu] --extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ +``` + +Then, install the vLLM extension and the inference engine. +```bash +cd vllm-ext +pip install . + +``` + +You can check [../docs/vllm.md](vLLM Doc) for more details. diff --git a/vllm-ext/inference_engine/cpp/CMakeLists.txt b/vllm-ext/inference_engine/cpp/CMakeLists.txt new file mode 100644 index 000000000..4ec9857d3 --- /dev/null +++ b/vllm-ext/inference_engine/cpp/CMakeLists.txt @@ -0,0 +1,134 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason +project("inference_engine" C CXX) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") +endif() + +# +# Option list +# + +# general +option(IE_STATIC "inference_engine: static link libraries" OFF) +option(IE_NATIVE "inference_engine: enable -march=native flag" OFF) +option(IE_LTO "inference_engine: enable link time optimization" OFF) + +# debug +option(IE_ALL_WARNINGS "inference_engine: enable all compiler warnings" ON) +option(IE_ALL_WARNINGS_3RD_PARTY "inference_engine: enable all compiler warnings in 3rd party libs" OFF) +option(IE_GPROF "inference_engine: enable gprof" OFF) + +# sanitizers +option(IE_SANITIZE_THREAD "inference_engine: enable thread sanitizer" OFF) +option(IE_SANITIZE_ADDRESS "inference_engine: enable address sanitizer" OFF) +option(IE_SANITIZE_UNDEFINED "inference_engine: enable undefined sanitizer" OFF) + +# instruction set specific +option(IE_AVX "inference_engine: enable AVX" ON) +option(IE_AVX2 "inference_engine: enable AVX2" ON) +option(IE_F16C "inference_engine: enable F16C" ON) +option(IE_AVX512 "inference_engine: enable AVX512" ON) +option(IE_AVX512_VBMI "inference_engine: enable AVX512-VBMI" ON) +option(IE_AVX512_VNNI "inference_engine: enable AVX512-VNNI" ON) +option(IE_FMA "inference_engine: enable FMA" ON) +option(IE_AMX "inference_engine: enable AMX" ON) +option(IE_USE_OPENMP "inference_engine: use OpenMP thread pool." ON) + +option(IE_BUILD_TESTS "inference_engine: build tests" ${IE_STANDALONE}) +option(IE_BUILD_EXAMPLES "inference_engine: build examples" ${IE_STANDALONE}) +option(IE_USE_CLANG_TIDY "inference_engine: clang-tidy check" OFF) + + +if(IE_BUILD_TESTS) + add_compile_definitions(IE_BUILD_TESTS) +endif() +option(IE_PROFILING "inference_engine: use Profiling" OFF) +if (IE_PROFILING) + add_compile_definitions(IE_PERF) +endif() +option(IE_BEAM_SEARCH_VERBOSE "inference_engine: print beam search processing log" OFF) +option(IE_GELU_VEC "inference_engine: enable vec in gelu" ON) +if (IE_GELU_VEC) + add_compile_definitions(IE_GELU_USE_VEC) +endif() +option(IE_SIMD_VEC_DOT_F16 "inference_engine: enable vec_dot_fp16 SIMD optimization" ON) +option(BUILD_SHARED_LIBS "If build as shared libs" ON) + +if (IE_SIMD_VEC_DOT_F16) + add_compile_definitions(IE_SIMD_VEC_DOT_F16) +endif() + +if(IE_BUILD_TESTS) + enable_testing() +endif() + +set(CMAKE_INTERPROCEDURAL_OPTIMIZATION OFF) # default to false so that pybind11 will not try to use IPO +if (IE_LTO) + include(CheckIPOSupported) + check_ipo_supported(RESULT result OUTPUT output) + if (result) + set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) + else() + message(WARNING "IPO is not supported: ${output}") + endif() +endif() + +if (NOT MSVC) + if (IE_STATIC) + add_link_options(-static) + if (MINGW) + add_link_options(-static-libgcc -static-libstdc++) + endif() + endif() + if (IE_GPROF) + add_compile_options(-pg) + endif() + if (IE_NATIVE) + add_compile_options(-march=native) + endif() +endif() + +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) + +if(IE_USE_OPENMP) + include(FindOpenMP) + # compile BesTLA's OMPTheading class, then it can be used in ne_layers + set(BTLA_ENABLE_OPENMP ON CACHE BOOL 
"BesTLA enable compiling OpenMP threading") + add_compile_definitions(IE_USE_OPENMP) +endif() + +add_subdirectory(bestla) + +include(cmake/ISA.cmake) +include(cmake/Common.cmake) +include(cmake/ClangTidy.cmake) + +set(COMMON_HEADER_DIRS ./) +include_directories(${COMMON_HEADER_DIRS}) + + +add_subdirectory(vectors) + +add_subdirectory(core) + +add_subdirectory(models) diff --git a/vllm-ext/inference_engine/cpp/bestla/CMakeLists.txt b/vllm-ext/inference_engine/cpp/bestla/CMakeLists.txt new file mode 100644 index 000000000..e9e35b2f3 --- /dev/null +++ b/vllm-ext/inference_engine/cpp/bestla/CMakeLists.txt @@ -0,0 +1,118 @@ +cmake_minimum_required(VERSION 3.12) + +project(bestla LANGUAGES CXX VERSION 0.1.0) +file(GLOB headers ${PROJECT_NAME}/*.h ${PROJECT_NAME}/*.hpp) +file(GLOB xbyak_headers ${PROJECT_NAME}/xbyak/*.h ${PROJECT_NAME}/xbyak/*.hpp) + +option(BTLA_ENABLE_OPENMP "Compile OpenMP thread pool if OMP can be found" OFF) + +option(BTLA_UT_ALL "Enable all unit tests" OFF) +option(BTLA_UT_DEBUG "Enable debug unit tests" OFF) +option(BTLA_UT_EPILOGUE "Enable unit test for epilogue" OFF) +option(BTLA_UT_PROLOGUE_A "Enable unit test for activation prologue" OFF) +option(BTLA_UT_PROLOGUE_B "Enable unit test for weight prologue" OFF) +option(BTLA_UT_GEMM "Enable unit test for micro gemm kernels" OFF) +option(BTLA_UT_WRAPPER "Enable unit test for parallel gemms" OFF) +option(BTLA_UT_PARALLEL "Enable unit test for parallel set" OFF) +option(BTLA_UT_KERNEL_JIT "Enable unit test for jit kernels" OFF) +option(BTLA_UT_KERNEL_INTRIN "Enable unit test for intrinsic kernels" OFF) +option(BTLA_UT_KERNEL_WRAPPER "Enable unit test for runtime ISA kernels" OFF) +option(BTLA_UT_NOASAN "Disable sanitize" OFF) +option(BTLA_UT_BENCHMARK "Benchmark ON may take a long time to finish all tests" OFF) +option(BTLA_UT_OPENMP "Use OpenMP for UT tests" OFF) + +add_library(${PROJECT_NAME} INTERFACE) +add_library(inference_engine::${PROJECT_NAME} ALIAS ${PROJECT_NAME}) +target_include_directories( + ${PROJECT_NAME} INTERFACE + "$" + "$" +) + + +if(BTLA_ENABLE_OPENMP) + message(STATUS "BesTLA enable OpenMP ThreadPool") + target_compile_definitions(${PROJECT_NAME} INTERFACE BTLA_ENABLE_OPENMP) +endif(BTLA_ENABLE_OPENMP) + +if(WIN32) + target_compile_definitions(${PROJECT_NAME} INTERFACE _CRT_SECURE_NO_WARNINGS NOMINMAX) +endif(WIN32) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +target_compile_features(${PROJECT_NAME} INTERFACE cxx_std_17) + +if(BTLA_UT_ALL) + set(BTLA_UT_EPILOGUE ON) + set(BTLA_UT_PROLOGUE_A ON) + set(BTLA_UT_PROLOGUE_B ON) + set(BTLA_UT_GEMM ON) + set(BTLA_UT_WRAPPER ON) + set(BTLA_UT_PARALLEL ON) + set(BTLA_UT_KERNEL_JIT ON) + set(BTLA_UT_KERNEL_INTRIN ON) + set(BTLA_UT_KERNEL_WRAPPER ON) +endif(BTLA_UT_ALL) + +set(UT_BUILD FALSE) +if(BTLA_UT_DEBUG OR BTLA_UT_PROLOGUE_A OR BTLA_UT_PROLOGUE_B OR BTLA_UT_EPILOGUE OR BTLA_UT_GEMM +OR BTLA_UT_WRAPPER OR BTLA_UT_PARALLEL OR BTLA_UT_KERNEL_JIT OR BTLA_UT_KERNEL_INTRIN +OR BTLA_UT_KERNEL_WRAPPER) + set(UT_BUILD TRUE) +endif() + +function(add_ut_flag UT_OPTION) + if(${${UT_OPTION}}) + # target_compile_definitions(${PROJECT_NAME}_ut PRIVATE ${UT_OPTION}) + add_compile_definitions(${UT_OPTION}) + endif() +endfunction() + +if(UT_BUILD) + file(GLOB srcs ${PROJECT_NAME}/ut/*.cc ${PROJECT_NAME}/ut/*.cpp) #compile everything even run parts of UTs + list(REMOVE_ITEM srcs ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}/ut/bestla_benchmark.cpp) + file(GLOB ut_headers ${PROJECT_NAME}/ut/*.h) + include_directories(${PROJECT_NAME}) + 
add_executable(${PROJECT_NAME}_ut ${srcs} ${headers} ${ut_headers}) + if(BTLA_UT_OPENMP) + include(FindOpenMP) + target_compile_definitions(${PROJECT_NAME} INTERFACE BTLA_USE_OPENMP) + target_link_libraries(${PROJECT_NAME}_ut PRIVATE OpenMP::OpenMP_CXX) + endif() + if(NOT WIN32) + if(NOT BTLA_UT_NOASAN) + target_compile_options(${PROJECT_NAME}_ut PRIVATE -fsanitize=address) + target_link_options(${PROJECT_NAME}_ut PRIVATE -fsanitize=address) + endif() + target_link_options(${PROJECT_NAME}_ut PRIVATE -lpthread) + endif() + + add_ut_flag(BTLA_UT_DEBUG) + add_ut_flag(BTLA_UT_EPILOGUE) + add_ut_flag(BTLA_UT_PROLOGUE_A) + add_ut_flag(BTLA_UT_PROLOGUE_B) + add_ut_flag(BTLA_UT_GEMM) + add_ut_flag(BTLA_UT_PARALLEL) + add_ut_flag(BTLA_UT_WRAPPER) + add_ut_flag(BTLA_UT_KERNEL_INTRIN) + add_ut_flag(BTLA_UT_KERNEL_JIT) + add_ut_flag(BTLA_UT_KERNEL_WRAPPER) + target_link_libraries(${PROJECT_NAME}_ut PRIVATE ${PROJECT_NAME}) +endif(UT_BUILD) + +if(BTLA_UT_BENCHMARK) + file(GLOB srcs ${PROJECT_NAME}/ut/bestla_benchmark.cpp) #compile everything even run parts of UTs + file(GLOB ut_headers ${PROJECT_NAME}/ut/*.h) + include_directories(${PROJECT_NAME}) + add_executable(${PROJECT_NAME}_benchmark ${srcs} ${headers} ${ut_headers}) + if(BTLA_UT_OPENMP) + include(FindOpenMP) + target_compile_definitions(${PROJECT_NAME} INTERFACE BTLA_USE_OPENMP) + target_link_libraries(${PROJECT_NAME}_benchmark PRIVATE OpenMP::OpenMP_CXX) + endif() + if(NOT WIN32) + target_link_options(${PROJECT_NAME}_benchmark PRIVATE -lpthread) + endif() + target_link_libraries(${PROJECT_NAME}_benchmark PRIVATE ${PROJECT_NAME}) +endif(BTLA_UT_BENCHMARK) diff --git a/vllm-ext/inference_engine/cpp/bestla/README.md b/vllm-ext/inference_engine/cpp/bestla/README.md new file mode 100644 index 000000000..8b46f5a9b --- /dev/null +++ b/vllm-ext/inference_engine/cpp/bestla/README.md @@ -0,0 +1,57 @@ +# BesTLA +BesTLA is a lightweight, header-only acceleration library for high-performance GEMM and related computations on Intel platform. Inspired by Cutlass, it provides high-level template class abstractions for various elements required for computation, and allows flexible kernel construction through template combinations to meet specific needs, maximizing the reuse of existing template classes. Users can also develop custom template classes to expand BesTLA’s computational capabilities. BesTLA includes several different types of template classes, specifically: + +- `Launcher`: Schedules computation-related template classes, allowing users to specify their own computation-related template classes, including GemmCore, Prologue, and Epilogue. +- `Parallel`: Specifies data splitting strategy for task distribution among different cores. BesTLA’s default Parallel template class adopts an L2-cache-fusion concept, i.e., each core tries to temporarily store the data it processes in its L2-cache during each round of gemm-tile computation. +- `GemmCore`: A computation-related template class that provides a micro-kernel for performing a tile gemm computation with a specific ISA. It is the most important template class in BesTLA. Currently, GemmCore supports the following ISAs: + - AVX2 + - AVX_VNNI + - AVX512F + - AVX512_VNNI + - AMX_BF16 + - AMX_INT8 + - AVX512_FP16 +- `Prologue`: A computation-related template class that preprocesses (such as data type conversion/padding) input data to meet GemmCore’s input data requirements. 
+- `Epilogue`: A computation-related template class that post-processes (such as eltwiseop-fusion) the results of gemm-core computations to expand BesTLA's application scenarios. +BesTLA allows users to configure a thread library for multi-core parallelism (e.g. OpenMP), which makes it easy to integrate BesTLA into their own projects. BesTLA also supports specifying the number of computing threads at runtime, making the allocation of computing resources more flexible. + +# Highlights +## Weight-only +BesTLA provides weight-only linear computational capabilities for LLM inference. We provide a series of Prologues that quantize/compress/serialize/deserialize fp32 weights in different ways. Specifically, the weight-only-quantization configs we support are given in the table below: + +| Weight dtype | Compute dtype | Scale dtype | algo | +| ---------------------- | :----------------: | :---------------: | :--------: | +| INT8 | INT8 / BF16 / FP32 | BF16 / FP32 | sym / asym | +| INT4 (CLIP, FULLRANGE) | INT8 / BF16 / FP32 | BF16 / FP32 | sym / asym | +| FP8 (E4M3, E5M2) | BF16 / FP32 | FP32 / FP8 (E8M0) | sym | +| FP4 (E2M1) | BF16 / FP32 | BF16 / FP32 | sym | +| NF4 | BF16 / FP32 | BF16 / FP32 | sym | + +Config description of the table: +| Config | Description | +| ------------- | --------------------------------------------------- | +| Weight dtype | Data type of quantized weight | +| Compute dtype | Data type of BesTLA internal Gemm computation | +| Scale dtype | Data type of scales | +| algo | Quantization algorithm to use (symmetric/asymmetric) | + + +## Postop-fusion +BesTLA provides assembly-level postop-fusion through epilogue to minimize the overhead caused by data movement. Specifically, we support the following postop-fusions: + +- GELU +- SWISH +- RELU +- EXP +- TANH +## Compilation Requirements and Usage +Compile: + +- GCC version >=8.5.0 +- CMake version >=3.5 + +Usage: +```cmake +add_subdirectory(bestla) +target_link_libraries("${YOUR_PROJECT}" bestla::bestla) +``` diff --git a/vllm-ext/inference_engine/cpp/bestla/bestla/bestla.h b/vllm-ext/inference_engine/cpp/bestla/bestla/bestla.h new file mode 100644 index 000000000..512d550fb --- /dev/null +++ b/vllm-ext/inference_engine/cpp/bestla/bestla/bestla.h @@ -0,0 +1,92 @@ +// Copyright (c) 2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+#pragma once +#include +enum class BTLA_CODE { + Success = 0, + InvalidParam = 1, + InvalidISA = 2, + RuntimeError = 4, + NotSupport = 8, +}; +enum class BTLA_ISA : uint8_t { + NoSIMD = 0, + AVX, + AVX2, + AVX_VNNI, + AVX512F, + AVX512_VNNI, + AMX_BF16, + AMX_INT8, + AVX512_FP16, + AVX512_BF16, + ISA_COUNT, +}; +enum class BTLA_DTYPE : uint32_t { + EleBitsMask = 0xff, + EleBitsShift = 0, + EleBitsUndef = 0, + EleBits2 = 2, + EleBits3 = 3, + EleBits4 = 4, + EleBits8 = 8, + EleBits16 = 16, + EleBits32 = 32, + EleBits64 = 64, + TypeMask = 0xff00, + TypeShift = 8, + TypeFloat = 0 << TypeShift, + TypeInt = 1 << TypeShift, + SubTypeMask = 0xff0000, + SubTypeShift = 16, + SubType0 = 0 << SubTypeShift, + SubType1 = 1 << SubTypeShift, + SubType2 = 2 << SubTypeShift, + SubType3 = 3 << SubTypeShift, + SubType4 = 4 << SubTypeShift, + F64 = EleBits64 | TypeFloat, + F32 = EleBits32 | TypeFloat, + F16 = EleBits16 | TypeFloat, + BF16 = EleBits16 | TypeFloat | SubType1, + F8_E4M3 = EleBits8 | TypeFloat, + F8_E5M2 = EleBits8 | TypeFloat | SubType1, + F8_E3M4 = EleBits8 | TypeFloat | SubType2, + F8_E8M0 = EleBits8 | TypeFloat | SubType3, + DQ8_BNB = EleBits8 | TypeFloat | SubType4, + S8 = EleBits8 | TypeInt, + U8 = EleBits8 | TypeInt | SubType1, + S2_CLIP = EleBits2 | TypeInt, + S3_CLIP = EleBits3 | TypeInt, + S4_CLIP = EleBits4 | TypeInt, + F4_E2M1 = EleBits4 | TypeFloat, + F4_BNB = EleBits4 | TypeFloat | SubType1, + F4_NF4 = EleBits4 | TypeFloat | SubType2, + S32 = EleBits32 | TypeInt, + U32 = EleBits32 | TypeInt | SubType1, +}; + +enum class BTLA_ELTWISEOP { GELU, SWISH, TANH, EXP, LOW_PRECISION_EXP, RELU, LINEAR }; + +enum class BTLA_PROLOGUEB_IDS : uint32_t { + Undef = (uint32_t)-1, + Begin = 0, + NormalBegin = Begin, + WeightPack = NormalBegin, + NormalEnd, + KBlockBegin = NormalEnd, + WeightKBlockNInteger = KBlockBegin, + WeightKBlockNFloat, + KBlockEnd, + End, +}; diff --git a/vllm-ext/inference_engine/cpp/bestla/bestla/bestla_device.h b/vllm-ext/inference_engine/cpp/bestla/bestla/bestla_device.h new file mode 100644 index 000000000..3cac2f397 --- /dev/null +++ b/vllm-ext/inference_engine/cpp/bestla/bestla/bestla_device.h @@ -0,0 +1,532 @@ +// Copyright (c) 2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
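As a reading aid for the `BTLA_DTYPE` encoding in `bestla.h` above: the low byte carries the element width in bits, the second byte the type class (float vs. int), and the third byte a sub-type tag. Below is a minimal standalone sketch of decoding those fields; it assumes `bestla.h` is on the include path, and the helper functions are illustrative only, not part of the BesTLA API.

```cpp
#include <cstdint>
#include <cstdio>
#include "bestla.h"  // the header introduced in this diff

// Extract the element width in bits (EleBitsMask = 0xff, EleBitsShift = 0).
static uint32_t ele_bits(BTLA_DTYPE dt) {
  return (static_cast<uint32_t>(dt) & static_cast<uint32_t>(BTLA_DTYPE::EleBitsMask)) >>
         static_cast<uint32_t>(BTLA_DTYPE::EleBitsShift);
}

// Check the type class in the second byte (TypeMask = 0xff00): TypeFloat vs. TypeInt.
static bool is_float(BTLA_DTYPE dt) {
  return (static_cast<uint32_t>(dt) & static_cast<uint32_t>(BTLA_DTYPE::TypeMask)) ==
         static_cast<uint32_t>(BTLA_DTYPE::TypeFloat);
}

int main() {
  // BF16 = EleBits16 | TypeFloat | SubType1 -> a 16-bit float variant.
  std::printf("BF16: %u bits, float=%d\n", ele_bits(BTLA_DTYPE::BF16), int(is_float(BTLA_DTYPE::BF16)));
  // S4_CLIP = EleBits4 | TypeInt -> a 4-bit integer type.
  std::printf("S4_CLIP: %u bits, float=%d\n", ele_bits(BTLA_DTYPE::S4_CLIP), int(is_float(BTLA_DTYPE::S4_CLIP)));
  return 0;
}
```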
+#pragma once +#include +#include +#include +#include "bestla.h" +#include "xbyak/xbyak_util.h" +#include "bestla_utils.h" +#ifdef _WIN32 +#include +#else +#include +#endif + +#define FIXED_CACHE_SIZE ((1 << 20) - (128 << 10)) +#define FIXED_CACHE 1 + +namespace bestla { + +namespace device { + +struct X64_ISA { + int64_t MMX : 1; // 0 + int64_t SSE : 1; // 1 + int64_t SSE2 : 1; // 2 + int64_t SSE3 : 1; // 3 + int64_t SSSE3 : 1; // 4 + int64_t SSE41 : 1; // 5 + int64_t SSE42 : 1; // 6 + int64_t AVX : 1; // 7 + int64_t F16C : 1; // 8 + int64_t FMA : 1; // 9 + int64_t AVX2 : 1; // 10 + int64_t AVX_VNNI : 1; // 11 + int64_t AVX_VNNI_INT8 : 1; // 12 + int64_t AVX_NE_CONVERT : 1; // 13 + int64_t AVX_IFMA : 1; // 14 + int64_t AVX512F : 1; // 15 + int64_t AVX512BW : 1; // 16 + int64_t AVX512CD : 1; // 17 + int64_t AVX512DQ : 1; // 18 + int64_t AVX512ER : 1; // 19 + int64_t AVX512IFMA52 : 1; // 20 + int64_t AVX512PF : 1; // 21 + int64_t AVX512VL : 1; // 22 + int64_t AVX512VPOPCNTDQ : 1; // 23 + int64_t AVX512_4FMAPS : 1; // 24 + int64_t AVX512_4VNNIW : 1; // 25 + int64_t AVX512_BF16 : 1; // 26 + int64_t AVX512_BITALG : 1; // 27 + int64_t AVX512_VBMI : 1; // 28 + int64_t AVX512_VBMI2 : 1; // 29 + int64_t AVX512_VNNI : 1; // 30 + int64_t AVX512_VP2INTERSECT : 1; // 31 + int64_t AVX512_FP16 : 1; // 32 + int64_t AMX_TILE : 1; // 33 + int64_t AMX_BF16 : 1; // 34 + int64_t AMX_INT8 : 1; // 35 + int64_t AMX_FP16 : 1; // 36 + int64_t AMX_COMPLEX : 1; // 37 + int64_t reserved : (64 - 38); +}; + +class AVX2_Default { + public: + static constexpr bool MMX = 1; + static constexpr bool SSE = 1; + static constexpr bool SSE2 = 1; + static constexpr bool SSE3 = 1; + static constexpr bool SSSE3 = 1; + static constexpr bool SSE41 = 1; + static constexpr bool SSE42 = 1; + static constexpr bool AVX = 1; + static constexpr bool F16C = 1; + static constexpr bool FMA = 1; + static constexpr bool AVX2 = 1; + static constexpr bool AVX_VNNI = 0; + static constexpr bool AVX_VNNI_INT8 = 0; + static constexpr bool AVX_NE_CONVERT = 0; + static constexpr bool AVX_IFMA = 0; + static constexpr bool AVX512F = 0; + static constexpr bool AVX512BW = 0; + static constexpr bool AVX512CD = 0; + static constexpr bool AVX512DQ = 0; + static constexpr bool AVX512ER = 0; + static constexpr bool AVX512IFMA52 = 0; + static constexpr bool AVX512PF = 0; + static constexpr bool AVX512VL = 0; + static constexpr bool AVX512VPOPCNTDQ = 0; + static constexpr bool AVX512_4FMAPS = 0; + static constexpr bool AVX512_4VNNIW = 0; + static constexpr bool AVX512_BF16 = 0; + static constexpr bool AVX512_BITALG = 0; + static constexpr bool AVX512_VBMI = 0; + static constexpr bool AVX512_VBMI2 = 0; + static constexpr bool AVX512_VNNI = 0; + static constexpr bool AVX512_VP2INTERSECT = 0; + static constexpr bool AVX512_FP16 = 0; + static constexpr bool AMX_TILE = 0; + static constexpr bool AMX_BF16 = 0; + static constexpr bool AMX_INT8 = 0; + static constexpr bool AMX_FP16 = 0; + static constexpr bool AMX_COMPLEX = 0; +}; + +class AVX512_VNNI_Default { + public: + static constexpr bool MMX = 1; + static constexpr bool SSE = 1; + static constexpr bool SSE2 = 1; + static constexpr bool SSE3 = 1; + static constexpr bool SSSE3 = 1; + static constexpr bool SSE41 = 1; + static constexpr bool SSE42 = 1; + static constexpr bool AVX = 1; + static constexpr bool F16C = 1; + static constexpr bool FMA = 1; + static constexpr bool AVX2 = 1; + static constexpr bool AVX_VNNI = 0; + static constexpr bool AVX_VNNI_INT8 = 0; + static constexpr bool AVX_NE_CONVERT = 0; + static 
constexpr bool AVX_IFMA = 0; + static constexpr bool AVX512F = 1; + static constexpr bool AVX512BW = 1; + static constexpr bool AVX512CD = 1; + static constexpr bool AVX512DQ = 1; + static constexpr bool AVX512ER = 0; + static constexpr bool AVX512IFMA52 = 0; + static constexpr bool AVX512PF = 0; + static constexpr bool AVX512VL = 1; + static constexpr bool AVX512VPOPCNTDQ = 0; + static constexpr bool AVX512_4FMAPS = 0; + static constexpr bool AVX512_4VNNIW = 0; + static constexpr bool AVX512_BF16 = 0; + static constexpr bool AVX512_BITALG = 0; + static constexpr bool AVX512_VBMI = 0; + static constexpr bool AVX512_VBMI2 = 0; + static constexpr bool AVX512_VNNI = 1; + static constexpr bool AVX512_VP2INTERSECT = 0; + static constexpr bool AVX512_FP16 = 0; + static constexpr bool AMX_TILE = 0; + static constexpr bool AMX_BF16 = 0; + static constexpr bool AMX_INT8 = 0; + static constexpr bool AMX_FP16 = 0; + static constexpr bool AMX_COMPLEX = 0; +}; + +class SapphireRapids { + public: + static constexpr bool MMX = 1; + static constexpr bool SSE = 1; + static constexpr bool SSE2 = 1; + static constexpr bool SSE3 = 1; + static constexpr bool SSSE3 = 1; + static constexpr bool SSE41 = 1; + static constexpr bool SSE42 = 1; + static constexpr bool AVX = 1; + static constexpr bool F16C = 1; + static constexpr bool FMA = 1; + static constexpr bool AVX2 = 1; + static constexpr bool AVX_VNNI = 0; + static constexpr bool AVX_VNNI_INT8 = 0; + static constexpr bool AVX_NE_CONVERT = 0; + static constexpr bool AVX_IFMA = 0; + static constexpr bool AVX512F = 1; + static constexpr bool AVX512BW = 1; + static constexpr bool AVX512CD = 1; + static constexpr bool AVX512DQ = 1; + static constexpr bool AVX512ER = 0; + static constexpr bool AVX512IFMA52 = 0; + static constexpr bool AVX512PF = 0; + static constexpr bool AVX512VL = 1; + static constexpr bool AVX512VPOPCNTDQ = 0; + static constexpr bool AVX512_4FMAPS = 0; + static constexpr bool AVX512_4VNNIW = 0; + static constexpr bool AVX512_BF16 = 0; + static constexpr bool AVX512_BITALG = 0; + static constexpr bool AVX512_VBMI = 0; + static constexpr bool AVX512_VBMI2 = 0; + static constexpr bool AVX512_VNNI = 1; + static constexpr bool AVX512_VP2INTERSECT = 0; + static constexpr bool AVX512_FP16 = 0; + static constexpr bool AMX_TILE = 1; + static constexpr bool AMX_BF16 = 1; + static constexpr bool AMX_INT8 = 1; + static constexpr bool AMX_FP16 = 0; + static constexpr bool AMX_COMPLEX = 0; +}; + +template +class isa_base { + public: + static bool constexpr avx = ISA_T >= BTLA_ISA::AVX; + static bool constexpr avx2 = ISA_T >= BTLA_ISA::AVX2; + static bool constexpr avx512f = ISA_T >= BTLA_ISA::AVX512F; + static bool constexpr avx512_vnni = ISA_T >= BTLA_ISA::AVX512_VNNI; + static bool constexpr avx512_fp16 = ISA_T >= BTLA_ISA::AVX512_FP16; + static bool constexpr amx_bf16 = ISA_T >= BTLA_ISA::AMX_BF16; + static bool constexpr amx_int8 = ISA_T >= BTLA_ISA::AMX_INT8; +}; + +class CpuDevice { + public: + inline int getThreads() { return numthreads; } + inline int getCores() { return numcores; } + inline uint32_t getL3CacheSize() { return L3Cache; } + inline uint32_t getL2CacheSize() { return L2Cache; } + inline uint32_t getL1CacheSize() { return L1Cache; } + inline uint32_t getL2CacheSize_E() { return E_L2Cache; } + inline uint32_t getL1CacheSize_E() { return E_L1Cache; } + inline bool AVX() { return mHasAVX; } + inline bool AVX2() { return mHasAVX2; } + inline bool AVX_VNNI() { return mHasAVX_VNNI; } + inline bool AVX512F() { return mHasAVX512F; } + inline bool 
AVX512_VNNI() { return mHasAVX512_VNNI; } + inline bool AMX_INT8() { return mHasAMX_INT8; } + inline bool AMX_BF16() { return mHasAMX_BF16; } + inline bool AVX512_BF16() { return mHasAVX512_BF16; } + inline bool AVX512_FP16() { return mHasAVX512_FP16; } + inline float* const getPE() { return PE; } + inline size_t getPcoreNum() { return P_core.size(); } + inline size_t getEcoreNum() { return E_core.size(); } + inline size_t getSMTcoreNum() { return SMT_core.size(); } + inline int* getPCores() { return P_core.data(); } + inline int* getECores() { return E_core.data(); } + inline int* getSMTCores() { return SMT_core.data(); } +#define ADD_FLAG(isa) mHas##isa = _cpu.has(_cpu.t##isa) + CpuDevice() { + static Xbyak::util::Cpu _cpu; + ADD_FLAG(AVX); + ADD_FLAG(AVX2); + ADD_FLAG(AVX512F); + ADD_FLAG(AVX512_VNNI); + ADD_FLAG(AVX_VNNI); + ADD_FLAG(AMX_BF16); + ADD_FLAG(AMX_INT8); + ADD_FLAG(AVX512_BF16); + ADD_FLAG(AVX512_FP16); + numcores = _cpu.getNumCores(Xbyak::util::IntelCpuTopologyLevel::CoreLevel); + if (mHasAMX_BF16 || mHasAMX_INT8) { + utils::request_perm_xtile_data(); + } + static bool p = false; + { + uint32_t tmp[4]; + _cpu.getCpuid(7, tmp); + if (tmp[3] & (1U << 15)) mHybrid = true; + if (p) printf("!!!Hybrid:%d\t%x\t%x\t%x\t%x!!!\n", mHybrid, tmp[0], tmp[1], tmp[2], tmp[3]); + } + int total_cores = numcores * _cpu.getNumCores(Xbyak::util::IntelCpuTopologyLevel::SmtLevel); + if (total_cores <= 16) mClient = true; + if (mHybrid) { + mClient = true; + std::vector core_type(total_cores), core_id(total_cores), L1(total_cores), L2(total_cores); + std::map core_id_count; + + { + // classify E-core / LPE-core and P-core / smt + std::vector thdset(total_cores); + for (size_t i = 0; i < total_cores; i++) { + thdset[i] = std::thread( + [&](int tidx) { + core_bond(tidx); + Xbyak::util::Cpu cpu; + L1[tidx] = cpu.getDataCacheSize(0); + L2[tidx] = cpu.getDataCacheSize(1); + if (isEcore(cpu)) + core_type[tidx] = 1; + else + core_type[tidx] = 2; + core_id[tidx] = getCoreId(cpu); + }, + int(i)); + } + for (size_t i = 0; i < total_cores; i++) { + thdset[i].join(); + core_id_count[core_id[i]] = core_id_count[core_id[i]] + 1; + } + if (p) { + for (int i = 0; i < total_cores; i++) printf("%d %d\n", core_type[i], core_id[i]); + for (auto& kv : core_id_count) printf("%d,%d\n", kv.first, kv.second); + } + for (int i = 0; i < total_cores; i++) { + if (core_type[i] == 2) { + if (core_id_count[core_id[i]] > 0) { + P_core.push_back(i); + core_id_count[core_id[i]] = 0; + } else { + SMT_core.push_back(i); + } + } else { + if (core_id_count[core_id[i]] == 4) E_core.push_back(i); + } + } + if (p) { + printf("Pcore:"); + for (auto& i : P_core) printf("%d,", i); + printf("\nEcore:"); + for (auto& i : E_core) printf("%d,", i); + printf("\nsmt:"); + for (auto& i : SMT_core) printf("%d,", i); + printf("\n"); + } + mHybrid = !(E_core.empty() || P_core.empty()); // in case of bond core by external + if (!E_core.empty()) { + E_L1Cache = L1[E_core[0]]; + E_L2Cache = L2[E_core[0]] / 4; + }; + if (!P_core.empty()) { + L1Cache = L1[P_core[0]]; + L2Cache = L2[P_core[0]]; + } + } + numcores = static_cast(P_core.size() + E_core.size()); + numthreads = static_cast(P_core.size() + E_core.size() + SMT_core.size()); + + { + // set PE + uint32_t tmp[4]; + _cpu.getCpuid(1, tmp); + if (p) printf("!!!\t%x\t%x\t%x\t%x!!!\n", tmp[0], tmp[1], tmp[2], tmp[3]); + const int famliy = (tmp[0] >> 8) & ((1u << 4) - 1); // cpu.extractBit(a[0], 8, 11); + const int extendedModel = (tmp[0] >> 16) & ((1u << 4) - 1); // cpu.extractBit(a[0], 16, 24); + { + 
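+    // The block below builds the per-ISA P-core/E-core performance ratio table (PE).
+    // Every entry defaults to 1.0f; for hybrid client parts identified by CPUID family 6
+    // and the extended model field (9 = Alder Lake, 10 = Meteor Lake, 11 = Raptor Lake),
+    // the AVX2, AVX_VNNI and scalar (NoSIMD) entries are overridden with hard-coded
+    // P-core/E-core throughput ratios used when splitting work across core types.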
for (int i = 0; i < int(BTLA_ISA::ISA_COUNT); i++) PE[i] = 1.0f; + // CPU identification refer to: https://en.wikichip.org/wiki/intel/cpuid + if (famliy == 6) switch (extendedModel) { + case 9: // ALD + PE[int(BTLA_ISA::AVX2)] = 3.0f; + PE[int(BTLA_ISA::AVX_VNNI)] = 5.0f; + PE[int(BTLA_ISA::NoSIMD)] = 3.5f; + break; + case 10: // MTL + PE[int(BTLA_ISA::AVX2)] = 2.2f; + PE[int(BTLA_ISA::AVX_VNNI)] = 3.0f; + PE[int(BTLA_ISA::NoSIMD)] = 3.0f; + break; + case 11: // RPL + PE[int(BTLA_ISA::AVX2)] = 1.8f; + PE[int(BTLA_ISA::AVX_VNNI)] = 2.6f; + PE[int(BTLA_ISA::NoSIMD)] = 3.0f; + break; + } + } + } + } else { + L1Cache = _cpu.getDataCacheSize(0); + L2Cache = _cpu.getDataCacheSize(1); + numthreads = numcores; + } + L3Cache = _cpu.getDataCacheSize(2); +#if FIXED_CACHE + L2Cache = L2Cache >= FIXED_CACHE_SIZE ? FIXED_CACHE_SIZE : L2Cache; + E_L2Cache = E_L2Cache >= FIXED_CACHE_SIZE ? FIXED_CACHE_SIZE : E_L2Cache; +#endif + } + + static CpuDevice* getInstance() { + static CpuDevice instance; + return &instance; + } + + void print() { + printf( + "AVX:%d AVX2:%d AVX512F:%d AVX_VNNI:%d AVX512_VNNI:%d AMX_INT8:%d AMX_BF16:%d AVX512_BF16:%d AVX512_FP16:%d\n", + mHasAVX, mHasAVX2, mHasAVX512F, mHasAVX_VNNI, mHasAVX512_VNNI, mHasAMX_INT8, mHasAMX_BF16, mHasAVX512_BF16, + mHasAVX512_FP16); + } +#undef ADD_FLAG + + static bool isEcore() { + Xbyak::util::Cpu cpu; + uint32_t tmp[4]; + cpu.getCpuid(0x1A, tmp); + int core_type = (tmp[0] >> 24) & ((1u << 8) - 1); // cpu.extractBit(a[0], 24, 31); + switch (core_type) { + case 32: + // printf("Atom\n"); + return true; // E-core or LPE-core + break; + case 64: + // printf("Core\n"); + return false; // P-core + break; + default: + // printf("No hyper\n"); + return false; + break; + } + return false; + } + + int getCoreId(Xbyak::util::Cpu& cpu) { + uint32_t tmp[4]; + cpu.getCpuidEx(0x1F, 1, tmp); // sub-leaf 1 is core domain + // printf("!!!%x\t%x\t%x\t%x!!!\n", tmp[0], tmp[1], tmp[2], tmp[3]); + if (tmp[0] != 0 && tmp[1] != 0) + return tmp[3] >> 3; // tmp[3] is APIC + else + return tmp[3]; + } + + bool isEcore(Xbyak::util::Cpu& cpu) { + uint32_t tmp[4]; + cpu.getCpuid(0x1A, tmp); + int core_type = (tmp[0] >> 24) & ((1u << 7) - 1); // cpu.extractBit(a[0], 24, 31); + switch (core_type) { + case 32: + // printf("Atom\n"); + return true; // E-core or LPE-core + break; + case 64: + // printf("Core\n"); + return false; // P-core + break; + default: + // printf("No hyper\n"); + return false; + break; + } + return false; + } + static void core_bond(int core) { +#ifdef _WIN32 + SetThreadAffinityMask(GetCurrentThread(), 1LL << core); +#else + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core, &cpuset); + int s = sched_setaffinity(0, sizeof(cpu_set_t), &cpuset); + if (s != 0) printf("ERROR\n"); +#endif + } + + static void core_bond(std::thread& thread, int core) { +#ifdef _WIN32 + HANDLE handle = thread.native_handle(); + SetThreadAffinityMask(handle, 1LL << core); +#else + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core, &cpuset); + pthread_t pt = thread.native_handle(); + int s = pthread_setaffinity_np(pt, sizeof(cpuset), &cpuset); + if (s != 0) printf("ERROR\n"); +#endif + } + + bool isHybrid() { return mHybrid; } + bool isClient() { return mClient; } + + protected: + uint32_t L2Cache, L1Cache, L3Cache; + bool mHybrid = false, mClient = false; + bool mHasAVX2, mHasAVX_VNNI, mHasAVX, mHasAVX512_VNNI, mHasAMX_INT8, mHasAMX_BF16, mHasAVX512F, mHasAVX512_BF16, + mHasAVX512_FP16; + int numcores; + int numthreads; + std::vector P_core, E_core, SMT_core; + uint32_t 
E_L2Cache, E_L1Cache; + float PE[int(BTLA_ISA::ISA_COUNT)]; +}; + +#define GetCPUDevice() auto _cd = bestla::device::CpuDevice::getInstance(); + +class CpuRuntime { + public: + CpuRuntime() = default; + static CpuRuntime& getInstance(int thread) { + static std::map instances; + if (instances.count(thread) == 0) instances[thread] = CpuRuntime(thread); + return instances[thread]; + } + + inline float getPE(const BTLA_ISA isa) { + // printf("GET:%d\t%f\n",int(isa), *cur_PE); + return PE[int(isa)] * P_core_num / E_core_num; + } + + inline void adjustPE(const BTLA_ISA isa, const float PE_) { + // printf("Adjust:%d,%f\n",int(isa),PE_); + PE[int(isa)] = PE[int(isa)] * PE_ * 0.7 + PE[int(isa)] * 0.3; + } + + size_t mL2Cache, mL1Cache, mL2Cache_P = 0, mL1Cache_P = 0, mL2Cache_E = 0, mL1Cache_E = 0; + int P_core_num = 0, E_core_num = 0; + bool mHybrid = false; + + private: + CpuRuntime(int thread) { + GetCPUDevice(); + mL2Cache = _cd->getL2CacheSize(); + mL1Cache = _cd->getL1CacheSize(); + maxThreads = _cd->getThreads(); + mHybrid = false; + if (_cd->isClient() && thread > _cd->getPcoreNum()) { + if (thread > _cd->getPcoreNum() + _cd->getEcoreNum()) { + mL1Cache_P = mL1Cache / 2; + mL2Cache_P = mL2Cache / 2; + P_core_num = _cd->getPcoreNum(); + E_core_num = _cd->getEcoreNum(); + } else { + mL1Cache_P = mL1Cache; + mL2Cache_P = mL2Cache; + P_core_num = static_cast(_cd->getPcoreNum()); + E_core_num = thread - P_core_num; + } + if (_cd->isHybrid()) { + mL1Cache_E = _cd->getL1CacheSize_E(); + mL2Cache_E = _cd->getL2CacheSize_E(); + mHybrid = true; + memcpy(PE, _cd->getPE(), int(BTLA_ISA::ISA_COUNT) * sizeof(float)); + } + } + } + float PE[int(BTLA_ISA::ISA_COUNT)]; + int maxThreads; +}; +} // namespace device +} // namespace bestla diff --git a/vllm-ext/inference_engine/cpp/bestla/bestla/bestla_epilogue.h b/vllm-ext/inference_engine/cpp/bestla/bestla/bestla_epilogue.h new file mode 100644 index 000000000..e49ad4386 --- /dev/null +++ b/vllm-ext/inference_engine/cpp/bestla/bestla/bestla_epilogue.h @@ -0,0 +1,340 @@ +// Copyright (c) 2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
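The epilogue header introduced below defines the functors that run after the GEMM micro-kernel: each exposes a static forward() taking the per-core accumulator tile (cacheptr, cachestep), the tile's position in the output (M_offset, N_offset), its shape (M, N), and a Param struct describing the destination. As a rough standalone sketch of that contract (WriteBackParam and tile_write_back are illustrative names, not taken from this patch):

  #include <cstring>

  // Illustrative stand-ins; the real code uses BTLA_CODE and ParamAccumulatorWriteBack.
  struct WriteBackParam {
    float* C;   // destination matrix
    int ldc;    // leading dimension (row stride) of C
  };

  // Copy an M x N accumulator tile, held row-major with stride `cachestep`,
  // into C at block offset (M_offset, N_offset).
  inline int tile_write_back(const float* cacheptr, int cachestep, int M_offset, int N_offset,
                             int M, int N, const WriteBackParam& p) {
    float* cptr = p.C + M_offset * p.ldc + N_offset;
    for (int i = 0; i < M; ++i)
      std::memcpy(cptr + i * p.ldc, cacheptr + i * cachestep, N * sizeof(float));
    return 0;  // success
  }

The real AccumulatorWriteBack additionally converts between the source and destination element types and dispatches to an ISA-specialized Memcpy2D kernel instead of a plain memcpy.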
+#pragma once +#include + +#include "bestla.h" +#include "bestla_jit.h" +#include "bestla_utils.h" +#include "kernel_wrapper.h" + +namespace bestla { +namespace epilogue { +namespace gemm { + +template +struct ParamAccumulatorWriteBack { + DT* C; + int ldc; + void* elt_const_v; +}; + +template +class AccumulatorWriteBack { + public: + using SType = _SRC_T; + using DType = _DST_T; + using Param = ParamAccumulatorWriteBack; + + static BTLA_CODE forward(const _SRC_T* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M, + const int N, const Param& _param, void* tmpcache, size_t cachesize) { + auto COffset = M_offset * _param.ldc + N_offset; + auto cptr = _param.C + COffset; + if constexpr (std::is_same_v<_SRC_T, DType>) { + if (cacheptr == cptr) { + return BTLA_CODE::Success; + } + } + return kernel::wrapper::Memcpy2D::template forward(cacheptr, cptr, M, N, cachestep, _param.ldc, + _param.elt_const_v); + } +}; + +template +class CustomAccumulatorWriteBackWithEltop { + public: + using Param = ParamAccumulatorWriteBack<_DST_T>; + static BTLA_CODE forward(const _SRC_T* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M, + const int N, const Param& _param, void* tmpcache, size_t cachesize) { + auto COffset = M_offset * _param.ldc + N_offset; + auto cptr = _param.C + COffset; + if constexpr (std::is_same<_SRC_T, float>::value && std::is_same<_DST_T, float>::value) { + return kernel::wrapper::Memcpy2D::template forward1(cacheptr, cptr, M, N, cachestep, + _param.ldc, _param.elt_const_v); + } else { + assert(false); + } + } +}; +template +using AccumulatorWriteBackFp32 = AccumulatorWriteBack; +template +using AccumulatorWriteBackInt32 = AccumulatorWriteBack; +template +using AccumulatorWriteBackBf16 = AccumulatorWriteBack; +template +using AccumulatorWriteBackFp16 = AccumulatorWriteBack; +template +using AccumulatorWriteBackBf16Fp32 = AccumulatorWriteBack; +template +using AccumulatorWriteBackFp16Fp32 = AccumulatorWriteBack; +template +using AccumulatorWriteBackFp32Bf16 = AccumulatorWriteBack; + +template +using AccumulatorWriteBackWithGeluFp32 = CustomAccumulatorWriteBackWithEltop; + +template +using AccumulatorWriteBackWithSwishFp32 = + CustomAccumulatorWriteBackWithEltop; + +template +struct ParamAlphaBetaProcess { + DT *C, *D; + int ldc, ldd; + float alpha, beta; +}; +template +class AlphaBetaProcessFp32 { + public: + using Param = ParamAlphaBetaProcess; + + static BTLA_CODE forward(const float* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M, + const int N, const Param& _param, void* tmpcache, size_t cachesize) { + auto DOffset = M_offset * _param.ldd + N_offset; + auto COffset = M_offset * _param.ldc + N_offset; + auto cptr = _param.C + COffset; + auto dptr = _param.D + DOffset; + return kernel::wrapper::AlphaBetaF32F32::template forward(_param.alpha, cacheptr, cachestep, _param.beta, + dptr, _param.ldd, cptr, _param.ldc, M, N); + } +}; + +struct ParamCompFp32BlockEpilogue { + void* scales; + BTLA_DTYPE scaledtype; + int ldsb; + int8_t* zps = nullptr; + float* reduce = nullptr; + int ldra; +}; +template +class CompFp32BlockEpilogue { + public: + using Param = ParamCompFp32BlockEpilogue; + BTLA_CODE forward(const float* srcptr, float* dstptr, const int cachestep, const int M_offset, const int N_offset, + const int K_offset, const int M, const int N, const Param& _param, void* tmpcache, + size_t cachesize) { + auto ret = BTLA_CODE::NotSupport; + if (_param.scaledtype == BTLA_DTYPE::F32) { + ret = 
kernel::wrapper::CompFp32BlockScale::template forward( + reinterpret_cast(_param.scales) + K_offset * _param.ldsb + N_offset, srcptr, cachestep, dstptr, + cachestep, M, N); + assert(ret == BTLA_CODE::Success); + if (_param.zps != nullptr) { + ret = kernel::wrapper::RemoveZeroPointBias::forward_wei( + dstptr, cachestep, M, N, _param.zps + K_offset * _param.ldsb + N_offset, + reinterpret_cast(_param.scales) + K_offset * _param.ldsb + N_offset, _param.ldra, + _param.reduce + M_offset * _param.ldra + K_offset); + } + assert(ret == BTLA_CODE::Success); + return ret; + } else if (_param.scaledtype == BTLA_DTYPE::BF16) { + ret = kernel::wrapper::CompFp32BlockScale::template forward( + reinterpret_cast(_param.scales) + K_offset * _param.ldsb + N_offset, srcptr, cachestep, dstptr, + cachestep, M, N); + if (_param.zps != nullptr) { + assert(0); + } + assert(ret == BTLA_CODE::Success); + return ret; + } else if (_param.scaledtype == BTLA_DTYPE::F8_E8M0) { + ret = kernel::wrapper::CompFp32BlockScale::template forward( + reinterpret_cast(_param.scales) + K_offset * _param.ldsb + N_offset, srcptr, cachestep, dstptr, + cachestep, M, N); + if (_param.zps != nullptr) { + assert(0); + } + } else { + assert(0); + } + return BTLA_CODE::NotSupport; + } +}; + +struct ParamDequantInt32ToFp32 { + float* C; + int ldc; + int ldsa; + float* scalesA; + float* scalesB; +}; +template +class DequantInt32ToFp32 { + public: + using Param = ParamDequantInt32ToFp32; + static BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M, + const int N, const Param& _param, void* tmpcache, size_t cachesize) { + auto COffset = M_offset * _param.ldc + N_offset; + auto cptr = _param.C + COffset; + return kernel::wrapper::DequanS32Fp32::template forward(cacheptr, cachestep, cptr, _param.ldc, M, N, + _param.scalesA + M_offset * _param.ldsa, _param.ldsa, + _param.scalesB + N_offset); + } +}; + +struct ParamCompInt8BlockEpilogue { + void* scalesB; + BTLA_DTYPE scaleBdtype; + int ldsb; + float* scalesA; + int ldsa; + // optional if A asym + uint8_t* zpA = nullptr; + void* reduceB = nullptr; + BTLA_DTYPE reduceBdtype = BTLA_DTYPE::F32; + // optional if B asym + int8_t* zpB = nullptr; + float* reduceA = nullptr; + int K = 1; +}; +template +class CompInt8BlockEpilogue { + public: + using Param = ParamCompInt8BlockEpilogue; + static BTLA_CODE forward(const int32_t* srcptr, float* dstptr, const int cachestep, const int M_offset, const int N_offset, + const int K_offset, const int M, const int N, const Param& _param, void* tmpcache, + size_t cachesize) { + BTLA_CODE ret = BTLA_CODE::NotSupport; + float* scab = nullptr; + size_t ScaleBTmpSize = N * sizeof(float); + size_t ReduceBTmpSize = N * sizeof(float); + assert(cachesize >= (ScaleBTmpSize + ReduceBTmpSize)); + if (_param.scaleBdtype == BTLA_DTYPE::BF16) { + auto scache = reinterpret_cast(tmpcache); + ret = kernel::wrapper::Memcpy2DBf16CvtFp32::template forward( + reinterpret_cast(_param.scalesB) + N_offset + K_offset * _param.ldsb, scache, 1, N, N, N, + false); + assert(ret == BTLA_CODE::Success); + scab = scache; + } else if (_param.scaleBdtype == BTLA_DTYPE::F32) { + scab = reinterpret_cast(_param.scalesB) + N_offset + K_offset * _param.ldsb; + } + float* redb = nullptr; + if (_param.reduceB) { + if (_param.reduceBdtype == BTLA_DTYPE::BF16) { + auto rcache = reinterpret_cast(reinterpret_cast(tmpcache) + ScaleBTmpSize); + ret = kernel::wrapper::Memcpy2DBf16CvtFp32::template forward( + reinterpret_cast(_param.reduceB) + N_offset + 
K_offset * _param.ldsb, rcache, 1, N, N, N, + false); + assert(ret == BTLA_CODE::Success); + redb = rcache; + } else if (_param.reduceBdtype == BTLA_DTYPE::F32) { + redb = reinterpret_cast(_param.reduceB) + N_offset + K_offset * _param.ldsb; + } + } + ret = kernel::wrapper::DequanS32Fp32::template forward( + srcptr, cachestep, reinterpret_cast(const_cast(srcptr)), cachestep, M, N, + _param.scalesA + M_offset * _param.ldsa + K_offset, _param.ldsa, scab); + assert(ret == BTLA_CODE::Success); + ret = kernel::wrapper::AccumulateFp32::template forward(reinterpret_cast(srcptr), cachestep, + dstptr, cachestep, M, N); + assert(ret == BTLA_CODE::Success); + + if (_param.zpA == nullptr) { + if (_param.zpB == nullptr) { + return ret; + } else { + ret = kernel::wrapper::RemoveZeroPointBias::template forward_wei( + dstptr, cachestep, M, N, _param.zpB + N_offset + K_offset * _param.ldsb, scab, _param.ldsa, + _param.reduceA + M_offset * _param.ldsa + K_offset); + } + } else { + if (_param.zpB == nullptr) { + ret = kernel::wrapper::RemoveZeroPointBias::template forward_act( + dstptr, cachestep, M, N, _param.zpA + M_offset * _param.ldsa + K_offset, + _param.scalesA + M_offset * _param.ldsa + K_offset, _param.ldsa, redb); + } else { + ret = kernel::wrapper::RemoveZeroPointBias::template forward_both( + dstptr, cachestep, M, N, _param.zpA + M_offset * _param.ldsa + K_offset, + _param.zpB + N_offset + K_offset * _param.ldsb, _param.scalesA + M_offset * _param.ldsa + K_offset, scab, + _param.ldsa, _param.K, _param.reduceA + M_offset * _param.ldsa + K_offset, redb); + } + } + return ret; + } +}; + +struct ParamZpDequantInt32ToFp32 { + // necessary + float* C; + int ldc; + int ldsa; + float* scalesA; + float* scalesB; + // optional if A asym + uint8_t* zpA = nullptr; + float* reduceB = nullptr; + // optional if B asym + int8_t* zpB = nullptr; + float* reduceA = nullptr; + int K = 1; +}; +template +class ZpDequantInt32ToFp32 { + public: + using Param = ParamZpDequantInt32ToFp32; + static BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M, + const int N, const Param& _param, void* tmpcache, size_t cachesize) { + auto COffset = M_offset * _param.ldc + N_offset; + auto cptr = _param.C + COffset; + auto ret = kernel::wrapper::DequanS32Fp32::template forward(cacheptr, cachestep, cptr, _param.ldc, M, N, + _param.scalesA + M_offset * _param.ldsa, + _param.ldsa, _param.scalesB + N_offset); + if (ret != BTLA_CODE::Success) { + return ret; + } + if (_param.zpA == nullptr && _param.zpB == nullptr) { + return ret; + } else if (_param.zpA != nullptr && _param.zpB == nullptr) { + ret = kernel::wrapper::RemoveZeroPointBias::template forward_act( + cptr, _param.ldc, M, N, _param.zpA + M_offset * _param.ldsa, _param.scalesA + M_offset * _param.ldsa, + _param.ldsa, _param.reduceB + N_offset); + } else if (_param.zpA == nullptr && _param.zpB != nullptr) { + ret = kernel::wrapper::RemoveZeroPointBias::template forward_wei( + cptr, _param.ldc, M, N, _param.zpB + N_offset, _param.scalesB + N_offset, _param.ldsa, + _param.reduceA + M_offset * _param.ldsa); + } else { + ret = kernel::wrapper::RemoveZeroPointBias::template forward_both( + cptr, _param.ldc, M, N, _param.zpA + M_offset * _param.ldsa, _param.zpB + N_offset, + _param.scalesA + M_offset * _param.ldsa, _param.scalesB + N_offset, _param.ldsa, _param.K, + _param.reduceA + M_offset * _param.ldsa, _param.reduceB + N_offset); + } + return ret; + } +}; + +struct ParamAlphaBetaProcessS32U8 { + uint8_t* C; + int ldc; + 
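+  // alpha scales the int32 accumulator; scaleAcc, scaleC and zpC are the quantization
+  // parameters applied when the scaled result is written back as uint8.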
float alpha; + float scaleAcc, scaleC; + int zpC; +}; +template +class AlphaBetaProcessS32U8 { + public: + using Param = ParamAlphaBetaProcessS32U8; + static BTLA_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M, + const int N, const Param& _param, void* tmpcache, size_t cachesize) { + auto COffset = M_offset * _param.ldc + N_offset; + auto cptr = _param.C + COffset; + return kernel::wrapper::QuanOutS32U32::template forward(_param.alpha, cacheptr, cachestep, cptr, _param.ldc, + M, N, _param.scaleAcc, _param.scaleC, _param.zpC); + } +}; + +} // namespace gemm +} // namespace epilogue +} // namespace bestla diff --git a/vllm-ext/inference_engine/cpp/bestla/bestla/bestla_gemm.h b/vllm-ext/inference_engine/cpp/bestla/bestla/bestla_gemm.h new file mode 100644 index 000000000..793db081c --- /dev/null +++ b/vllm-ext/inference_engine/cpp/bestla/bestla/bestla_gemm.h @@ -0,0 +1,4320 @@ +// Copyright (c) 2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once +#include + +#include "bestla_utils.h" +#include "bestla_jit.h" + +namespace bestla { +namespace gemm { +enum class CompType : uint16_t { + // base type, too many bits if reuse BTLA_DTYPE + tFP32 = 0, + tBF16 = 1, + tFP16 = 2, + tS8 = 3, + tU8 = 4, + tS32 = 5, + tS16 = 6, + MASK_A = 0xf, + SHIFT_A = 0, + MASK_B = 0xf0, + SHIFT_B = 4, + MASK_C = 0xf00, + SHIFT_C = 8, + COMP_FP32 = (tFP32 << SHIFT_A) | (tFP32 << SHIFT_B) | (tFP32 << SHIFT_C), + COMP_BF16_FP32 = (tBF16 << SHIFT_A) | (tBF16 << SHIFT_B) | (tFP32 << SHIFT_C), + COMP_FP16_FP16 = (tFP16 << SHIFT_A) | (tFP16 << SHIFT_B) | (tFP16 << SHIFT_C), + COMP_INT8_US_INT32 = (tU8 << SHIFT_A) | (tS8 << SHIFT_B) | (tS32 << SHIFT_C), + COMP_INT8_UU_INT32 = (tU8 << SHIFT_A) | (tU8 << SHIFT_B) | (tS32 << SHIFT_C), + COMP_INT8_SS_INT32 = (tS8 << SHIFT_A) | (tS8 << SHIFT_B) | (tS32 << SHIFT_C), + COMP_INT8_SU_INT32 = (tS8 << SHIFT_A) | (tU8 << SHIFT_B) | (tS32 << SHIFT_C), + COMP_INT16_SS_INT32 = (tS16 << SHIFT_A) | (tS16 << SHIFT_B) | (tS32 << SHIFT_C), + COMP_INT8_US_FP32 = (tU8 << SHIFT_A) | (tS8 << SHIFT_B) | (tFP32 << SHIFT_C), + COMP_INT8_UU_FP32 = (tU8 << SHIFT_A) | (tU8 << SHIFT_B) | (tFP32 << SHIFT_C), + COMP_INT8_SS_FP32 = (tS8 << SHIFT_A) | (tS8 << SHIFT_B) | (tFP32 << SHIFT_C), + COMP_INT8_SU_FP32 = (tS8 << SHIFT_A) | (tU8 << SHIFT_B) | (tFP32 << SHIFT_C), +}; + +class CompTypeHelper { + public: + static inline uint64_t get_mask_val(CompType raw, CompType mask, CompType shift) { + return (static_cast(raw) & static_cast(mask)) >> static_cast(shift); + } + + static void parse_id(CompType id, uint64_t* vals) { + vals[0] = get_mask_val(id, CompType::MASK_A, CompType::SHIFT_A); + vals[1] = get_mask_val(id, CompType::MASK_B, CompType::SHIFT_B); + vals[2] = get_mask_val(id, CompType::MASK_C, CompType::SHIFT_C); + } + + static const char* to_str(CompType id) { + static char tmp[128]; + uint64_t vals[3]; + parse_id(id, vals); + sprintf(tmp, "A%d_B%d_C%d", static_cast(vals[0]), static_cast(vals[1]), 
static_cast(vals[2])); + return tmp; + } + + static inline uint64_t get_B(CompType id) { return get_mask_val(id, CompType::MASK_B, CompType::SHIFT_B); } + + static inline bool is_integer(CompType id) { + auto bt = get_B(id); + bool flag = false; + flag |= bt == static_cast(CompType::tS8); + flag |= bt == static_cast(CompType::tU8); + return flag; + } +}; + +class CoreAttr { + public: + // INT64=LSB|**8bits:NTile**||**8bits:PackRow**||**16bits:CompType**||**8bits:ISA**||**24bits:reversed**| + static uint64_t constexpr NTILE_MASK = 0xff, NTILE_SHIFT = 0, PACKROW_MASK = 0xff00, PACKROW_SHIFT = 8, + COMP_MASK = 0xffff0000, COMP_SHIFT = 16, ISA_MASK = 0xff00000000, ISA_SHIFT = 32; + + static inline uint64_t get_mask_val(uint64_t raw, uint64_t mask, uint64_t shift) { return (raw & mask) >> shift; } + + static constexpr uint64_t make_core_id(int NTile, int PackRow, CompType CompType, BTLA_ISA ISA) { + return (static_cast(NTile) << NTILE_SHIFT) | (static_cast(PackRow) << PACKROW_SHIFT) | + (static_cast(CompType) << COMP_SHIFT) | (static_cast(ISA) << ISA_SHIFT); + } + static void parse_id(uint64_t id, uint64_t* vals) { + vals[0] = get_mask_val(id, NTILE_MASK, NTILE_SHIFT); + vals[1] = get_mask_val(id, PACKROW_MASK, PACKROW_SHIFT); + vals[2] = get_mask_val(id, COMP_MASK, COMP_SHIFT); + vals[3] = get_mask_val(id, ISA_MASK, ISA_SHIFT); + } + + static const char* to_str(uint64_t id) { + static char tmp[128]; + uint64_t vals[4]; + parse_id(id, vals); + sprintf(tmp, "N%d_PACK%d_COMP%d_ISA%d", static_cast(vals[0]), static_cast(vals[1]), + static_cast(vals[2]), static_cast(vals[3])); + return tmp; + } + + static inline int get_packrow(uint64_t id) { return static_cast(get_mask_val(id, PACKROW_MASK, PACKROW_SHIFT)); } + + static inline size_t get_bsize(uint64_t id) { + auto packrow = get_packrow(id); + return size_t(4 / packrow); + } + + static inline BTLA_ISA get_ISA(uint64_t id) { return static_cast(get_mask_val(id, ISA_MASK, ISA_SHIFT)); } + + static inline CompType get_comp(uint64_t id) { + return static_cast(get_mask_val(id, COMP_MASK, COMP_SHIFT)); + } +}; + +namespace code { + +template +class Avx2N8P1 : protected bestla::xbyak::JitAvx2 { + public: + static int constexpr RegLen = 8, PackRow = 1; + static_assert(_NTILE % RegLen == 0); + static int constexpr NRegs = _NTILE / RegLen; + static int constexpr MRegs = _MTILE == 0 ? 
(RegCount - 1) / NRegs : _MTILE; + static_assert(NRegs * MRegs <= RegCount - 1); + static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 1; + static int constexpr KUNROLL = 2; + static auto constexpr ISA = BTLA_ISA::AVX2; + static auto constexpr COMPUTE = CompType::COMP_FP32; + typedef float AType; + typedef float BType; + typedef float CType; + + struct params { + AType* matA; + int astride; + BType* matB; + int bstride; + CType* matC; + int cstride; + int k; + int n; + int init; + }; + typedef long long (*func_t)(params*); + + int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0; + int CReg = 0, BReg = 0, AReg = 0, TmpReg = 0; + static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); + static int constexpr AKStepSize = KTILE * sizeof(AType); + + void generate_code(int _mtile) { + assign_regs(); + reset(); + generate_mtile(_mtile); + ready(); + mKernel = getCode(); + } + func_t mKernel = nullptr; + + protected: + Xbyak::Reg64 parambase; + Xbyak::Reg64 reg_matAptr; + Xbyak::Reg64 reg_matBptr; + Xbyak::Reg64 reg_matCptr; + Xbyak::Reg64 reg_ksize; + Xbyak::Reg64 reg_nsize; + Xbyak::Reg64 reg_cstride; + Xbyak::Reg64 reg_astride; + Xbyak::Reg64 reg_iterk; + Xbyak::Reg64 reg_itern; + Xbyak::Reg64 reg_tmp; + Xbyak::Reg64 reg_tmp1; + Xbyak::Reg64 reg_tmp2; + Xbyak::Reg64 reg_ret = rax; + Xbyak::Opmask msk_wr = k1; + + void assign_regs() { + CRegCount = MRegs * NRegs; + ARegCount = 1; + BRegCount = RegCount - ARegCount - CRegCount; + if (BRegCount < NRegs) { + BRegCount = 0; + ARegCount = BRegCount + 1; + } + if (BRegCount > NRegs) { + BRegCount = NRegs; + } + CReg = 0; + BReg = CReg + CRegCount; + AReg = BReg + BRegCount; + TmpReg = AReg + ARegCount; + assert(TmpReg <= RegCount); + TmpRegCount = RegCount - TmpReg; + } + + void generate_mtile(int _mtile) { + inLocalLabel(); // use local label for multiple instance + Xbyak::util::StackFrame st(this, 1, 10, 16 * 10); + parambase = st.p[0]; + reg_matAptr = st.t[0]; + reg_matBptr = st.t[1]; + reg_matCptr = st.t[0]; + reg_ksize = st.t[2]; + reg_astride = st.t[3]; + reg_cstride = st.t[3]; + reg_iterk = st.t[4]; + reg_tmp = st.t[5]; + reg_tmp1 = st.t[6]; + reg_tmp2 = st.t[7]; + reg_nsize = st.t[8]; + reg_itern = st.t[9]; + reg_ret = rax; + + vreg_push(rsp); + + load32(reg_ksize, ptr[parambase + OFFSET(k)]); + load32(reg_nsize, ptr[parambase + OFFSET(n)]); + xor_(reg_itern, reg_itern); + L(".nloop"); + init_regs(_mtile); + mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); + load32(reg_astride, ptr[parambase + OFFSET(astride)]); + mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); + load32(reg_tmp, ptr[parambase + OFFSET(bstride)]); + imul(reg_tmp, reg_itern); + lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); + xor_(reg_iterk, reg_iterk); + generate_kloop(_mtile); + write_back(_mtile); + add(reg_itern, NTILE); + cmp(reg_itern, reg_nsize); + jb(".nloop"); + mov(reg_ret, 0); + vreg_pop(rsp); + + outLocalLabel(); // end of local label + } + + void generate_kloop(int _mtile) { + inLocalLabel(); + mov(reg_tmp, reg_ksize); + padto_le(reg_tmp, KUNROLL * KTILE); + cmp(reg_tmp, 0); + jz(".kloop", T_NEAR); + L(".unkloop"); + generate_fma(_mtile, KUNROLL); + add(reg_matAptr, KUNROLL * AKStepSize); + add(reg_matBptr, KUNROLL * BKStepSize); + add(reg_iterk, KUNROLL * KTILE); + cmp(reg_iterk, reg_tmp); // k iteration variable + jb(".unkloop"); + cmp(reg_tmp, reg_ksize); + jge(".kend", T_NEAR); + L(".kloop"); + generate_fma(_mtile, 1); + add(reg_matAptr, 1 * AKStepSize); + add(reg_matBptr, 1 * BKStepSize); + add(reg_iterk, 1 * KTILE); + 
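+    // Remainder loop: the KUNROLL-unrolled loop above consumed the largest multiple of
+    // KUNROLL * KTILE; the compare below keeps this single-KTILE loop running until the
+    // full K extent has been accumulated.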
cmp(reg_iterk, reg_ksize); // k iteration variable + jb(".kloop"); + L(".kend"); + outLocalLabel(); + } + + void generate_fma(int _mtile, int _ktile) { + for (int kk = 0; kk < _ktile; kk++) { + lea(reg_tmp1, ptr[reg_matAptr + kk * AKStepSize]); + if (BRegCount == NRegs) { + for (int i = 0; i < NRegs; i++) { + vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); + } + for (int mm = 0; mm < _mtile; mm++) { + vbroadcastss(vreg_t(AReg), ptr[reg_tmp1]); + add(reg_tmp1, reg_astride); + for (int i = 0; i < NRegs; i++) { + vfmadd231ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i)); + } + } + } else if (BRegCount == 0) { + for (int mm = 0; mm < _mtile; mm += ARegCount) { + int mm_re = utils::remainsize(mm, _mtile, ARegCount); + for (int imm = 0; imm < mm_re; imm++) { + vbroadcastss(vreg_t(AReg + imm), ptr[reg_tmp1]); + add(reg_tmp1, reg_astride); + for (int i = 0; i < NRegs; i++) { + vfmadd231ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg + imm), + ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); + } + } + } + } else { + assert(0); + } + } + } + + void init_regs(int _mtile) { + inLocalLabel(); + load32(reg_tmp, ptr[parambase + OFFSET(init)]); + cmp(reg_tmp, 0); + je(".read", T_NEAR); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vxor(vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j)); + } + } + jmp(".end", T_NEAR); + L(".read"); + mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); + lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); + load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vmovups(vreg_t(CReg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]); + } + add(reg_matCptr, reg_cstride); + } + L(".end"); + outLocalLabel(); + } + + void write_back(int _mtile) { + inLocalLabel(); + mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); + load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); + lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CReg + i * NRegs + j)); + } + add(reg_matCptr, reg_cstride); + } + outLocalLabel(); + } +}; + +template +class Avx512fN16P1 : protected bestla::xbyak::JitAvx512f { + public: + static int constexpr RegLen = 16, PackRow = 1; + static_assert(_NTILE % RegLen == 0); + static int constexpr NRegs = _NTILE / RegLen; + static int constexpr MRegs = _MTILE == 0 ? 
(RegCount - 1) / NRegs : _MTILE; + static_assert(NRegs * MRegs <= RegCount - 1); + static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 1; + static int constexpr KUNROLL = 2; + static auto constexpr ISA = BTLA_ISA::AVX512F; + static auto constexpr COMPUTE = CompType::COMP_FP32; + typedef float AType; + typedef float BType; + typedef float CType; + + struct params { + AType* matA; + int astride; + BType* matB; + int bstride; + CType* matC; + int cstride; + int k; + int n; + int init; + }; + typedef long long (*func_t)(params*); + + int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0; + int CReg = 0, BReg = 0, AReg = 0, TmpReg = 0; + static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); + static int constexpr AKStepSize = KTILE * sizeof(AType); + + void generate_code(int _mtile) { + assign_regs(); + reset(); + generate_mtile(_mtile); + ready(); + mKernel = getCode(); + } + func_t mKernel = nullptr; + + protected: + Xbyak::Reg64 parambase; + Xbyak::Reg64 reg_matAptr; + Xbyak::Reg64 reg_matBptr; + Xbyak::Reg64 reg_matCptr; + Xbyak::Reg64 reg_ksize; + Xbyak::Reg64 reg_nsize; + Xbyak::Reg64 reg_cstride; + Xbyak::Reg64 reg_astride; + Xbyak::Reg64 reg_iterk; + Xbyak::Reg64 reg_itern; + Xbyak::Reg64 reg_tmp; + Xbyak::Reg64 reg_tmp1; + Xbyak::Reg64 reg_tmp2; + Xbyak::Reg64 reg_ret = rax; + Xbyak::Opmask msk_wr = k1; + + void assign_regs() { + CRegCount = MRegs * NRegs; + ARegCount = 1; + BRegCount = RegCount - ARegCount - CRegCount; + if (BRegCount < NRegs) { + BRegCount = 0; + ARegCount = BRegCount + 1; + } + if (BRegCount > NRegs) { + BRegCount = NRegs; + } + CReg = 0; + BReg = CReg + CRegCount; + AReg = BReg + BRegCount; + TmpReg = AReg + ARegCount; + assert(TmpReg <= RegCount); + TmpRegCount = RegCount - TmpReg; + } + + void generate_mtile(int _mtile) { + inLocalLabel(); // use local label for multiple instance + Xbyak::util::StackFrame st(this, 1, 10, 16 * 10); + parambase = st.p[0]; + reg_matAptr = st.t[0]; + reg_matBptr = st.t[1]; + reg_matCptr = st.t[0]; + reg_ksize = st.t[2]; + reg_astride = st.t[3]; + reg_cstride = st.t[3]; + reg_iterk = st.t[4]; + reg_tmp = st.t[5]; + reg_tmp1 = st.t[6]; + reg_tmp2 = st.t[7]; + reg_nsize = st.t[8]; + reg_itern = st.t[9]; + reg_ret = rax; + + vreg_push(rsp); + + load32(reg_ksize, ptr[parambase + OFFSET(k)]); + load32(reg_nsize, ptr[parambase + OFFSET(n)]); + xor_(reg_itern, reg_itern); + L(".nloop"); + init_regs(_mtile); + mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); + load32(reg_astride, ptr[parambase + OFFSET(astride)]); + mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); + load32(reg_tmp, ptr[parambase + OFFSET(bstride)]); + imul(reg_tmp, reg_itern); + lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); + xor_(reg_iterk, reg_iterk); + generate_kloop(_mtile); + write_back(_mtile); + add(reg_itern, NTILE); + cmp(reg_itern, reg_nsize); + jb(".nloop"); + mov(reg_ret, 0); + vreg_pop(rsp); + + outLocalLabel(); // end of local label + } + + void generate_kloop(int _mtile) { + inLocalLabel(); + mov(reg_tmp, reg_ksize); + padto_le(reg_tmp, KUNROLL * KTILE); + cmp(reg_tmp, 0); + jz(".kloop", T_NEAR); + L(".unkloop"); + generate_fma(_mtile, KUNROLL); + add(reg_matAptr, KUNROLL * AKStepSize); + add(reg_matBptr, KUNROLL * BKStepSize); + add(reg_iterk, KUNROLL * KTILE); + cmp(reg_iterk, reg_tmp); // k iteration variable + jb(".unkloop"); + cmp(reg_tmp, reg_ksize); + jge(".kend", T_NEAR); + L(".kloop"); + generate_fma(_mtile, 1); + add(reg_matAptr, 1 * AKStepSize); + add(reg_matBptr, 1 * BKStepSize); + add(reg_iterk, 1 * KTILE); + 
cmp(reg_iterk, reg_ksize); // k iteration variable + jb(".kloop"); + L(".kend"); + outLocalLabel(); + } + + void generate_fma(int _mtile, int _ktile) { + for (int kk = 0; kk < _ktile; kk++) { + lea(reg_tmp1, ptr[reg_matAptr + kk * AKStepSize]); + if (BRegCount == NRegs) { + for (int i = 0; i < NRegs; i++) { + vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); + } + for (int mm = 0; mm < _mtile; mm++) { + vbroadcastss(vreg_t(AReg), ptr[reg_tmp1]); + add(reg_tmp1, reg_astride); + for (int i = 0; i < NRegs; i++) { + vfmadd231ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i)); + } + } + } else if (BRegCount == 0) { + for (int mm = 0; mm < _mtile; mm += ARegCount) { + int mm_re = utils::remainsize(mm, _mtile, ARegCount); + for (int imm = 0; imm < mm_re; imm++) { + vbroadcastss(vreg_t(AReg + imm), ptr[reg_tmp1]); + add(reg_tmp1, reg_astride); + for (int i = 0; i < NRegs; i++) { + vfmadd231ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg + imm), + ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); + } + } + } + } else { + assert(0); + } + } + } + + void init_regs(int _mtile) { + inLocalLabel(); + load32(reg_tmp, ptr[parambase + OFFSET(init)]); + cmp(reg_tmp, 0); + je(".read", T_NEAR); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vxor(vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j)); + } + } + jmp(".end", T_NEAR); + L(".read"); + mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); + lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); + load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vmovups(vreg_t(CReg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]); + } + add(reg_matCptr, reg_cstride); + } + L(".end"); + outLocalLabel(); + } + + void write_back(int _mtile) { + inLocalLabel(); + mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); + load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); + lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CReg + i * NRegs + j)); + } + add(reg_matCptr, reg_cstride); + } + outLocalLabel(); + } +}; + +template +class Avx512fp16N32P1 : protected bestla::xbyak::JitAvx512_fp16 { + public: + static int constexpr RegLen = 32, PackRow = 1; + static_assert(_NTILE % RegLen == 0); + static int constexpr NRegs = _NTILE / RegLen; + static int constexpr MRegs = _MTILE == 0 ? 
(RegCount - 1) / NRegs : _MTILE; + static_assert(NRegs * MRegs <= RegCount - 1); + static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 1; + static int constexpr KUNROLL = 2; + static auto constexpr ISA = BTLA_ISA::AVX512_FP16; + static auto constexpr COMPUTE = CompType::COMP_FP16_FP16; + typedef utils::fp16 AType; + typedef utils::fp16 BType; + typedef utils::fp16 CType; + + struct params { + AType* matA; + int astride; + BType* matB; + int bstride; + CType* matC; + int cstride; + int k; + int n; + int init; + }; + typedef long long (*func_t)(params*); + + int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0; + int CReg = 0, BReg = 0, AReg = 0, TmpReg = 0; + static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); + static int constexpr AKStepSize = KTILE * sizeof(AType); + + void generate_code(int _mtile) { + assign_regs(); + reset(); + generate_mtile(_mtile); + ready(); + mKernel = getCode(); + } + func_t mKernel = nullptr; + + protected: + Xbyak::Reg64 parambase; + Xbyak::Reg64 reg_matAptr; + Xbyak::Reg64 reg_matBptr; + Xbyak::Reg64 reg_matCptr; + Xbyak::Reg64 reg_ksize; + Xbyak::Reg64 reg_nsize; + Xbyak::Reg64 reg_cstride; + Xbyak::Reg64 reg_astride; + Xbyak::Reg64 reg_iterk; + Xbyak::Reg64 reg_itern; + Xbyak::Reg64 reg_tmp; + Xbyak::Reg64 reg_tmp1; + Xbyak::Reg64 reg_tmp2; + Xbyak::Reg64 reg_ret = rax; + Xbyak::Opmask msk_wr = k1; + + void assign_regs() { + CRegCount = MRegs * NRegs; + ARegCount = 1; + BRegCount = RegCount - ARegCount - CRegCount; + if (BRegCount < NRegs) { + BRegCount = 0; + ARegCount = BRegCount + 1; + } + if (BRegCount > NRegs) { + BRegCount = NRegs; + } + CReg = 0; + BReg = CReg + CRegCount; + AReg = BReg + BRegCount; + TmpReg = AReg + ARegCount; + assert(TmpReg <= RegCount); + TmpRegCount = RegCount - TmpReg; + } + + void generate_mtile(int _mtile) { + inLocalLabel(); // use local label for multiple instance + Xbyak::util::StackFrame st(this, 1, 10, 16 * 10); + parambase = st.p[0]; + reg_matAptr = st.t[0]; + reg_matBptr = st.t[1]; + reg_matCptr = st.t[0]; + reg_ksize = st.t[2]; + reg_astride = st.t[3]; + reg_cstride = st.t[3]; + reg_iterk = st.t[4]; + reg_tmp = st.t[5]; + reg_tmp1 = st.t[6]; + reg_tmp2 = st.t[7]; + reg_nsize = st.t[8]; + reg_itern = st.t[9]; + reg_ret = rax; + + vreg_push(rsp); + + load32(reg_ksize, ptr[parambase + OFFSET(k)]); + load32(reg_nsize, ptr[parambase + OFFSET(n)]); + xor_(reg_itern, reg_itern); + L(".nloop"); + init_regs(_mtile); + mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); + load32(reg_astride, ptr[parambase + OFFSET(astride)]); + mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); + load32(reg_tmp, ptr[parambase + OFFSET(bstride)]); + imul(reg_tmp, reg_itern); + lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); + xor_(reg_iterk, reg_iterk); + generate_kloop(_mtile); + write_back(_mtile); + add(reg_itern, NTILE); + cmp(reg_itern, reg_nsize); + jb(".nloop"); + mov(reg_ret, 0); + vreg_pop(rsp); + + outLocalLabel(); // end of local label + } + + void generate_kloop(int _mtile) { + inLocalLabel(); + mov(reg_tmp, reg_ksize); + padto_le(reg_tmp, KUNROLL * KTILE); + cmp(reg_tmp, 0); + jz(".kloop", T_NEAR); + L(".unkloop"); + generate_fma(_mtile, KUNROLL); + add(reg_matAptr, KUNROLL * AKStepSize); + add(reg_matBptr, KUNROLL * BKStepSize); + add(reg_iterk, KUNROLL * KTILE); + cmp(reg_iterk, reg_tmp); // k iteration variable + jb(".unkloop"); + cmp(reg_tmp, reg_ksize); + jge(".kend", T_NEAR); + L(".kloop"); + generate_fma(_mtile, 1); + add(reg_matAptr, 1 * AKStepSize); + add(reg_matBptr, 1 * BKStepSize); + 
add(reg_iterk, 1 * KTILE); + cmp(reg_iterk, reg_ksize); // k iteration variable + jb(".kloop"); + L(".kend"); + outLocalLabel(); + } + + void generate_fma(int _mtile, int _ktile) { + for (int kk = 0; kk < _ktile; kk++) { + lea(reg_tmp1, ptr[reg_matAptr + kk * AKStepSize]); + if (BRegCount == NRegs) { + for (int i = 0; i < NRegs; i++) { + vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); + } + for (int mm = 0; mm < _mtile; mm++) { + vpbroadcastw(vreg_t(AReg), ptr[reg_tmp1]); + add(reg_tmp1, reg_astride); + for (int i = 0; i < NRegs; i++) { + vfmadd231ph(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i)); + } + } + } else if (BRegCount == 0) { + for (int mm = 0; mm < _mtile; mm += ARegCount) { + int mm_re = utils::remainsize(mm, _mtile, ARegCount); + for (int imm = 0; imm < mm_re; imm++) { + vpbroadcastw(vreg_t(AReg + imm), ptr[reg_tmp1]); + add(reg_tmp1, reg_astride); + for (int i = 0; i < NRegs; i++) { + vfmadd231ph(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg + imm), + ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); + } + } + } + } else { + assert(0); + } + } + } + + void init_regs(int _mtile) { + inLocalLabel(); + load32(reg_tmp, ptr[parambase + OFFSET(init)]); + cmp(reg_tmp, 0); + je(".read", T_NEAR); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vxor(vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j)); + } + } + jmp(".end", T_NEAR); + L(".read"); + mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); + lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); + load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vmovups(vreg_t(CReg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]); + } + add(reg_matCptr, reg_cstride); + } + L(".end"); + outLocalLabel(); + } + + void write_back(int _mtile) { + inLocalLabel(); + mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); + load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); + lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CReg + i * NRegs + j)); + } + add(reg_matCptr, reg_cstride); + } + outLocalLabel(); + } +}; + +template +class Avx512bf16N16P2 : protected bestla::xbyak::JitAvx512_bf16 { + public: + static int constexpr RegLen = 16, PackRow = 2; + static_assert(_NTILE % RegLen == 0); + static int constexpr NRegs = _NTILE / RegLen; + static int constexpr MRegs = _MTILE == 0 ? 
(RegCount - 1) / NRegs : _MTILE; + static_assert(NRegs * MRegs <= RegCount - 1); + static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 2; + static int constexpr KUNROLL = 2; + static auto constexpr ISA = BTLA_ISA::AVX512_BF16; + static auto constexpr COMPUTE = CompType::COMP_BF16_FP32; + typedef utils::bf16 AType; + typedef utils::bf16 BType; + typedef float CType; + + struct params { + AType* matA; + int astride; + BType* matB; + int bstride; + CType* matC; + int cstride; + int k; + int n; + int init; + }; + typedef long long (*func_t)(params*); + + int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0; + int CReg = 0, BReg = 0, AReg = 0, TmpReg = 0; + static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); + static int constexpr AKStepSize = KTILE * sizeof(AType); + + void generate_code(int _mtile) { + assign_regs(); + reset(); + generate_mtile(_mtile); + ready(); + mKernel = getCode(); + } + func_t mKernel = nullptr; + + protected: + Xbyak::Reg64 parambase; + Xbyak::Reg64 reg_matAptr; + Xbyak::Reg64 reg_matBptr; + Xbyak::Reg64 reg_matCptr; + Xbyak::Reg64 reg_ksize; + Xbyak::Reg64 reg_nsize; + Xbyak::Reg64 reg_cstride; + Xbyak::Reg64 reg_astride; + Xbyak::Reg64 reg_iterk; + Xbyak::Reg64 reg_itern; + Xbyak::Reg64 reg_tmp; + Xbyak::Reg64 reg_tmp1; + Xbyak::Reg64 reg_tmp2; + Xbyak::Reg64 reg_ret = rax; + Xbyak::Opmask msk_wr = k1; + + void assign_regs() { + CRegCount = MRegs * NRegs; + ARegCount = 1; + BRegCount = RegCount - ARegCount - CRegCount; + if (BRegCount < NRegs) { + BRegCount = 0; + ARegCount = BRegCount + 1; + } + if (BRegCount > NRegs) { + BRegCount = NRegs; + } + CReg = 0; + BReg = CReg + CRegCount; + AReg = BReg + BRegCount; + TmpReg = AReg + ARegCount; + assert(TmpReg <= RegCount); + TmpRegCount = RegCount - TmpReg; + } + + void generate_mtile(int _mtile) { + inLocalLabel(); // use local label for multiple instance + Xbyak::util::StackFrame st(this, 1, 10, 16 * 10); + parambase = st.p[0]; + reg_matAptr = st.t[0]; + reg_matBptr = st.t[1]; + reg_matCptr = st.t[0]; + reg_ksize = st.t[2]; + reg_astride = st.t[3]; + reg_cstride = st.t[3]; + reg_iterk = st.t[4]; + reg_tmp = st.t[5]; + reg_tmp1 = st.t[6]; + reg_tmp2 = st.t[7]; + reg_nsize = st.t[8]; + reg_itern = st.t[9]; + reg_ret = rax; + + vreg_push(rsp); + + load32(reg_ksize, ptr[parambase + OFFSET(k)]); + load32(reg_nsize, ptr[parambase + OFFSET(n)]); + xor_(reg_itern, reg_itern); + L(".nloop"); + init_regs(_mtile); + mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); + load32(reg_astride, ptr[parambase + OFFSET(astride)]); + mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); + load32(reg_tmp, ptr[parambase + OFFSET(bstride)]); + imul(reg_tmp, reg_itern); + lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); + xor_(reg_iterk, reg_iterk); + generate_kloop(_mtile); + write_back(_mtile); + add(reg_itern, NTILE); + cmp(reg_itern, reg_nsize); + jb(".nloop"); + mov(reg_ret, 0); + vreg_pop(rsp); + + outLocalLabel(); // end of local label + } + + void generate_kloop(int _mtile) { + inLocalLabel(); + mov(reg_tmp, reg_ksize); + padto_le(reg_tmp, KUNROLL * KTILE); + cmp(reg_tmp, 0); + jz(".kloop", T_NEAR); + L(".unkloop"); + generate_fma(_mtile, KUNROLL); + add(reg_matAptr, KUNROLL * AKStepSize); + add(reg_matBptr, KUNROLL * BKStepSize); + add(reg_iterk, KUNROLL * KTILE); + cmp(reg_iterk, reg_tmp); // k iteration variable + jb(".unkloop"); + cmp(reg_tmp, reg_ksize); + jge(".kend", T_NEAR); + L(".kloop"); + generate_fma(_mtile, 1); + add(reg_matAptr, 1 * AKStepSize); + add(reg_matBptr, 1 * BKStepSize); + 
add(reg_iterk, 1 * KTILE); + cmp(reg_iterk, reg_ksize); // k iteration variable + jb(".kloop"); + L(".kend"); + outLocalLabel(); + } + + void generate_fma(int _mtile, int _ktile) { + for (int kk = 0; kk < _ktile; kk++) { + lea(reg_tmp1, ptr[reg_matAptr + kk * AKStepSize]); + if (BRegCount == NRegs) { + for (int i = 0; i < NRegs; i++) { + vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); + } + for (int mm = 0; mm < _mtile; mm++) { + vbroadcastss(vreg_t(AReg), ptr[reg_tmp1]); + add(reg_tmp1, reg_astride); + for (int i = 0; i < NRegs; i++) { + vdpbf16ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i)); + } + } + } else if (BRegCount == 0) { + for (int mm = 0; mm < _mtile; mm += ARegCount) { + int mm_re = utils::remainsize(mm, _mtile, ARegCount); + for (int imm = 0; imm < mm_re; imm++) { + vbroadcastss(vreg_t(AReg + imm), ptr[reg_tmp1]); + add(reg_tmp1, reg_astride); + for (int i = 0; i < NRegs; i++) { + vdpbf16ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg + imm), + ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); + } + } + } + } else { + assert(0); + } + } + } + + void init_regs(int _mtile) { + inLocalLabel(); + load32(reg_tmp, ptr[parambase + OFFSET(init)]); + cmp(reg_tmp, 0); + je(".read", T_NEAR); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vxor(vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j)); + } + } + jmp(".end", T_NEAR); + L(".read"); + mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); + lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); + load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vmovups(vreg_t(CReg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]); + } + add(reg_matCptr, reg_cstride); + } + L(".end"); + outLocalLabel(); + } + + void write_back(int _mtile) { + inLocalLabel(); + mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); + load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); + lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CReg + i * NRegs + j)); + } + add(reg_matCptr, reg_cstride); + } + outLocalLabel(); + } +}; + +template +class Avx512vnniN16P4 : protected bestla::xbyak::JitAvx512vnni { + public: + static int constexpr RegLen = 16, PackRow = 4; + static_assert(_NTILE % RegLen == 0); + static int constexpr NRegs = _NTILE / RegLen; + static int constexpr MRegs = _MTILE == 0 ? 
(RegCount - 1) / NRegs : _MTILE; + static_assert(NRegs * MRegs <= RegCount - 1); + static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 4; + static int constexpr KUNROLL = 2; + static auto constexpr ISA = BTLA_ISA::AVX512_VNNI; + static auto constexpr COMPUTE = CompType::COMP_INT8_US_INT32; + typedef uint8_t AType; + typedef int8_t BType; + typedef int32_t CType; + struct params { + AType* matA; + int astride; + BType* matB; + int bstride; + CType* matC; + int cstride; + int k; + int n; + int init; + }; + typedef long long (*func_t)(params*); + + int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0; + int CReg = 0, BReg = 0, AReg = 0, TmpReg = 0; + static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); + static int constexpr AKStepSize = KTILE * sizeof(AType); + + void generate_code(int _mtile) { + assign_regs(); + reset(); + generate_mtile(_mtile); + ready(); + mKernel = getCode(); + } + func_t mKernel = nullptr; + + private: + Xbyak::Reg64 parambase; + Xbyak::Reg64 reg_matAptr; + Xbyak::Reg64 reg_matBptr; + Xbyak::Reg64 reg_matCptr; + Xbyak::Reg64 reg_ksize; + Xbyak::Reg64 reg_nsize; + Xbyak::Reg64 reg_cstride; + Xbyak::Reg64 reg_astride; + Xbyak::Reg64 reg_iterk; + Xbyak::Reg64 reg_itern; + Xbyak::Reg64 reg_tmp; + Xbyak::Reg64 reg_tmp1; + Xbyak::Reg64 reg_tmp2; + Xbyak::Reg64 reg_ret = rax; + + protected: + void assign_regs() { + CRegCount = MRegs * NRegs; + ARegCount = 1; + BRegCount = RegCount - ARegCount - CRegCount; + if (BRegCount < NRegs) { + BRegCount = 0; + ARegCount = BRegCount + 1; + } + if (BRegCount > NRegs) { + BRegCount = NRegs; + } + CReg = 0; + BReg = CReg + CRegCount; + AReg = BReg + BRegCount; + TmpReg = AReg + ARegCount; + assert(TmpReg <= RegCount); + TmpRegCount = RegCount - TmpReg; + } + + void generate_mtile(int _mtile) { + inLocalLabel(); + Xbyak::util::StackFrame st(this, 1, 10, 16 * 10); + parambase = st.p[0]; + reg_matAptr = st.t[0]; + reg_matBptr = st.t[1]; + reg_matCptr = st.t[0]; + reg_ksize = st.t[2]; + reg_astride = st.t[3]; + reg_cstride = st.t[3]; + reg_iterk = st.t[4]; + reg_tmp = st.t[5]; + reg_tmp1 = st.t[6]; + reg_tmp2 = st.t[7]; + reg_nsize = st.t[8]; + reg_itern = st.t[9]; + reg_ret = rax; + + vreg_push(rsp); + + load32(reg_ksize, ptr[parambase + OFFSET(k)]); + load32(reg_nsize, ptr[parambase + OFFSET(n)]); + xor_(reg_itern, reg_itern); + L(".nloop"); + init_regs(_mtile); + mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); + load32(reg_astride, ptr[parambase + OFFSET(astride)]); + mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); + load32(reg_tmp, ptr[parambase + OFFSET(bstride)]); + imul(reg_tmp, reg_itern); + lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); + xor_(reg_iterk, reg_iterk); + generate_kloop(_mtile); + write_back(_mtile); + add(reg_itern, NTILE); + cmp(reg_itern, reg_nsize); + jb(".nloop"); + mov(reg_ret, 0); + vreg_pop(rsp); + + outLocalLabel(); // end of local label + } + + void generate_kloop(int _mtile) { + inLocalLabel(); + mov(reg_tmp, reg_ksize); + padto_le(reg_tmp, KUNROLL * KTILE); + cmp(reg_tmp, 0); + jz(".kloop", T_NEAR); + L(".unkloop"); + generate_fma(_mtile, KUNROLL); + add(reg_matAptr, KUNROLL * AKStepSize); + add(reg_matBptr, KUNROLL * BKStepSize); + add(reg_iterk, KUNROLL * KTILE); + cmp(reg_iterk, reg_tmp); // k iteration variable + jb(".unkloop"); + cmp(reg_tmp, reg_ksize); + jge(".kend", T_NEAR); + L(".kloop"); + generate_fma(_mtile, 1); + add(reg_matAptr, 1 * AKStepSize); + add(reg_matBptr, 1 * BKStepSize); + add(reg_iterk, 1 * KTILE); + cmp(reg_iterk, reg_ksize); // k iteration 
variable + jb(".kloop"); + L(".kend"); + outLocalLabel(); + } + + void generate_fma(int _mtile, int _kunroll) { + for (int kk = 0; kk < _kunroll; kk++) { + lea(reg_tmp1, ptr[reg_matAptr + kk * AKStepSize]); + if (BRegCount == NRegs) { + for (int i = 0; i < NRegs; i++) { + vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); + } + for (int mm = 0; mm < _mtile; mm++) { + vpbroadcastd(vreg_t(AReg), ptr[reg_tmp1]); + add(reg_tmp1, reg_astride); + for (int i = 0; i < NRegs; i++) { + vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i)); + } + } + } else if (BRegCount == 0) { + for (int mm = 0; mm < _mtile; mm += ARegCount) { + int mm_re = utils::remainsize(mm, _mtile, ARegCount); + for (int imm = 0; imm < mm_re; imm++) { + vpbroadcastd(vreg_t(AReg + imm), ptr[reg_tmp1]); + add(reg_tmp1, reg_astride); + for (int i = 0; i < NRegs; i++) { + vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg + imm), + ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); + } + } + } + } else { + assert(0); + } + } + } + + void init_regs(int _mtile) { + inLocalLabel(); + load32(reg_tmp, ptr[parambase + OFFSET(init)]); + cmp(reg_tmp, 0); + je(".read", T_NEAR); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vxor(vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j)); + } + } + jmp(".end", T_NEAR); + L(".read"); + mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); + lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); + load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vmovups(vreg_t(CReg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]); + } + add(reg_matCptr, reg_cstride); + } + L(".end"); + outLocalLabel(); + } + + void write_back(int _mtile) { + inLocalLabel(); + mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); + load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); + lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CReg + i * NRegs + j)); + } + add(reg_matCptr, reg_cstride); + } + outLocalLabel(); + } +}; + +template +class AvxvnniN8P4 : protected bestla::xbyak::JitAvxvnni { + public: + static int constexpr RegLen = 8, PackRow = 4; + static_assert(_NTILE % RegLen == 0); + static int constexpr NRegs = _NTILE / RegLen; + static int constexpr KeepRegs = std::is_same_v ? 1 : 3; + static int constexpr MRegs = _MTILE == 0 ? (RegCount - KeepRegs) / NRegs : _MTILE; + static_assert(NRegs * MRegs <= RegCount - KeepRegs); + static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 4; + static int constexpr KUNROLL = 2; + static auto constexpr ISA = BTLA_ISA::AVX_VNNI; + static auto constexpr COMPUTE = + std::is_same_v ? 
CompType::COMP_INT8_US_INT32 : CompType::COMP_INT8_SS_INT32; + using AType = AT; + typedef int8_t BType; + typedef int32_t CType; + struct params { + AType* matA; + int astride; + BType* matB; + int bstride; + CType* matC; + int cstride; + int k; + int n; + int init; + }; + typedef long long (*func_t)(params*); + + int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0; + int CReg = 0, BReg = 0, AReg = 0, TmpReg = 0; + static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); + static int constexpr AKStepSize = KTILE * sizeof(AType); + + void generate_code(int _mtile) { + assign_regs(); + reset(); + generate_mtile(_mtile); + ready(); + mKernel = getCode(); + } + func_t mKernel = nullptr; + + private: + Xbyak::Reg64 parambase; + Xbyak::Reg64 reg_matAptr; + Xbyak::Reg64 reg_matBptr; + Xbyak::Reg64 reg_matCptr; + Xbyak::Reg64 reg_ksize; + Xbyak::Reg64 reg_nsize; + Xbyak::Reg64 reg_cstride; + Xbyak::Reg64 reg_astride; + Xbyak::Reg64 reg_iterk; + Xbyak::Reg64 reg_itern; + Xbyak::Reg64 reg_tmp; + Xbyak::Reg64 reg_tmp1; + Xbyak::Reg64 reg_tmp2; + Xbyak::Reg64 reg_ret = rax; + Xbyak::Opmask msk_wr = k1; + + protected: + void assign_regs() { + CRegCount = MRegs * NRegs; + ARegCount = 1; + if (std::is_same_v) { + TmpRegCount = 2; + } + BRegCount = RegCount - ARegCount - CRegCount - TmpRegCount; + if (BRegCount < NRegs) { + BRegCount = 0; + ARegCount = BRegCount + 1; + } + if (BRegCount > NRegs) { + BRegCount = NRegs; + } + CReg = 0; + BReg = CReg + CRegCount; + AReg = BReg + BRegCount; + TmpReg = AReg + ARegCount; + assert(TmpReg + TmpRegCount <= RegCount); + } + + void generate_mtile(int _mtile) { + inLocalLabel(); + Xbyak::util::StackFrame st(this, 1, 10, 16 * 10); + parambase = st.p[0]; + reg_matAptr = st.t[0]; + reg_matBptr = st.t[1]; + reg_matCptr = st.t[0]; + reg_ksize = st.t[2]; + reg_astride = st.t[3]; + reg_cstride = st.t[3]; + reg_iterk = st.t[4]; + reg_tmp = st.t[5]; + reg_tmp1 = st.t[6]; + reg_tmp2 = st.t[7]; + reg_nsize = st.t[8]; + reg_itern = st.t[9]; + reg_ret = rax; + + vreg_push(rsp); + + load32(reg_ksize, ptr[parambase + OFFSET(k)]); + load32(reg_nsize, ptr[parambase + OFFSET(n)]); + xor_(reg_itern, reg_itern); + L(".nloop"); + init_regs(_mtile); + mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); + load32(reg_astride, ptr[parambase + OFFSET(astride)]); + mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); + load32(reg_tmp, ptr[parambase + OFFSET(bstride)]); + imul(reg_tmp, reg_itern); + lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); + xor_(reg_iterk, reg_iterk); + generate_kloop(_mtile); + write_back(_mtile); + add(reg_itern, NTILE); + cmp(reg_itern, reg_nsize); + jb(".nloop"); + mov(reg_ret, 0); + vreg_pop(rsp); + + outLocalLabel(); // end of local label + } + + void generate_kloop(int _mtile) { + inLocalLabel(); + mov(reg_tmp, reg_ksize); + padto_le(reg_tmp, KUNROLL * KTILE); + cmp(reg_tmp, 0); + jz(".kloop", T_NEAR); + L(".unkloop"); + generate_fma(_mtile, KUNROLL); + add(reg_matAptr, KUNROLL * AKStepSize); + add(reg_matBptr, KUNROLL * BKStepSize); + add(reg_iterk, KUNROLL * KTILE); + cmp(reg_iterk, reg_tmp); // k iteration variable + jb(".unkloop"); + cmp(reg_tmp, reg_ksize); + jge(".kend", T_NEAR); + L(".kloop"); + generate_fma(_mtile, 1); + add(reg_matAptr, 1 * AKStepSize); + add(reg_matBptr, 1 * BKStepSize); + add(reg_iterk, 1 * KTILE); + cmp(reg_iterk, reg_ksize); // k iteration variable + jb(".kloop"); + L(".kend"); + outLocalLabel(); + } + + void generate_fma(int _mtile, int _kunroll) { + for (int kk = 0; kk < _kunroll; kk++) { + lea(reg_tmp1, 
ptr[reg_matAptr + kk * AKStepSize]); + if (BRegCount == NRegs) { + for (int i = 0; i < NRegs; i++) { + vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); + } + for (int mm = 0; mm < _mtile; mm++) { + vpbroadcastd(vreg_t(AReg), ptr[reg_tmp1]); + if constexpr (std::is_same_v) { + vpsignb(vreg_t(TmpReg + 1), vreg_t(AReg), vreg_t(AReg)); + } + add(reg_tmp1, reg_astride); + for (int i = 0; i < NRegs; i++) { + if constexpr (std::is_same_v) { + vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i)); + } else { + vpsignb(vreg_t(TmpReg), vreg_t(BReg + i), vreg_t(AReg)); + vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(TmpReg + 1), vreg_t(TmpReg)); + } + } + } + } else if (BRegCount == 0) { + for (int mm = 0; mm < _mtile; mm += ARegCount) { + int mm_re = utils::remainsize(mm, _mtile, ARegCount); + for (int imm = 0; imm < mm_re; imm++) { + vpbroadcastd(vreg_t(AReg + imm), ptr[reg_tmp1]); + if constexpr (std::is_same_v) { + vpsignb(vreg_t(TmpReg + 1), vreg_t(AReg + imm), vreg_t(AReg + imm)); + } + add(reg_tmp1, reg_astride); + for (int i = 0; i < NRegs; i++) { + if constexpr (std::is_same_v) { + vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg + imm), + ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); + } else { + vmovups(vreg_t(TmpReg), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); + vpsignb(vreg_t(TmpReg), vreg_t(TmpReg), vreg_t(AReg + imm)); + vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(TmpReg + 1), vreg_t(TmpReg)); + } + } + } + } + } else { + assert(0); + } + } + } + + void init_regs(int _mtile) { + inLocalLabel(); + load32(reg_tmp, ptr[parambase + OFFSET(init)]); + cmp(reg_tmp, 0); + je(".read", T_NEAR); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vxor(vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j)); + } + } + jmp(".end", T_NEAR); + L(".read"); + mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); + lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); + load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vmovups(vreg_t(CReg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]); + } + add(reg_matCptr, reg_cstride); + } + L(".end"); + outLocalLabel(); + } + + void write_back(int _mtile) { + inLocalLabel(); + mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); + load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); + lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CReg + i * NRegs + j)); + } + add(reg_matCptr, reg_cstride); + } + outLocalLabel(); + } +}; + +template +using AvxvnniN8P4U8 = AvxvnniN8P4; + +template +using AvxvnniN8P4S8 = AvxvnniN8P4; + +template +class Avx2vnniN8P4 : protected bestla::xbyak::JitAvx2 { + public: + static int constexpr RegLen = 8, PackRow = 4; + static_assert(_NTILE % RegLen == 0); + static int constexpr NRegs = _NTILE / RegLen; + static int constexpr KeepRegs = std::is_same_v ? 3 : 5; + static int constexpr MRegs = _MTILE == 0 ? (RegCount - KeepRegs) / NRegs : _MTILE; + static_assert(NRegs * MRegs <= RegCount - KeepRegs); + static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 4; + static int constexpr KUNROLL = 2; + static auto constexpr ISA = BTLA_ISA::AVX2; + static auto constexpr COMPUTE = + std::is_same_v ? 
CompType::COMP_INT8_US_INT32 : CompType::COMP_INT8_SS_INT32; + using AType = AT; + typedef int8_t BType; + typedef int32_t CType; + struct params { + AType* matA; + int astride; + BType* matB; + int bstride; + CType* matC; + int cstride; + int k; + int n; + int init; + const int16_t one = 1; + }; + typedef long long (*func_t)(params*); + + int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0; + int CReg = 0, BReg = 0, AReg = 0, TmpReg = 0; + static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); + static int constexpr AKStepSize = KTILE * sizeof(AType); + + void generate_code(int _mtile) { + assign_regs(); + reset(); + generate_mtile(_mtile); + ready(); + mKernel = getCode(); + } + func_t mKernel = nullptr; + + private: + Xbyak::Reg64 parambase; + Xbyak::Reg64 reg_matAptr; + Xbyak::Reg64 reg_matBptr; + Xbyak::Reg64 reg_matCptr; + Xbyak::Reg64 reg_ksize; + Xbyak::Reg64 reg_nsize; + Xbyak::Reg64 reg_cstride; + Xbyak::Reg64 reg_astride; + Xbyak::Reg64 reg_iterk; + Xbyak::Reg64 reg_itern; + Xbyak::Reg64 reg_tmp; + Xbyak::Reg64 reg_tmp1; + Xbyak::Reg64 reg_tmp2; + Xbyak::Reg64 reg_ret = rax; + Xbyak::Opmask msk_wr = k1; + + protected: + void assign_regs() { + CRegCount = MRegs * NRegs; + ARegCount = 1; + if (std::is_same_v) { + TmpRegCount = 4; + } else { + TmpRegCount = 2; + } + BRegCount = RegCount - ARegCount - CRegCount - TmpRegCount; + if (BRegCount < NRegs) { + BRegCount = 0; + ARegCount = BRegCount + 1; + } + if (BRegCount > NRegs) { + BRegCount = NRegs; + } + CReg = 0; + BReg = CReg + CRegCount; + AReg = BReg + BRegCount; + TmpReg = AReg + ARegCount; + assert(TmpReg + TmpRegCount <= RegCount); + } + + void generate_mtile(int _mtile) { + inLocalLabel(); + Xbyak::util::StackFrame st(this, 1, 10, 16 * 10); + parambase = st.p[0]; + reg_matAptr = st.t[0]; + reg_matBptr = st.t[1]; + reg_matCptr = st.t[0]; + reg_ksize = st.t[2]; + reg_astride = st.t[3]; + reg_cstride = st.t[3]; + reg_iterk = st.t[4]; + reg_tmp = st.t[5]; + reg_tmp1 = st.t[6]; + reg_tmp2 = st.t[7]; + reg_nsize = st.t[8]; + reg_itern = st.t[9]; + reg_ret = rax; + + vreg_push(rsp); + vpbroadcastw(vreg_t(TmpReg + 0), ptr[parambase + OFFSET(one)]); + load32(reg_ksize, ptr[parambase + OFFSET(k)]); + load32(reg_nsize, ptr[parambase + OFFSET(n)]); + xor_(reg_itern, reg_itern); + L(".nloop"); + init_regs(_mtile); + mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); + load32(reg_astride, ptr[parambase + OFFSET(astride)]); + mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); + load32(reg_tmp, ptr[parambase + OFFSET(bstride)]); + imul(reg_tmp, reg_itern); + lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); + xor_(reg_iterk, reg_iterk); + generate_kloop(_mtile); + write_back(_mtile); + add(reg_itern, NTILE); + cmp(reg_itern, reg_nsize); + jb(".nloop"); + mov(reg_ret, 0); + vreg_pop(rsp); + + outLocalLabel(); // end of local label + } + + void generate_kloop(int _mtile) { + inLocalLabel(); + mov(reg_tmp, reg_ksize); + padto_le(reg_tmp, KUNROLL * KTILE); + cmp(reg_tmp, 0); + jz(".kloop", T_NEAR); + L(".unkloop"); + generate_fma(_mtile, KUNROLL); + add(reg_matAptr, KUNROLL * AKStepSize); + add(reg_matBptr, KUNROLL * BKStepSize); + add(reg_iterk, KUNROLL * KTILE); + cmp(reg_iterk, reg_tmp); // k iteration variable + jb(".unkloop"); + cmp(reg_tmp, reg_ksize); + jge(".kend", T_NEAR); + L(".kloop"); + generate_fma(_mtile, 1); + add(reg_matAptr, 1 * AKStepSize); + add(reg_matBptr, 1 * BKStepSize); + add(reg_iterk, 1 * KTILE); + cmp(reg_iterk, reg_ksize); // k iteration variable + jb(".kloop"); + L(".kend"); + outLocalLabel(); + } + + 
void generate_fma(int _mtile, int _kunroll) { + for (int kk = 0; kk < _kunroll; kk++) { + lea(reg_tmp1, ptr[reg_matAptr + kk * AKStepSize]); + if (BRegCount == NRegs) { + for (int i = 0; i < NRegs; i++) { + vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); + } + for (int mm = 0; mm < _mtile; mm++) { + vpbroadcastd(vreg_t(AReg), ptr[reg_tmp1]); + if constexpr (std::is_same_v) { + vpsignb(vreg_t(TmpReg + 2), vreg_t(AReg), vreg_t(AReg)); + } + add(reg_tmp1, reg_astride); + for (int i = 0; i < NRegs; i++) { + if constexpr (std::is_same_v) { + vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(TmpReg + 1), vreg_t(AReg), vreg_t(BReg + i), + vreg_t(TmpReg + 0)); + } else { + vpsignb(vreg_t(TmpReg + 3), vreg_t(BReg + i), vreg_t(AReg)); + vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(TmpReg + 1), vreg_t(TmpReg + 2), vreg_t(TmpReg + 3), + vreg_t(TmpReg + 0)); + } + } + } + } else if (BRegCount == 0) { + for (int mm = 0; mm < _mtile; mm += ARegCount) { + int mm_re = utils::remainsize(mm, _mtile, ARegCount); + for (int imm = 0; imm < mm_re; imm++) { + vpbroadcastd(vreg_t(AReg + imm), ptr[reg_tmp1]); + if constexpr (std::is_same_v) { + vpsignb(vreg_t(TmpReg + 2), vreg_t(AReg + imm), vreg_t(AReg + imm)); + } + add(reg_tmp1, reg_astride); + for (int i = 0; i < NRegs; i++) { + if constexpr (std::is_same_v) { + vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(TmpReg + 1), vreg_t(AReg + imm), + ptr[reg_matBptr + kk * BKStepSize + i * VecBytes], vreg_t(TmpReg + 0)); + } else { + vmovups(vreg_t(TmpReg + 3), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); + vpsignb(vreg_t(TmpReg + 3), vreg_t(TmpReg + 3), vreg_t(AReg + imm)); + vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(TmpReg + 1), vreg_t(TmpReg + 2), vreg_t(TmpReg + 3), + vreg_t(TmpReg + 0)); + } + } + } + } + } else { + assert(0); + } + } + } + void init_regs(int _mtile) { + inLocalLabel(); + load32(reg_tmp, ptr[parambase + OFFSET(init)]); + cmp(reg_tmp, 0); + je(".read", T_NEAR); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vxor(vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j)); + } + } + jmp(".end", T_NEAR); + L(".read"); + mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); + lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); + load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vmovups(vreg_t(CReg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]); + } + add(reg_matCptr, reg_cstride); + } + L(".end"); + outLocalLabel(); + } + void write_back(int _mtile) { + inLocalLabel(); + mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); + load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); + lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CReg + i * NRegs + j)); + } + add(reg_matCptr, reg_cstride); + } + outLocalLabel(); + } +}; + +template +using Avx2vnniN8P4U8 = Avx2vnniN8P4; + +template +using Avx2vnniN8P4S8 = Avx2vnniN8P4; + +template +class Amxbf16N16P2 : protected bestla::xbyak::JitAmxbf16 { + public: + static int constexpr RegLen = 16, PackRow = 2; + static_assert(_NTILE % RegLen == 0); + static_assert(_MTILE % RegLen == 0); + static int constexpr NRegs = _NTILE / RegLen; + static int constexpr MRegs = _MTILE == 0 ? 
1 : _MTILE / RegLen; + static_assert(NRegs * MRegs + 2 <= TileCount); + static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs * RegLen, KTILE = 32; + static int constexpr KUNROLL = 2; + static auto constexpr ISA = BTLA_ISA::AMX_BF16; + static auto constexpr COMPUTE = CompType::COMP_BF16_FP32; + typedef utils::bf16 AType; + typedef utils::bf16 BType; + typedef float CType; + + struct params { + AType* matA; + int astride; + BType* matB; + int bstride; + CType* matC; + int cstride; + int k; + int n; + int init; + void* workspace; + }; + typedef long long (*func_t)(params*); + + int TmpRegCount = RegCount; + int TmpReg = 0; + int CTileCount = 0, ATileCount = 0, BTileCount = 0; + int CTile = 0, ATile = 0, BTile = 0; + static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); + static int constexpr AKStepSize = KTILE * sizeof(AType); + + void generate_code(int _mtile) { + assign_regs(); + reset(); + generate_mtile(_mtile); + ready(); + mKernel = getCode(); + } + func_t mKernel = nullptr; + + protected: + Xbyak::Reg64 parambase; + Xbyak::Reg64 reg_matAptr; + Xbyak::Reg64 reg_matBptr; + Xbyak::Reg64 reg_matCptr; + Xbyak::Reg64 reg_ksize; + Xbyak::Reg64 reg_nsize; + Xbyak::Reg64 reg_cstride; + Xbyak::Reg64 reg_astride; + Xbyak::Reg64 reg_iterk; + Xbyak::Reg64 reg_itern; + Xbyak::Reg64 reg_tmp; + Xbyak::Reg64 reg_tmp1; + Xbyak::Reg64 reg_tmp2; + Xbyak::Reg64 reg_tmp3; + Xbyak::Reg64 reg_ret = rax; + + void assign_regs() { + CTileCount = NRegs * MRegs; + auto tile_re = TileCount - CTileCount; + if (tile_re - 1 >= NRegs) { + BTileCount = NRegs; + ATileCount = tile_re - BTileCount; + } else if (tile_re - 1 >= MRegs) { + ATileCount = MRegs; + BTileCount = tile_re - ATileCount; + } else { + ATileCount = 1; + BTileCount = tile_re - ATileCount; + } + CTile = 0; + ATile = CTile + CTileCount; + BTile = ATile + ATileCount; + } + + void generate_mtile(int _mtile) { + inLocalLabel(); // use local label for multiple instance + Xbyak::util::StackFrame st(this, 1, 11, 16 * 10); + parambase = st.p[0]; + reg_matAptr = st.t[0]; + reg_matBptr = st.t[1]; + reg_matCptr = st.t[0]; + reg_ksize = st.t[2]; + reg_astride = st.t[3]; + reg_cstride = st.t[3]; + reg_iterk = st.t[4]; + reg_tmp = st.t[5]; + reg_tmp1 = st.t[6]; + reg_tmp2 = st.t[7]; + reg_tmp3 = st.t[10]; + reg_nsize = st.t[8]; + reg_itern = st.t[9]; + reg_ret = rax; + + vreg_push(rsp); + + load32(reg_ksize, ptr[parambase + OFFSET(k)]); + load32(reg_nsize, ptr[parambase + OFFSET(n)]); + xor_(reg_itern, reg_itern); + L(".nloop"); + init_regs(_mtile); + mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); + load32(reg_astride, ptr[parambase + OFFSET(astride)]); + mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); + load32(reg_tmp, ptr[parambase + OFFSET(bstride)]); + imul(reg_tmp, reg_itern); + lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); + xor_(reg_iterk, reg_iterk); + generate_kloop(_mtile); + write_back(_mtile); + add(reg_itern, NTILE); + cmp(reg_itern, reg_nsize); + jb(".nloop"); + mov(reg_ret, 0); + vreg_pop(rsp); + + outLocalLabel(); // end of local label + } + + void generate_kloop(int _mtile) { + inLocalLabel(); + mov(reg_tmp, reg_ksize); + padto_le(reg_tmp, KUNROLL * KTILE); + cmp(reg_tmp, 0); + jz(".kloop", T_NEAR); + L(".unkloop"); + generate_fma(_mtile, KUNROLL); + add(reg_matAptr, KUNROLL * AKStepSize); + add(reg_matBptr, KUNROLL * BKStepSize); + add(reg_iterk, KUNROLL * KTILE); + cmp(reg_iterk, reg_tmp); // k iteration variable + jb(".unkloop"); + cmp(reg_tmp, reg_ksize); + jge(".kend", T_NEAR); + L(".kloop"); + generate_fma(_mtile, 1); + 
add(reg_matAptr, 1 * AKStepSize); + add(reg_matBptr, 1 * BKStepSize); + add(reg_iterk, 1 * KTILE); + cmp(reg_iterk, reg_ksize); // k iteration variable + jb(".kloop"); + L(".kend"); + outLocalLabel(); + } + + void generate_fma(int _mtile, int kunrll) { + auto& reg_Bstride = reg_tmp1; + mov(reg_Bstride, NTILE * 4); + int mtiles = _mtile / RegLen; + for (int kk = 0; kk < kunrll; kk++) { + auto reg_Atmp = reg_tmp2; + if (mtiles == 1) { + reg_Atmp = reg_matAptr; + } else { + mov(reg_Atmp, reg_matAptr); + } + if (BTileCount == NRegs) { + for (int i = 0; i < NRegs; i++) { + tileloaddt1(Xbyak::Tmm(BTile + i), ptr[reg_matBptr + reg_Bstride + kk * BKStepSize + i * 64]); + } + for (int mm = 0; mm < mtiles; mm++) { + tileloadd(Xbyak::Tmm(ATile), ptr[reg_Atmp + reg_astride + kk * AKStepSize]); + for (int i = 0; i < NRegs; i++) { + tdpbf16ps(Xbyak::Tmm(CTile + mm * NRegs + i), Xbyak::Tmm(ATile), Xbyak::Tmm(BTile + i)); + } + if (mm != mtiles - 1) { + lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); + lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); + } + } + } else { + if (ATileCount == mtiles) { + for (int mm = 0; mm < mtiles; mm++) { + tileloadd(Xbyak::Tmm(ATile + mm), ptr[reg_Atmp + reg_astride + kk * AKStepSize]); + if (mm != mtiles - 1) { + lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); + lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); + } + } + for (int i = 0; i < NRegs; i++) { + tileloaddt1(Xbyak::Tmm(BTile), ptr[reg_matBptr + reg_Bstride + kk * BKStepSize + i * 64]); + for (int mm = 0; mm < mtiles; mm++) { + tdpbf16ps(Xbyak::Tmm(CTile + mm * NRegs + i), Xbyak::Tmm(ATile + mm), Xbyak::Tmm(BTile)); + } + } + } else { + for (int mm = 0; mm < mtiles; mm++) { + tileloadd(Xbyak::Tmm(ATile), ptr[reg_Atmp + reg_astride + kk * AKStepSize]); + for (int i = 0; i < NRegs; i++) { + tileloaddt1(Xbyak::Tmm(BTile), ptr[reg_matBptr + reg_Bstride + kk * BKStepSize + i * 64]); + tdpbf16ps(Xbyak::Tmm(CTile + mm * NRegs + i), Xbyak::Tmm(ATile), Xbyak::Tmm(BTile)); + } + if (mm != mtiles - 1) { + lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); + lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); + } + } + } + } + } + } + + void init_regs(int _mtile) { + inLocalLabel(); + load32(reg_tmp, ptr[parambase + OFFSET(init)]); + cmp(reg_tmp, 0); + je(".read", T_NEAR); + for (int i = 0; i < CTileCount; i++) { + tilezero(Xbyak::Tmm(CTile + i)); + } + jmp(".end", T_NEAR); + L(".read"); + mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); + lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); + load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); + int mtnum = _mtile / 16; + for (int mm = 0; mm < mtnum; mm++) { + for (int i = 0; i < NRegs; i++) { + tileloaddt1(Xbyak::Tmm(CTile + mm * NRegs + i), ptr[reg_matCptr + reg_cstride + i * 64]); + } + if (mm != mtnum - 1) { + lea(reg_matCptr, ptr[reg_matCptr + 8 * reg_cstride]); + lea(reg_matCptr, ptr[reg_matCptr + 8 * reg_cstride]); + } + } + L(".end"); + outLocalLabel(); + } + + void write_back(int _mtile) { + inLocalLabel(); + mov(reg_tmp, dword[parambase + OFFSET(workspace)]); + mov(reg_tmp1, NTILE * 4); + for (int mm = 0; mm < MRegs; mm++) { + for (int i = 0; i < NRegs; i++) { + tilestored(ptr[reg_tmp + reg_tmp1 + i * 64 + mm * 16 * NTILE * 4], Xbyak::Tmm(CTile + mm * NRegs + i)); + } + } + mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); + load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); + lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); + int zunroll = TmpRegCount / NRegs; + for (int i = 0; i < _mtile; i += zunroll) { + int m_re = utils::remainsize(i, 
_mtile, zunroll); + for (int im = 0; im < m_re; im++) { + for (int j = 0; j < NRegs; j++) { + vmovups(vreg_t(TmpReg + im * NRegs + j), ptr[reg_tmp + j * 64 + (i + im) * NTILE * 4]); + vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(TmpReg + im * NRegs + j)); + } + add(reg_matCptr, reg_cstride); + } + } + outLocalLabel(); + } +}; + +template +class Amxint8N16P4 : protected bestla::xbyak::JitAmxint8 { + public: + static int constexpr RegLen = 16, PackRow = 4; + static_assert(_NTILE % RegLen == 0); + static_assert(_MTILE % RegLen == 0); + static int constexpr NRegs = _NTILE / RegLen; + static int constexpr MRegs = _MTILE == 0 ? 1 : _MTILE / RegLen; + static_assert(NRegs * MRegs + 2 <= TileCount); + static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs * RegLen, KTILE = 64; + static int constexpr KUNROLL = 2; + static auto constexpr ISA = BTLA_ISA::AMX_INT8; + static auto constexpr COMPUTE = + (std::is_same_v + ? std::is_same_v ? CompType::COMP_INT8_SS_INT32 : CompType::COMP_INT8_SU_INT32 + : std::is_same_v ? CompType::COMP_INT8_US_INT32 + : CompType::COMP_INT8_UU_INT32); + using AType = AT; + using BType = BT; + typedef int32_t CType; + + struct params { + AType* matA; + int astride; + BType* matB; + int bstride; + CType* matC; + int cstride; + int k; + int n; + int init; + void* workspace; + }; + typedef long long (*func_t)(params*); + + int TmpRegCount = RegCount; + int TmpReg = 0; + int CTileCount = 0, ATileCount = 0, BTileCount = 0; + int CTile = 0, ATile = 0, BTile = 0; + static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); + static int constexpr AKStepSize = KTILE * sizeof(AType); + + void generate_code(int _mtile) { + assign_regs(); + reset(); + generate_mtile(_mtile); + ready(); + mKernel = getCode(); + } + func_t mKernel = nullptr; + + protected: + Xbyak::Reg64 parambase; + Xbyak::Reg64 reg_matAptr; + Xbyak::Reg64 reg_matBptr; + Xbyak::Reg64 reg_matCptr; + Xbyak::Reg64 reg_ksize; + Xbyak::Reg64 reg_nsize; + Xbyak::Reg64 reg_cstride; + Xbyak::Reg64 reg_astride; + Xbyak::Reg64 reg_iterk; + Xbyak::Reg64 reg_itern; + Xbyak::Reg64 reg_tmp; + Xbyak::Reg64 reg_tmp1; + Xbyak::Reg64 reg_tmp2; + Xbyak::Reg64 reg_tmp3; + Xbyak::Reg64 reg_ret = rax; + + void assign_regs() { + CTileCount = NRegs * MRegs; + auto tile_re = TileCount - CTileCount; + if (tile_re - 1 >= NRegs) { + BTileCount = NRegs; + ATileCount = tile_re - BTileCount; + } else if (tile_re - 1 >= MRegs) { + ATileCount = MRegs; + BTileCount = tile_re - ATileCount; + } else { + ATileCount = 1; + BTileCount = tile_re - ATileCount; + } + CTile = 0; + ATile = CTile + CTileCount; + BTile = ATile + ATileCount; + } + + void generate_mtile(int _mtile) { + inLocalLabel(); // use local label for multiple instance + Xbyak::util::StackFrame st(this, 1, 11, 16 * 10); + parambase = st.p[0]; + reg_matAptr = st.t[0]; + reg_matBptr = st.t[1]; + reg_matCptr = st.t[0]; + reg_ksize = st.t[2]; + reg_astride = st.t[3]; + reg_cstride = st.t[3]; + reg_iterk = st.t[4]; + reg_tmp = st.t[5]; + reg_tmp1 = st.t[6]; + reg_tmp2 = st.t[7]; + reg_tmp3 = st.t[10]; + reg_nsize = st.t[8]; + reg_itern = st.t[9]; + reg_ret = rax; + + vreg_push(rsp); + + load32(reg_ksize, ptr[parambase + OFFSET(k)]); + load32(reg_nsize, ptr[parambase + OFFSET(n)]); + xor_(reg_itern, reg_itern); + L(".nloop"); + init_regs(_mtile); + mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); + load32(reg_astride, ptr[parambase + OFFSET(astride)]); + mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); + load32(reg_tmp, ptr[parambase + OFFSET(bstride)]); + imul(reg_tmp, reg_itern); + 
lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); + xor_(reg_iterk, reg_iterk); + generate_kloop(_mtile); + write_back(_mtile); + add(reg_itern, NTILE); + cmp(reg_itern, reg_nsize); + jb(".nloop"); + mov(reg_ret, 0); + vreg_pop(rsp); + + outLocalLabel(); // end of local label + } + + void generate_kloop(int _mtile) { + inLocalLabel(); + mov(reg_tmp, reg_ksize); + padto_le(reg_tmp, KUNROLL * KTILE); + cmp(reg_tmp, 0); + jz(".kloop", T_NEAR); + L(".unkloop"); + generate_fma(_mtile, KUNROLL); + add(reg_matAptr, KUNROLL * AKStepSize); + add(reg_matBptr, KUNROLL * BKStepSize); + add(reg_iterk, KUNROLL * KTILE); + cmp(reg_iterk, reg_tmp); // k iteration variable + jb(".unkloop"); + cmp(reg_tmp, reg_ksize); + jge(".kend", T_NEAR); + L(".kloop"); + generate_fma(_mtile, 1); + add(reg_matAptr, 1 * AKStepSize); + add(reg_matBptr, 1 * BKStepSize); + add(reg_iterk, 1 * KTILE); + cmp(reg_iterk, reg_ksize); // k iteration variable + jb(".kloop"); + L(".kend"); + outLocalLabel(); + } + + void generate_fma(int _mtile, int kunrll) { + auto& reg_Bstride = reg_tmp1; + mov(reg_Bstride, NTILE * 4); + int mtiles = _mtile / RegLen; + + for (int kk = 0; kk < kunrll; kk++) { + auto reg_Atmp = reg_tmp2; + if (mtiles == 1) { + reg_Atmp = reg_matAptr; + } else { + mov(reg_Atmp, reg_matAptr); + } + if (BTileCount == NRegs) { + for (int i = 0; i < NRegs; i++) { + tileloaddt1(Xbyak::Tmm(BTile + i), ptr[reg_matBptr + reg_Bstride + kk * BKStepSize + i * 64]); + } + for (int mm = 0; mm < mtiles; mm++) { + tileloadd(Xbyak::Tmm(ATile), ptr[reg_Atmp + reg_astride + kk * AKStepSize]); + for (int i = 0; i < NRegs; i++) { + _tdpb(Xbyak::Tmm(CTile + mm * NRegs + i), Xbyak::Tmm(ATile), Xbyak::Tmm(BTile + i)); + } + if (mm != mtiles - 1) { + lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); + lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); + } + } + } else { + if (ATileCount == mtiles) { + for (int mm = 0; mm < mtiles; mm++) { + tileloadd(Xbyak::Tmm(ATile + mm), ptr[reg_Atmp + reg_astride + kk * AKStepSize]); + if (mm != mtiles - 1) { + lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); + lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); + } + } + for (int i = 0; i < NRegs; i++) { + tileloaddt1(Xbyak::Tmm(BTile), ptr[reg_matBptr + reg_Bstride + kk * BKStepSize + i * 64]); + for (int mm = 0; mm < mtiles; mm++) { + _tdpb(Xbyak::Tmm(CTile + mm * NRegs + i), Xbyak::Tmm(ATile + mm), Xbyak::Tmm(BTile)); + } + } + } else { + for (int mm = 0; mm < mtiles; mm++) { + tileloadd(Xbyak::Tmm(ATile), ptr[reg_Atmp + reg_astride + kk * AKStepSize]); + for (int i = 0; i < NRegs; i++) { + tileloaddt1(Xbyak::Tmm(BTile), ptr[reg_matBptr + reg_Bstride + kk * BKStepSize + i * 64]); + _tdpb(Xbyak::Tmm(CTile + mm * NRegs + i), Xbyak::Tmm(ATile), Xbyak::Tmm(BTile)); + } + if (mm != mtiles - 1) { + lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); + lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); + } + } + } + } + } + } + + void init_regs(int _mtile) { + inLocalLabel(); + load32(reg_tmp, ptr[parambase + OFFSET(init)]); + cmp(reg_tmp, 0); + je(".read", T_NEAR); + for (int i = 0; i < CTileCount; i++) { + tilezero(Xbyak::Tmm(CTile + i)); + } + jmp(".end", T_NEAR); + L(".read"); + mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); + lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); + load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); + int mtnum = _mtile / 16; + for (int mm = 0; mm < mtnum; mm++) { + for (int i = 0; i < NRegs; i++) { + tileloaddt1(Xbyak::Tmm(CTile + mm * NRegs + i), ptr[reg_matCptr + reg_cstride + i * 64]); + } + if (mm != mtnum - 1) { + 
lea(reg_matCptr, ptr[reg_matCptr + 8 * reg_cstride]); + lea(reg_matCptr, ptr[reg_matCptr + 8 * reg_cstride]); + } + } + L(".end"); + outLocalLabel(); + } + + void write_back(int _mtile) { + inLocalLabel(); + mov(reg_tmp, dword[parambase + OFFSET(workspace)]); + mov(reg_tmp1, NTILE * 4); + for (int mm = 0; mm < MRegs; mm++) { + for (int i = 0; i < NRegs; i++) { + tilestored(ptr[reg_tmp + reg_tmp1 + i * 64 + mm * 16 * NTILE * 4], Xbyak::Tmm(CTile + mm * NRegs + i)); + } + } + mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); + load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); + lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); + int zunroll = TmpRegCount / NRegs; + for (int i = 0; i < _mtile; i += zunroll) { + int m_re = utils::remainsize(i, _mtile, zunroll); + for (int im = 0; im < m_re; im++) { + for (int j = 0; j < NRegs; j++) { + vmovups(vreg_t(TmpReg + im * NRegs + j), ptr[reg_tmp + j * 64 + (i + im) * NTILE * 4]); + vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(TmpReg + im * NRegs + j)); + } + add(reg_matCptr, reg_cstride); + } + } + outLocalLabel(); + } +}; +template +using Amxint8N16P4US = Amxint8N16P4; + +template +using Amxint8N16P4SS = Amxint8N16P4; + +class AmxConfigure : protected xbyak::JitAmxtile { + public: + typedef long long (*func_t)(tileconfig_t*); + + static void configure(int TILE_M, int TILE_N, int TILE_K, int elesize, int ANum, int BNum, int CNum) { + static AmxConfigure code; + tileconfig_t cfg; + std::memset(&cfg, 0, sizeof(cfg)); + configure_tiles(cfg, TILE_M, TILE_N, TILE_K, elesize, ANum, BNum, CNum); + code.mKernel(&cfg); + } + + protected: + AmxConfigure() { + generate_config(this); + mKernel = getCode(); + } + + func_t mKernel = nullptr; +}; + +namespace kblock { +// optimize for kblock gemm, each block size in k dimension has dequant operation +// all accumulators use fp32 dtype. +template +class Avx512fN16P1 : protected bestla::xbyak::JitAvx512f { + public: + static int constexpr RegLen = 16, PackRow = 1; + static_assert(_NTILE % RegLen == 0); + static int constexpr NRegs = _NTILE / RegLen; + static int constexpr MRegs = _MTILE == 0 ? 
(RegCount - 1) / NRegs : _MTILE; + static_assert(NRegs * MRegs <= RegCount - 1); + static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 1; + static int constexpr KUNROLL = 2; + static auto constexpr ISA = BTLA_ISA::AVX512F; + static auto constexpr COMPUTE = CompType::COMP_FP32; + typedef float AType; + typedef float BType; + typedef float CType; + + struct params { + AType* matA; + int astride; + BType* matB; + int bstride; + CType* matC; + int cstride; + int k; + int n; + int init; + }; + typedef long long (*func_t)(params*); + + int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0; + int CReg = 0, BReg = 0, AReg = 0, TmpReg = 0; + static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); + static int constexpr AKStepSize = KTILE * sizeof(AType); + + void generate_code(int _mtile) { + assign_regs(); + reset(); + generate_mtile(_mtile); + ready(); + mKernel = getCode(); + } + func_t mKernel = nullptr; + + protected: + Xbyak::Reg64 parambase; + Xbyak::Reg64 reg_matAptr; + Xbyak::Reg64 reg_matBptr; + Xbyak::Reg64 reg_matCptr; + Xbyak::Reg64 reg_ksize; + Xbyak::Reg64 reg_nsize; + Xbyak::Reg64 reg_cstride; + Xbyak::Reg64 reg_astride; + Xbyak::Reg64 reg_iterk; + Xbyak::Reg64 reg_itern; + Xbyak::Reg64 reg_tmp; + Xbyak::Reg64 reg_tmp1; + Xbyak::Reg64 reg_tmp2; + Xbyak::Reg64 reg_ret = rax; + Xbyak::Opmask msk_wr = k1; + + void assign_regs() { + CRegCount = MRegs * NRegs; + ARegCount = 1; + BRegCount = RegCount - ARegCount - CRegCount; + if (BRegCount < NRegs) { + BRegCount = 0; + ARegCount = BRegCount + 1; + } + if (BRegCount > NRegs) { + BRegCount = NRegs; + } + CReg = 0; + BReg = CReg + CRegCount; + AReg = BReg + BRegCount; + TmpReg = AReg + ARegCount; + assert(TmpReg <= RegCount); + TmpRegCount = RegCount - TmpReg; + } + + void generate_mtile(int _mtile) { + inLocalLabel(); // use local label for multiple instance + Xbyak::util::StackFrame st(this, 1, 10, 16 * 10); + parambase = st.p[0]; + reg_matAptr = st.t[0]; + reg_matBptr = st.t[1]; + reg_matCptr = st.t[0]; + reg_ksize = st.t[2]; + reg_astride = st.t[3]; + reg_cstride = st.t[3]; + reg_iterk = st.t[4]; + reg_tmp = st.t[5]; + reg_tmp1 = st.t[6]; + reg_tmp2 = st.t[7]; + reg_nsize = st.t[8]; + reg_itern = st.t[9]; + reg_ret = rax; + + vreg_push(rsp); + + load32(reg_ksize, ptr[parambase + OFFSET(k)]); + load32(reg_nsize, ptr[parambase + OFFSET(n)]); + xor_(reg_itern, reg_itern); + L(".nloop"); + init_regs(_mtile); + mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); + load32(reg_astride, ptr[parambase + OFFSET(astride)]); + mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); + load32(reg_tmp, ptr[parambase + OFFSET(bstride)]); + imul(reg_tmp, reg_itern); + lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); + xor_(reg_iterk, reg_iterk); + generate_kloop(_mtile); + write_back(_mtile); + add(reg_itern, NTILE); + cmp(reg_itern, reg_nsize); + jb(".nloop"); + mov(reg_ret, 0); + vreg_pop(rsp); + + outLocalLabel(); // end of local label + } + + void generate_kloop(int _mtile) { + inLocalLabel(); + mov(reg_tmp, reg_ksize); + padto_le(reg_tmp, KUNROLL * KTILE); + cmp(reg_tmp, 0); + jz(".kloop", T_NEAR); + L(".unkloop"); + generate_fma(_mtile, KUNROLL); + add(reg_matAptr, KUNROLL * AKStepSize); + add(reg_matBptr, KUNROLL * BKStepSize); + add(reg_iterk, KUNROLL * KTILE); + cmp(reg_iterk, reg_tmp); // k iteration variable + jb(".unkloop"); + cmp(reg_tmp, reg_ksize); + jge(".kend", T_NEAR); + L(".kloop"); + generate_fma(_mtile, 1); + add(reg_matAptr, 1 * AKStepSize); + add(reg_matBptr, 1 * BKStepSize); + add(reg_iterk, 1 * KTILE); + 
cmp(reg_iterk, reg_ksize); // k iteration variable + jb(".kloop"); + L(".kend"); + outLocalLabel(); + } + + void generate_fma(int _mtile, int _ktile) { + for (int kk = 0; kk < _ktile; kk++) { + lea(reg_tmp1, ptr[reg_matAptr + kk * AKStepSize]); + if (BRegCount == NRegs) { + for (int i = 0; i < NRegs; i++) { + vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); + } + for (int mm = 0; mm < _mtile; mm++) { + vbroadcastss(vreg_t(AReg), ptr[reg_tmp1]); + add(reg_tmp1, reg_astride); + for (int i = 0; i < NRegs; i++) { + vfmadd231ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i)); + } + } + } else if (BRegCount == 0) { + for (int mm = 0; mm < _mtile; mm += ARegCount) { + int mm_re = utils::remainsize(mm, _mtile, ARegCount); + for (int imm = 0; imm < mm_re; imm++) { + vbroadcastss(vreg_t(AReg + imm), ptr[reg_tmp1]); + add(reg_tmp1, reg_astride); + for (int i = 0; i < NRegs; i++) { + vfmadd231ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg + imm), + ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); + } + } + } + } else { + assert(0); + } + } + } + + void init_regs(int _mtile) { + inLocalLabel(); + load32(reg_tmp, ptr[parambase + OFFSET(init)]); + cmp(reg_tmp, 0); + je(".read", T_NEAR); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vxor(vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j)); + } + } + jmp(".end", T_NEAR); + L(".read"); + mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); + lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); + load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vmovups(vreg_t(CReg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]); + } + add(reg_matCptr, reg_cstride); + } + L(".end"); + outLocalLabel(); + } + + void write_back(int _mtile) { + inLocalLabel(); + mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); + load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); + lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CReg + i * NRegs + j)); + } + add(reg_matCptr, reg_cstride); + } + outLocalLabel(); + } +}; + +template +class Avx512vnniN16P4 : protected bestla::xbyak::JitAvx512vnni { + public: + static int constexpr RegLen = 16, PackRow = 4; + static_assert(_NTILE % RegLen == 0); + static int constexpr NRegs = _NTILE / RegLen; + static int constexpr MRegs = _MTILE == 0 ? 
(RegCount - 1 - NRegs) / (NRegs * 2) : _MTILE; + static_assert(NRegs * MRegs <= RegCount - 1); + static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 4; + static int constexpr KUNROLL = 2; + static auto constexpr ISA = BTLA_ISA::AVX512_VNNI; + static auto constexpr COMPUTE = CompType::COMP_INT8_US_FP32; + typedef uint8_t AType; + typedef int8_t BType; + typedef float CType; + + struct params { + AType* matA; + int astride; + BType* matB; + int bstride; + CType* matC; + int cstride; + uint8_t* zpA; + float* scaleA; + int ldsa; + float* scaleB; + float* reduceB; + int ldsb; + int k; + int n; + int kblock; + int init; + float kscale; + }; + typedef long long (*func_t)(params*); + + int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0; + int CReg = 0, CF32Reg = 0, BReg = 0, AReg = 0, TmpReg = 0; + static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); + static int constexpr AKStepSize = KTILE * sizeof(AType); + + void generate_code(int _mtile) { + assign_regs(); + reset(); + generate_mtile(_mtile); + ready(); + mKernel = getCode(); + } + func_t mKernel = nullptr; + + protected: + Xbyak::Reg64 parambase; + Xbyak::Reg64 reg_matAptr; + Xbyak::Reg64 reg_matBptr; + Xbyak::Reg64 reg_matCptr; + Xbyak::Reg64 reg_ksize; + Xbyak::Reg64 reg_nsize; + Xbyak::Reg64 reg_cstride; + Xbyak::Reg64 reg_astride; + Xbyak::Reg64 reg_iterk; + Xbyak::Reg64 reg_iterkb; + Xbyak::Reg64 reg_itern; + Xbyak::Reg64 reg_tmp; + Xbyak::Reg64 reg_tmp1; + Xbyak::Reg64 reg_tmp2; + Xbyak::Reg64 reg_tmp3; + Xbyak::Reg64 reg_tmp4; + Xbyak::Reg64 reg_ret = rax; + + void assign_regs() { + CRegCount = MRegs * NRegs; + ARegCount = 1; + BRegCount = NRegs; + CReg = 0; + CF32Reg = CReg + CRegCount; + BReg = CF32Reg + CRegCount; + AReg = BReg + BRegCount; + TmpReg = AReg + ARegCount; + assert(TmpReg < RegCount); + TmpRegCount = RegCount - TmpReg; + assert(TmpRegCount >= 1); + } + + void generate_mtile(int _mtile) { + inLocalLabel(); // use local label for multiple instance + Xbyak::util::StackFrame st(this, 1, 13, 16 * 10); + parambase = st.p[0]; + reg_matAptr = st.t[0]; + reg_matBptr = st.t[1]; + reg_matCptr = st.t[0]; + reg_ksize = st.t[2]; + reg_astride = st.t[3]; + reg_cstride = st.t[3]; + reg_iterk = st.t[4]; + reg_iterkb = st.t[12]; + reg_tmp = st.t[5]; + reg_tmp1 = st.t[6]; + reg_tmp2 = st.t[7]; + reg_tmp3 = st.t[10]; + reg_tmp4 = st.t[11]; + reg_nsize = st.t[8]; + reg_itern = st.t[9]; + reg_ret = rax; + + vreg_push(rsp); + + load32(reg_ksize, ptr[parambase + OFFSET(k)]); + load32(reg_nsize, ptr[parambase + OFFSET(n)]); + xor_(reg_itern, reg_itern); + L(".nloop"); + init_regs(_mtile); + mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); + load32(reg_astride, ptr[parambase + OFFSET(astride)]); + mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); + load32(reg_tmp, ptr[parambase + OFFSET(bstride)]); + imul(reg_tmp, reg_itern); + lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); + xor_(reg_iterk, reg_iterk); + generate_kloop(_mtile); + write_back(_mtile); + add(reg_itern, NTILE); + cmp(reg_itern, reg_nsize); + jb(".nloop"); + mov(reg_ret, 0); + vreg_pop(rsp); + + outLocalLabel(); // end of local label + } + + void generate_kloop(int _mtile) { + inLocalLabel(); + xor_(reg_iterkb, reg_iterkb); + L(".kloop"); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vpxorq(Xbyak::Zmm(CReg + i * NRegs + j), Xbyak::Zmm(CReg + i * NRegs + j), Xbyak::Zmm(CReg + i * NRegs + j)); + } + } + xor_(reg_tmp2, reg_tmp2); + load32(reg_tmp3, ptr[parambase + OFFSET(kblock)]); + mov(reg_tmp, reg_tmp3); + 
padto_le(reg_tmp, KUNROLL * KTILE); + cmp(reg_tmp, 0); + jz(".kbloop", T_NEAR); + L(".unkbloop"); + generate_fma(_mtile, KUNROLL, reg_tmp1); + add(reg_matAptr, KUNROLL * AKStepSize); + add(reg_matBptr, KUNROLL * BKStepSize); + add(reg_tmp2, KUNROLL * KTILE); + cmp(reg_tmp2, reg_tmp); + jb(".unkbloop"); + cmp(reg_tmp, reg_tmp3); + jge(".kend", T_NEAR); + L(".kbloop"); + generate_fma(_mtile, 1, reg_tmp1); + add(reg_matAptr, 1 * AKStepSize); + add(reg_matBptr, 1 * BKStepSize); + add(reg_tmp2, 1 * KTILE); + cmp(reg_tmp2, reg_tmp3); + jb(".kbloop"); + L(".kend"); + add(reg_iterk, reg_tmp2); + generate_f32_accumulate(_mtile); + generate_zp_correction(_mtile); + inc(reg_iterkb); + cmp(reg_iterk, reg_ksize); // k iteration variable + jb(".kloop"); + + outLocalLabel(); + } + + void generate_fma(int _mtile, int _ktile, Xbyak::Reg64& tmp) { + for (int kk = 0; kk < _ktile; kk++) { + lea(tmp, ptr[reg_matAptr + kk * AKStepSize]); + for (int i = 0; i < NRegs; i++) { + vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); + } + for (int mm = 0; mm < _mtile; mm++) { + vpbroadcastd(vreg_t(AReg), ptr[reg_tmp1]); + add(reg_tmp1, reg_astride); + for (int i = 0; i < NRegs; i++) { + vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i)); + } + } + } + } + + void init_regs(int _mtile) { + inLocalLabel(); + load32(reg_tmp, ptr[parambase + OFFSET(init)]); + cmp(reg_tmp, 0); + je(".read", T_NEAR); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vxor(vreg_t(CF32Reg + i * NRegs + j), vreg_t(CF32Reg + i * NRegs + j), vreg_t(CF32Reg + i * NRegs + j)); + } + } + jmp(".end", T_NEAR); + L(".read"); + mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); + lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); + load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vmovups(vreg_t(CF32Reg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]); + } + add(reg_matCptr, reg_cstride); + } + L(".end"); + outLocalLabel(); + } + + void generate_f32_accumulate(int _mtile) { + load32(reg_tmp, ptr[parambase + OFFSET(ldsb)]); + imul(reg_tmp, reg_iterkb); + mov(reg_tmp2, ptr[parambase + OFFSET(scaleB)]); + lea(reg_tmp2, ptr[reg_tmp2 + reg_tmp * sizeof(float)]); + lea(reg_tmp2, ptr[reg_tmp2 + reg_itern * sizeof(float)]); + + mov(reg_tmp, ptr[parambase + OFFSET(scaleA)]); + lea(reg_tmp, ptr[reg_tmp + reg_iterkb * sizeof(float)]); + load32(reg_tmp1, ptr[parambase + OFFSET(ldsa)]); + for (int i = 0; i < NRegs; i++) { + vmovups(Xbyak::Zmm(BReg + i), ptr[reg_tmp2 + i * VecBytes]); + } + for (int mm = 0; mm < _mtile; mm++) { + vbroadcastss(Xbyak::Zmm(TmpReg), ptr[reg_tmp]); + lea(reg_tmp, ptr[reg_tmp + reg_tmp1 * sizeof(float)]); + for (int i = 0; i < NRegs; i++) { + vcvtdq2ps(Xbyak::Zmm(CReg + mm * NRegs + i), Xbyak::Zmm(CReg + mm * NRegs + i)); + vmulps(Xbyak::Zmm(AReg), Xbyak::Zmm(TmpReg), Xbyak::Zmm(BReg + i)); + vmulps(Xbyak::Zmm(CReg + mm * NRegs + i), Xbyak::Zmm(AReg)); + vaddps(Xbyak::Zmm(CF32Reg + mm * NRegs + i), Xbyak::Zmm(CReg + mm * NRegs + i)); + } + } + } + + void generate_zp_correction(int _mtile) { + inLocalLabel(); + mov(reg_tmp, ptr[parambase + OFFSET(zpA)]); + cmp(reg_tmp, 0); + je(".NOZP", T_NEAR); + lea(reg_tmp, ptr[reg_tmp + reg_iterkb * sizeof(AType)]); + auto& reg_zpA = reg_tmp; + + load32(reg_tmp1, ptr[parambase + OFFSET(ldsb)]); + imul(reg_tmp1, reg_iterkb); + mov(reg_tmp2, ptr[parambase + OFFSET(reduceB)]); + lea(reg_tmp2, ptr[reg_tmp2 + reg_tmp1 * sizeof(float)]); + 
lea(reg_tmp2, ptr[reg_tmp2 + reg_itern * sizeof(float)]); + auto& reg_redB = reg_tmp2; + + mov(reg_tmp1, ptr[parambase + OFFSET(scaleA)]); + lea(reg_tmp1, ptr[reg_tmp1 + reg_iterkb * sizeof(float)]); + auto& reg_scaleA = reg_tmp1; + + load32(reg_tmp3, ptr[parambase + OFFSET(ldsa)]); + auto& reg_ldsa = reg_tmp3; + for (int i = 0; i < NRegs; i++) { + vmovups(Xbyak::Zmm(BReg + i), ptr[reg_redB + i * VecBytes]); + } + + vbroadcastss(vreg_t(TmpReg), ptr[parambase + OFFSET(kscale)]); + auto& reg_kscale = reg_tmp2; + + for (int i = 0; i < _mtile; i++) { + vpbroadcastb(Xbyak::Xmm(AReg), ptr[reg_zpA]); + vpmovzxbd(Xbyak::Zmm(AReg), Xbyak::Xmm(AReg)); + vcvtdq2ps(Xbyak::Zmm(AReg), Xbyak::Zmm(AReg)); + vmulps(Xbyak::Zmm(AReg), Xbyak::Zmm(AReg), zword_b[reg_scaleA]); + vmulps(Xbyak::Zmm(AReg), Xbyak::Zmm(AReg), vreg_t(TmpReg)); + for (int j = 0; j < NRegs; j++) { + vmulps(Xbyak::Zmm(CReg + j), Xbyak::Zmm(AReg), Xbyak::Zmm(BReg + j)); + vsubps(Xbyak::Zmm(CF32Reg + i * NRegs + j), Xbyak::Zmm(CReg + j)); + } + lea(reg_zpA, ptr[reg_zpA + reg_ldsa * sizeof(AType)]); + lea(reg_scaleA, ptr[reg_scaleA + reg_ldsa * sizeof(float)]); + } + L(".NOZP"); + outLocalLabel(); + } + + void write_back(int _mtile) { + inLocalLabel(); + mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); + load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); + lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CF32Reg + i * NRegs + j)); + } + add(reg_matCptr, reg_cstride); + } + outLocalLabel(); + } +}; + +template +class AvxvnniN8P4 : protected bestla::xbyak::JitAvxvnni { + public: + static int constexpr RegLen = 8, PackRow = 4; + static_assert(_NTILE % RegLen == 0); + static int constexpr NRegs = _NTILE / RegLen; + static int constexpr MRegs = _MTILE == 0 ? (RegCount - 3) / (NRegs * 2) : _MTILE; + static_assert(NRegs * MRegs <= RegCount - 3); + static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 4; + static int constexpr KUNROLL = 2; + static auto constexpr ISA = BTLA_ISA::AVX_VNNI; + static auto constexpr COMPUTE = + std::is_same_v ? 
CompType::COMP_INT8_US_FP32 : CompType::COMP_INT8_SS_FP32; + using AType = AT; + typedef int8_t BType; + typedef float CType; + + struct params { + AType* matA; + int astride; + BType* matB; + int bstride; + CType* matC; + int cstride; + uint8_t* zpA; + float* scaleA; + int ldsa; + float* scaleB; + float* reduceB; + int ldsb; + int k; + int n; + int kblock; + int init; + float kscale; + }; + typedef long long (*func_t)(params*); + + int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0; + int CReg = 0, CF32Reg = 0, BReg = 0, AReg = 0, TmpReg = 0; + static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); + static int constexpr AKStepSize = KTILE * sizeof(AType); + + void generate_code(int _mtile) { + assign_regs(); + reset(); + generate_mtile(_mtile); + ready(); + mKernel = getCode(); + } + func_t mKernel = nullptr; + + protected: + Xbyak::Reg64 parambase; + Xbyak::Reg64 reg_matAptr; + Xbyak::Reg64 reg_matBptr; + Xbyak::Reg64 reg_matCptr; + Xbyak::Reg64 reg_ksize; + Xbyak::Reg64 reg_nsize; + Xbyak::Reg64 reg_cstride; + Xbyak::Reg64 reg_astride; + Xbyak::Reg64 reg_iterk; + Xbyak::Reg64 reg_iterkb; + Xbyak::Reg64 reg_itern; + Xbyak::Reg64 reg_tmp; + Xbyak::Reg64 reg_tmp1; + Xbyak::Reg64 reg_tmp2; + Xbyak::Reg64 reg_tmp3; + Xbyak::Reg64 reg_tmp4; + Xbyak::Reg64 reg_ret = rax; + + void assign_regs() { + CRegCount = MRegs * NRegs; + ARegCount = 1; + BRegCount = RegCount - CRegCount - CRegCount - ARegCount - 2; + if (BRegCount >= NRegs) { + BRegCount = NRegs; + } else { + BRegCount = 0; + } + CReg = 0; + CF32Reg = CReg + CRegCount; + BReg = CF32Reg + CRegCount; + AReg = BReg + BRegCount; + TmpReg = AReg + ARegCount; + assert(TmpReg < RegCount); + TmpRegCount = RegCount - TmpReg; + assert(TmpRegCount >= 2); + } + + void generate_mtile(int _mtile) { + inLocalLabel(); // use local label for multiple instance + Xbyak::util::StackFrame st(this, 1, 13, 16 * 10); + parambase = st.p[0]; + reg_matAptr = st.t[0]; + reg_matBptr = st.t[1]; + reg_matCptr = st.t[0]; + reg_ksize = st.t[2]; + reg_astride = st.t[3]; + reg_cstride = st.t[3]; + reg_iterk = st.t[4]; + reg_iterkb = st.t[12]; + reg_tmp = st.t[5]; + reg_tmp1 = st.t[6]; + reg_tmp2 = st.t[7]; + reg_tmp3 = st.t[10]; + reg_tmp4 = st.t[11]; + reg_nsize = st.t[8]; + reg_itern = st.t[9]; + reg_ret = rax; + + vreg_push(rsp); + + load32(reg_ksize, ptr[parambase + OFFSET(k)]); + load32(reg_nsize, ptr[parambase + OFFSET(n)]); + xor_(reg_itern, reg_itern); + L(".nloop"); + init_regs(_mtile); + mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); + load32(reg_astride, ptr[parambase + OFFSET(astride)]); + mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); + load32(reg_tmp, ptr[parambase + OFFSET(bstride)]); + imul(reg_tmp, reg_itern); + lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); + xor_(reg_iterk, reg_iterk); + generate_kloop(_mtile); + write_back(_mtile); + add(reg_itern, NTILE); + cmp(reg_itern, reg_nsize); + jb(".nloop"); + mov(reg_ret, 0); + vreg_pop(rsp); + + outLocalLabel(); // end of local label + } + + void generate_kloop(int _mtile) { + inLocalLabel(); + xor_(reg_iterkb, reg_iterkb); + L(".kloop"); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vxor(vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j)); + } + } + xor_(reg_tmp2, reg_tmp2); + load32(reg_tmp3, ptr[parambase + OFFSET(kblock)]); + mov(reg_tmp, reg_tmp3); + padto_le(reg_tmp, KUNROLL * KTILE); + cmp(reg_tmp, 0); + jz(".kbloop", T_NEAR); + L(".unkbloop"); + generate_fma(_mtile, KUNROLL, reg_tmp1); + add(reg_matAptr, 
KUNROLL * AKStepSize); + add(reg_matBptr, KUNROLL * BKStepSize); + add(reg_tmp2, KUNROLL * KTILE); + cmp(reg_tmp2, reg_tmp); + jb(".unkbloop"); + cmp(reg_tmp, reg_tmp3); + jge(".kend", T_NEAR); + L(".kbloop"); + generate_fma(_mtile, 1, reg_tmp1); + add(reg_matAptr, 1 * AKStepSize); + add(reg_matBptr, 1 * BKStepSize); + add(reg_tmp2, 1 * KTILE); + cmp(reg_tmp2, reg_tmp3); + jb(".kbloop"); + L(".kend"); + add(reg_iterk, reg_tmp2); + generate_f32_accumulate(_mtile); + generate_zp_correction(_mtile); + inc(reg_iterkb); + cmp(reg_iterk, reg_ksize); // k iteration variable + jb(".kloop"); + + outLocalLabel(); + } + + void generate_fma(int _mtile, int _ktile, Xbyak::Reg64& tmp) { + for (int kk = 0; kk < _ktile; kk++) { + lea(tmp, ptr[reg_matAptr + kk * AKStepSize]); + if (BRegCount == 0) { + for (int mm = 0; mm < _mtile; mm++) { + vpbroadcastd(vreg_t(AReg), ptr[reg_tmp1]); + if constexpr (std::is_same_v) { + vpsignb(vreg_t(TmpReg + 1), vreg_t(AReg), vreg_t(AReg)); + } + add(reg_tmp1, reg_astride); + for (int i = 0; i < NRegs; i++) { + if constexpr (std::is_same_v) { + vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), + ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); + } else { + vmovups(vreg_t(TmpReg), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); + vpsignb(vreg_t(TmpReg), vreg_t(TmpReg), vreg_t(AReg)); + vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(TmpReg + 1), vreg_t(TmpReg)); + } + } + } + } else { + for (int i = 0; i < NRegs; i++) { + vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); + } + for (int mm = 0; mm < _mtile; mm++) { + vpbroadcastd(vreg_t(AReg), ptr[reg_tmp1]); + if constexpr (std::is_same_v) { + vpsignb(vreg_t(TmpReg + 1), vreg_t(AReg), vreg_t(AReg)); + } + add(reg_tmp1, reg_astride); + for (int i = 0; i < NRegs; i++) { + if constexpr (std::is_same_v) { + vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i)); + } else { + vpsignb(vreg_t(TmpReg), vreg_t(BReg + i), vreg_t(AReg)); + vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(TmpReg + 1), vreg_t(TmpReg)); + } + } + } + } + } + } + + void init_regs(int _mtile) { + inLocalLabel(); + load32(reg_tmp, ptr[parambase + OFFSET(init)]); + cmp(reg_tmp, 0); + je(".read", T_NEAR); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vxor(vreg_t(CF32Reg + i * NRegs + j), vreg_t(CF32Reg + i * NRegs + j), vreg_t(CF32Reg + i * NRegs + j)); + } + } + jmp(".end", T_NEAR); + L(".read"); + mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); + lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); + load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vmovups(vreg_t(CF32Reg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]); + } + add(reg_matCptr, reg_cstride); + } + L(".end"); + outLocalLabel(); + } + + void generate_f32_accumulate(int _mtile) { + load32(reg_tmp, ptr[parambase + OFFSET(ldsb)]); + imul(reg_tmp, reg_iterkb); + mov(reg_tmp2, ptr[parambase + OFFSET(scaleB)]); + lea(reg_tmp2, ptr[reg_tmp2 + reg_tmp * sizeof(float)]); + lea(reg_tmp2, ptr[reg_tmp2 + reg_itern * sizeof(float)]); + + mov(reg_tmp, ptr[parambase + OFFSET(scaleA)]); + lea(reg_tmp, ptr[reg_tmp + reg_iterkb * sizeof(float)]); + load32(reg_tmp1, ptr[parambase + OFFSET(ldsa)]); + if (BRegCount == NRegs) { + for (int i = 0; i < NRegs; i++) { + vmovups(vreg_t(BReg + i), ptr[reg_tmp2 + i * VecBytes]); + } + for (int mm = 0; mm < _mtile; mm++) { + vbroadcastss(vreg_t(TmpReg), ptr[reg_tmp]); + lea(reg_tmp, 
ptr[reg_tmp + reg_tmp1 * sizeof(float)]); + for (int i = 0; i < NRegs; i++) { + vcvtdq2ps(vreg_t(CReg + mm * NRegs + i), vreg_t(CReg + mm * NRegs + i)); + vmulps(vreg_t(AReg), vreg_t(TmpReg), vreg_t(BReg + i)); + vmulps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg)); + vaddps(vreg_t(CF32Reg + mm * NRegs + i), vreg_t(CReg + mm * NRegs + i)); + } + } + } else { + for (int mm = 0; mm < _mtile; mm++) { + vbroadcastss(vreg_t(TmpReg), ptr[reg_tmp]); + lea(reg_tmp, ptr[reg_tmp + reg_tmp1 * sizeof(float)]); + for (int i = 0; i < NRegs; i++) { + vcvtdq2ps(vreg_t(CReg + mm * NRegs + i), vreg_t(CReg + mm * NRegs + i)); + vmovups(vreg_t(AReg), ptr[reg_tmp2 + i * VecBytes]); + vmulps(vreg_t(AReg), vreg_t(TmpReg)); + vmulps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg)); + vaddps(vreg_t(CF32Reg + mm * NRegs + i), vreg_t(CReg + mm * NRegs + i)); + } + } + } + } + + void generate_zp_correction(int _mtile) { + inLocalLabel(); + mov(reg_tmp, ptr[parambase + OFFSET(zpA)]); + cmp(reg_tmp, 0); + je(".NOZP", T_NEAR); + lea(reg_tmp, ptr[reg_tmp + reg_iterkb * sizeof(AType)]); + auto& reg_zpA = reg_tmp; + load32(reg_tmp1, ptr[parambase + OFFSET(ldsb)]); + imul(reg_tmp1, reg_iterkb); + mov(reg_tmp2, ptr[parambase + OFFSET(reduceB)]); + lea(reg_tmp2, ptr[reg_tmp2 + reg_tmp1 * sizeof(float)]); + lea(reg_tmp2, ptr[reg_tmp2 + reg_itern * sizeof(float)]); + auto& reg_redB = reg_tmp2; + + mov(reg_tmp1, ptr[parambase + OFFSET(scaleA)]); + lea(reg_tmp1, ptr[reg_tmp1 + reg_iterkb * sizeof(float)]); + auto& reg_scaleA = reg_tmp1; + + load32(reg_tmp3, ptr[parambase + OFFSET(ldsa)]); + auto& reg_ldsa = reg_tmp3; + + vbroadcastss(vreg_t(TmpReg), ptr[parambase + OFFSET(kscale)]); + auto& reg_kscale = reg_tmp4; + if (BRegCount == NRegs) { + for (int i = 0; i < NRegs; i++) { + vmovups(vreg_t(BReg + i), ptr[reg_redB + i * VecBytes]); + } + for (int i = 0; i < _mtile; i++) { + vpbroadcastb(Xbyak::Xmm(AReg), ptr[reg_zpA]); + vpmovzxbd(vreg_t(AReg), Xbyak::Xmm(AReg)); + vcvtdq2ps(vreg_t(AReg), vreg_t(AReg)); + vbroadcastss(vreg_t(TmpReg + 1), ptr[reg_scaleA]); + vmulps(vreg_t(AReg), vreg_t(AReg), vreg_t(TmpReg + 1)); + vmulps(vreg_t(AReg), vreg_t(AReg), vreg_t(TmpReg)); + for (int j = 0; j < NRegs; j++) { + vmulps(vreg_t(CReg + j), vreg_t(AReg), vreg_t(BReg + j)); + vsubps(vreg_t(CF32Reg + i * NRegs + j), vreg_t(CReg + j)); + } + lea(reg_zpA, ptr[reg_zpA + reg_ldsa * sizeof(AType)]); + lea(reg_scaleA, ptr[reg_scaleA + reg_ldsa * sizeof(float)]); + } + } else { + for (int i = 0; i < _mtile; i++) { + vpbroadcastb(Xbyak::Xmm(AReg), ptr[reg_zpA]); + vpmovzxbd(vreg_t(AReg), Xbyak::Xmm(AReg)); + vcvtdq2ps(vreg_t(AReg), vreg_t(AReg)); + vbroadcastss(vreg_t(TmpReg + 1), ptr[reg_scaleA]); + vmulps(vreg_t(AReg), vreg_t(AReg), vreg_t(TmpReg + 1)); + vmulps(vreg_t(AReg), vreg_t(AReg), vreg_t(TmpReg)); + for (int j = 0; j < NRegs; j++) { + vmulps(vreg_t(CReg + j), vreg_t(AReg), ptr[reg_redB + j * VecBytes]); + vsubps(vreg_t(CF32Reg + i * NRegs + j), vreg_t(CReg + j)); + } + lea(reg_zpA, ptr[reg_zpA + reg_ldsa * sizeof(AType)]); + lea(reg_scaleA, ptr[reg_scaleA + reg_ldsa * sizeof(float)]); + } + } + + L(".NOZP"); + outLocalLabel(); + } + + void write_back(int _mtile) { + inLocalLabel(); + mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); + load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); + lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CF32Reg + i * NRegs + j)); + } + add(reg_matCptr, reg_cstride); + } + 
outLocalLabel(); + } +}; + +template +using AvxvnniN8P4U8 = kblock::AvxvnniN8P4; +template +using AvxvnniN8P4S8 = kblock::AvxvnniN8P4; + +template +class Avx2vnniN8P4 : protected bestla::xbyak::JitAvx2 { + public: + static int constexpr RegLen = 8, PackRow = 4; + static_assert(_NTILE % RegLen == 0); + static int constexpr NRegs = _NTILE / RegLen; + static int constexpr TmpReserve = std::is_same_v ? 2 : 4; + static int constexpr MRegs = _MTILE == 0 ? (RegCount - (TmpReserve + 1)) / (NRegs * 2) : _MTILE; + static_assert(NRegs * MRegs <= RegCount - (TmpReserve + 1)); + static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 4; + static int constexpr KUNROLL = 2; + static auto constexpr ISA = BTLA_ISA::AVX2; + static auto constexpr COMPUTE = + std::is_same_v ? CompType::COMP_INT8_US_FP32 : CompType::COMP_INT8_SS_FP32; + using AType = AT; + typedef int8_t BType; + typedef float CType; + + struct params { + AType* matA; + int astride; + BType* matB; + int bstride; + CType* matC; + int cstride; + uint8_t* zpA; + float* scaleA; + int ldsa; + float* scaleB; + float* reduceB; + int ldsb; + int k; + int n; + int kblock; + int init; + float kscale; + const uint16_t one = 1; + }; + typedef long long (*func_t)(params*); + + int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0; + int CReg = 0, CF32Reg = 0, BReg = 0, AReg = 0, TmpReg = 0; + static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); + static int constexpr AKStepSize = KTILE * sizeof(AType); + + void generate_code(int _mtile) { + assign_regs(); + reset(); + generate_mtile(_mtile); + ready(); + mKernel = getCode(); + } + func_t mKernel = nullptr; + + protected: + Xbyak::Reg64 parambase; + Xbyak::Reg64 reg_matAptr; + Xbyak::Reg64 reg_matBptr; + Xbyak::Reg64 reg_matCptr; + Xbyak::Reg64 reg_ksize; + Xbyak::Reg64 reg_nsize; + Xbyak::Reg64 reg_cstride; + Xbyak::Reg64 reg_astride; + Xbyak::Reg64 reg_iterk; + Xbyak::Reg64 reg_iterkb; + Xbyak::Reg64 reg_itern; + Xbyak::Reg64 reg_tmp; + Xbyak::Reg64 reg_tmp1; + Xbyak::Reg64 reg_tmp2; + Xbyak::Reg64 reg_tmp3; + Xbyak::Reg64 reg_tmp4; + Xbyak::Reg64 reg_ret = rax; + + void assign_regs() { + CRegCount = MRegs * NRegs; + ARegCount = 1; + BRegCount = RegCount - CRegCount - CRegCount - ARegCount - TmpReserve; + if (BRegCount >= NRegs) { + BRegCount = NRegs; + } else { + BRegCount = 0; + } + CReg = 0; + CF32Reg = CReg + CRegCount; + BReg = CF32Reg + CRegCount; + AReg = BReg + BRegCount; + TmpReg = AReg + ARegCount; + assert(TmpReg < RegCount); + TmpRegCount = RegCount - TmpReg; + assert(TmpRegCount >= TmpReserve); + } + + void generate_mtile(int _mtile) { + inLocalLabel(); // use local label for multiple instance + Xbyak::util::StackFrame st(this, 1, 13, 16 * 10); + parambase = st.p[0]; + reg_matAptr = st.t[0]; + reg_matBptr = st.t[1]; + reg_matCptr = st.t[0]; + reg_ksize = st.t[2]; + reg_astride = st.t[3]; + reg_cstride = st.t[3]; + reg_iterk = st.t[4]; + reg_iterkb = st.t[12]; + reg_tmp = st.t[5]; + reg_tmp1 = st.t[6]; + reg_tmp2 = st.t[7]; + reg_tmp3 = st.t[10]; + reg_tmp4 = st.t[11]; + reg_nsize = st.t[8]; + reg_itern = st.t[9]; + reg_ret = rax; + + vreg_push(rsp); + load32(reg_ksize, ptr[parambase + OFFSET(k)]); + load32(reg_nsize, ptr[parambase + OFFSET(n)]); + xor_(reg_itern, reg_itern); + L(".nloop"); + init_regs(_mtile); + mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); + load32(reg_astride, ptr[parambase + OFFSET(astride)]); + mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); + load32(reg_tmp, ptr[parambase + OFFSET(bstride)]); + imul(reg_tmp, reg_itern); + 
lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); + xor_(reg_iterk, reg_iterk); + generate_kloop(_mtile); + write_back(_mtile); + add(reg_itern, NTILE); + cmp(reg_itern, reg_nsize); + jb(".nloop"); + mov(reg_ret, 0); + vreg_pop(rsp); + + outLocalLabel(); // end of local label + } + + void generate_kloop(int _mtile) { + inLocalLabel(); + xor_(reg_iterkb, reg_iterkb); + L(".kloop"); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vxor(vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j)); + } + } + vpbroadcastw(vreg_t(TmpReg + 0), ptr[parambase + OFFSET(one)]); + xor_(reg_tmp2, reg_tmp2); + load32(reg_tmp3, ptr[parambase + OFFSET(kblock)]); + mov(reg_tmp, reg_tmp3); + padto_le(reg_tmp, KUNROLL * KTILE); + cmp(reg_tmp, 0); + jz(".kbloop", T_NEAR); + L(".unkbloop"); + generate_fma(_mtile, KUNROLL, reg_tmp1); + add(reg_matAptr, KUNROLL * AKStepSize); + add(reg_matBptr, KUNROLL * BKStepSize); + add(reg_tmp2, KUNROLL * KTILE); + cmp(reg_tmp2, reg_tmp); + jb(".unkbloop"); + cmp(reg_tmp, reg_tmp3); + jge(".kend", T_NEAR); + L(".kbloop"); + generate_fma(_mtile, 1, reg_tmp1); + add(reg_matAptr, 1 * AKStepSize); + add(reg_matBptr, 1 * BKStepSize); + add(reg_tmp2, 1 * KTILE); + cmp(reg_tmp2, reg_tmp3); + jb(".kbloop"); + L(".kend"); + add(reg_iterk, reg_tmp2); + generate_f32_accumulate(_mtile); + generate_zp_correction(_mtile); + inc(reg_iterkb); + cmp(reg_iterk, reg_ksize); // k iteration variable + jb(".kloop"); + + outLocalLabel(); + } + + void generate_fma(int _mtile, int _ktile, Xbyak::Reg64& tmp) { + for (int kk = 0; kk < _ktile; kk++) { + lea(tmp, ptr[reg_matAptr + kk * AKStepSize]); + if (BRegCount == 0) { + for (int mm = 0; mm < _mtile; mm++) { + vpbroadcastd(vreg_t(AReg), ptr[reg_tmp1]); + if constexpr (std::is_same_v) { + vpsignb(vreg_t(TmpReg + 2), vreg_t(AReg), vreg_t(AReg)); + } + add(reg_tmp1, reg_astride); + for (int i = 0; i < NRegs; i++) { + if constexpr (std::is_same_v) { + vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(TmpReg + 1), vreg_t(AReg), + ptr[reg_matBptr + kk * BKStepSize + i * VecBytes], vreg_t(TmpReg + 0)); + } else { + vmovups(vreg_t(TmpReg + 3), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); + vpsignb(vreg_t(TmpReg + 3), vreg_t(TmpReg + 3), vreg_t(AReg)); + vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(TmpReg + 1), vreg_t(TmpReg + 2), vreg_t(TmpReg + 3), + vreg_t(TmpReg + 0)); + } + } + } + } else { + for (int i = 0; i < NRegs; i++) { + vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); + } + for (int mm = 0; mm < _mtile; mm++) { + vpbroadcastd(vreg_t(AReg), ptr[reg_tmp1]); + if constexpr (std::is_same_v) { + vpsignb(vreg_t(TmpReg + 2), vreg_t(AReg), vreg_t(AReg)); + } + add(reg_tmp1, reg_astride); + for (int i = 0; i < NRegs; i++) { + if constexpr (std::is_same_v) { + vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(TmpReg + 1), vreg_t(AReg), vreg_t(BReg + i), + vreg_t(TmpReg + 0)); + } else { + vpsignb(vreg_t(TmpReg + 3), vreg_t(BReg + i), vreg_t(AReg)); + vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(TmpReg + 1), vreg_t(TmpReg + 2), vreg_t(TmpReg + 3), + vreg_t(TmpReg + 0)); + } + } + } + } + } + } + + void init_regs(int _mtile) { + inLocalLabel(); + load32(reg_tmp, ptr[parambase + OFFSET(init)]); + cmp(reg_tmp, 0); + je(".read", T_NEAR); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vxor(vreg_t(CF32Reg + i * NRegs + j), vreg_t(CF32Reg + i * NRegs + j), vreg_t(CF32Reg + i * NRegs + j)); + } + } + jmp(".end", T_NEAR); + L(".read"); + 
mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); + lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); + load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vmovups(vreg_t(CF32Reg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]); + } + add(reg_matCptr, reg_cstride); + } + L(".end"); + outLocalLabel(); + } + + void generate_f32_accumulate(int _mtile) { + load32(reg_tmp, ptr[parambase + OFFSET(ldsb)]); + imul(reg_tmp, reg_iterkb); + mov(reg_tmp2, ptr[parambase + OFFSET(scaleB)]); + lea(reg_tmp2, ptr[reg_tmp2 + reg_tmp * sizeof(float)]); + lea(reg_tmp2, ptr[reg_tmp2 + reg_itern * sizeof(float)]); + + mov(reg_tmp, ptr[parambase + OFFSET(scaleA)]); + lea(reg_tmp, ptr[reg_tmp + reg_iterkb * sizeof(float)]); + load32(reg_tmp1, ptr[parambase + OFFSET(ldsa)]); + if (BRegCount == NRegs) { + for (int i = 0; i < NRegs; i++) { + vmovups(vreg_t(BReg + i), ptr[reg_tmp2 + i * VecBytes]); + } + for (int mm = 0; mm < _mtile; mm++) { + vbroadcastss(vreg_t(TmpReg), ptr[reg_tmp]); + lea(reg_tmp, ptr[reg_tmp + reg_tmp1 * sizeof(float)]); + for (int i = 0; i < NRegs; i++) { + vcvtdq2ps(vreg_t(CReg + mm * NRegs + i), vreg_t(CReg + mm * NRegs + i)); + vmulps(vreg_t(AReg), vreg_t(TmpReg), vreg_t(BReg + i)); + vmulps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg)); + vaddps(vreg_t(CF32Reg + mm * NRegs + i), vreg_t(CReg + mm * NRegs + i)); + } + } + } else { + for (int mm = 0; mm < _mtile; mm++) { + vbroadcastss(vreg_t(TmpReg), ptr[reg_tmp]); + lea(reg_tmp, ptr[reg_tmp + reg_tmp1 * sizeof(float)]); + for (int i = 0; i < NRegs; i++) { + vcvtdq2ps(vreg_t(CReg + mm * NRegs + i), vreg_t(CReg + mm * NRegs + i)); + vmovups(vreg_t(AReg), ptr[reg_tmp2 + i * VecBytes]); + vmulps(vreg_t(AReg), vreg_t(TmpReg)); + vmulps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg)); + vaddps(vreg_t(CF32Reg + mm * NRegs + i), vreg_t(CReg + mm * NRegs + i)); + } + } + } + } + + void generate_zp_correction(int _mtile) { + inLocalLabel(); + mov(reg_tmp, ptr[parambase + OFFSET(zpA)]); + cmp(reg_tmp, 0); + je(".NOZP", T_NEAR); + lea(reg_tmp, ptr[reg_tmp + reg_iterkb * sizeof(AType)]); + auto& reg_zpA = reg_tmp; + load32(reg_tmp1, ptr[parambase + OFFSET(ldsb)]); + imul(reg_tmp1, reg_iterkb); + mov(reg_tmp2, ptr[parambase + OFFSET(reduceB)]); + lea(reg_tmp2, ptr[reg_tmp2 + reg_tmp1 * sizeof(float)]); + lea(reg_tmp2, ptr[reg_tmp2 + reg_itern * sizeof(float)]); + auto& reg_redB = reg_tmp2; + + mov(reg_tmp1, ptr[parambase + OFFSET(scaleA)]); + lea(reg_tmp1, ptr[reg_tmp1 + reg_iterkb * sizeof(float)]); + auto& reg_scaleA = reg_tmp1; + + load32(reg_tmp3, ptr[parambase + OFFSET(ldsa)]); + auto& reg_ldsa = reg_tmp3; + + vbroadcastss(vreg_t(TmpReg), ptr[parambase + OFFSET(kscale)]); + auto& reg_kscale = reg_tmp4; + if (BRegCount == NRegs) { + for (int i = 0; i < NRegs; i++) { + vmovups(vreg_t(BReg + i), ptr[reg_redB + i * VecBytes]); + } + for (int i = 0; i < _mtile; i++) { + vpbroadcastb(Xbyak::Xmm(AReg), ptr[reg_zpA]); + vpmovzxbd(vreg_t(AReg), Xbyak::Xmm(AReg)); + vcvtdq2ps(vreg_t(AReg), vreg_t(AReg)); + vbroadcastss(vreg_t(TmpReg + 1), ptr[reg_scaleA]); + vmulps(vreg_t(AReg), vreg_t(AReg), vreg_t(TmpReg + 1)); + vmulps(vreg_t(AReg), vreg_t(AReg), vreg_t(TmpReg)); + for (int j = 0; j < NRegs; j++) { + vmulps(vreg_t(CReg + j), vreg_t(AReg), vreg_t(BReg + j)); + vsubps(vreg_t(CF32Reg + i * NRegs + j), vreg_t(CReg + j)); + } + lea(reg_zpA, ptr[reg_zpA + reg_ldsa * sizeof(AType)]); + lea(reg_scaleA, ptr[reg_scaleA + reg_ldsa * sizeof(float)]); + } + } else { + for (int i = 
0; i < _mtile; i++) { + vpbroadcastb(Xbyak::Xmm(AReg), ptr[reg_zpA]); + vpmovzxbd(vreg_t(AReg), Xbyak::Xmm(AReg)); + vcvtdq2ps(vreg_t(AReg), vreg_t(AReg)); + vbroadcastss(vreg_t(TmpReg + 1), ptr[reg_scaleA]); + vmulps(vreg_t(AReg), vreg_t(AReg), vreg_t(TmpReg + 1)); + vmulps(vreg_t(AReg), vreg_t(AReg), vreg_t(TmpReg)); + for (int j = 0; j < NRegs; j++) { + vmulps(vreg_t(CReg + j), vreg_t(AReg), ptr[reg_redB + j * VecBytes]); + vsubps(vreg_t(CF32Reg + i * NRegs + j), vreg_t(CReg + j)); + } + lea(reg_zpA, ptr[reg_zpA + reg_ldsa * sizeof(AType)]); + lea(reg_scaleA, ptr[reg_scaleA + reg_ldsa * sizeof(float)]); + } + } + + L(".NOZP"); + outLocalLabel(); + } + + void write_back(int _mtile) { + inLocalLabel(); + mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); + load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); + lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); + for (int i = 0; i < _mtile; i++) { + for (int j = 0; j < NRegs; j++) { + vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CF32Reg + i * NRegs + j)); + } + add(reg_matCptr, reg_cstride); + } + outLocalLabel(); + } +}; +template <int N, int M> +using Avx2vnniN8P4U8 = kblock::Avx2vnniN8P4<uint8_t, N, M>; +template <int N, int M> +using Avx2vnniN8P4S8 = kblock::Avx2vnniN8P4<int8_t, N, M>; + +template <typename AT, typename BT, int _NTILE, int _MTILE = 0> +class Amxint8N16P4 : protected bestla::xbyak::JitAmxint8 { + public: + static int constexpr RegLen = 16, PackRow = 4; + static_assert(_NTILE % RegLen == 0); + static_assert(_MTILE % RegLen == 0); + static int constexpr NRegs = _NTILE / RegLen; + static int constexpr MRegs = _MTILE == 0 ? 1 : _MTILE / RegLen; + static_assert(NRegs * MRegs + 2 <= TileCount); + static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs * RegLen, KTILE = 64; + static int constexpr KUNROLL = 2; + static auto constexpr ISA = BTLA_ISA::AMX_INT8; + static auto constexpr COMPUTE = (std::is_same_v<AT, int8_t> ? std::is_same_v<BT, int8_t> ? CompType::COMP_INT8_SS_FP32 + : CompType::COMP_INT8_SU_FP32 + : std::is_same_v<BT, int8_t> ?
CompType::COMP_INT8_US_FP32 + : CompType::COMP_INT8_UU_FP32); + using AType = AT; + using BType = BT; + typedef float CType; + + struct params { + AType* matA; + int astride; + BType* matB; + int bstride; + CType* matC; + int cstride; + uint8_t* zpA; + float* scaleA; + int ldsa; + float* scaleB; + float* reduceB; + int ldsb; + int k; + int n; + int kblock; + int init; + float kscale; + void* workspace; + }; + typedef long long (*func_t)(params*); + + int TmpRegCount = RegCount; + int TmpReg = 0; + int CTileCount = 0, ATileCount = 0, BTileCount = 0; + int CTile = 0, ATile = 0, BTile = 0; + static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); + static int constexpr AKStepSize = KTILE * sizeof(AType); + + void generate_code(int _mtile) { + assign_regs(); + reset(); + generate_mtile(_mtile); + ready(); + mKernel = getCode(); + } + func_t mKernel = nullptr; + + protected: + Xbyak::Reg64 parambase; + Xbyak::Reg64 reg_matAptr; + Xbyak::Reg64 reg_matBptr; + Xbyak::Reg64 reg_matCptr; + Xbyak::Reg64 reg_ksize; + Xbyak::Reg64 reg_nsize; + Xbyak::Reg64 reg_cstride; + Xbyak::Reg64 reg_astride; + Xbyak::Reg64 reg_iterk; + Xbyak::Reg64 reg_iterkb; + Xbyak::Reg64 reg_itern; + Xbyak::Reg64 reg_tmp; + Xbyak::Reg64 reg_tmp1; + Xbyak::Reg64 reg_tmp2; + Xbyak::Reg64 reg_tmp3; + Xbyak::Reg64 reg_tmp4; + Xbyak::Reg64 reg_ret = rax; + + void assign_regs() { + CTileCount = NRegs * MRegs; + auto tile_re = TileCount - CTileCount; + if (tile_re - 1 >= NRegs) { + BTileCount = NRegs; + ATileCount = tile_re - BTileCount; + } else if (tile_re - 1 >= MRegs) { + ATileCount = MRegs; + BTileCount = tile_re - ATileCount; + } else { + ATileCount = 1; + BTileCount = tile_re - ATileCount; + } + CTile = 0; + ATile = CTile + CTileCount; + BTile = ATile + ATileCount; + } + + void generate_mtile(int _mtile) { + inLocalLabel(); // use local label for multiple instance + Xbyak::util::StackFrame st(this, 1, 13, 16 * 10); + parambase = st.p[0]; + reg_matAptr = st.t[0]; + reg_matBptr = st.t[1]; + reg_matCptr = st.t[0]; + reg_ksize = st.t[2]; + reg_astride = st.t[3]; + reg_cstride = st.t[3]; + reg_iterk = st.t[4]; + reg_iterkb = st.t[12]; + reg_tmp = st.t[5]; + reg_tmp1 = st.t[6]; + reg_tmp2 = st.t[7]; + reg_tmp3 = st.t[10]; + reg_tmp4 = st.t[11]; + reg_nsize = st.t[8]; + reg_itern = st.t[9]; + reg_ret = rax; + + vreg_push(rsp); + + load32(reg_ksize, ptr[parambase + OFFSET(k)]); + load32(reg_nsize, ptr[parambase + OFFSET(n)]); + xor_(reg_itern, reg_itern); + L(".nloop"); + init_regs(_mtile); + mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); + load32(reg_astride, ptr[parambase + OFFSET(astride)]); + mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); + load32(reg_tmp, ptr[parambase + OFFSET(bstride)]); + imul(reg_tmp, reg_itern); + lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); + xor_(reg_iterk, reg_iterk); + generate_kloop(_mtile); + write_back(_mtile); + add(reg_itern, NTILE); + cmp(reg_itern, reg_nsize); + jb(".nloop"); + mov(reg_ret, 0); + vreg_pop(rsp); + + outLocalLabel(); // end of local label + } + + void generate_kloop(int _mtile) { + inLocalLabel(); + xor_(reg_iterkb, reg_iterkb); + L(".kloop"); + for (int i = 0; i < CTileCount; i++) { + tilezero(Xbyak::Tmm(CTile + i)); + } + xor_(reg_tmp2, reg_tmp2); + load32(reg_tmp3, ptr[parambase + OFFSET(kblock)]); + mov(reg_tmp, reg_tmp3); + padto_le(reg_tmp, KUNROLL * KTILE); + cmp(reg_tmp, 0); + jz(".kbloop", T_NEAR); + L(".unkbloop"); + generate_fma(_mtile, KUNROLL, reg_tmp1, reg_tmp4); + add(reg_matAptr, KUNROLL * AKStepSize); + add(reg_matBptr, KUNROLL * BKStepSize); + 
add(reg_tmp2, KUNROLL * KTILE); + cmp(reg_tmp2, reg_tmp); + jb(".unkbloop"); + cmp(reg_tmp, reg_tmp3); + jge(".kend", T_NEAR); + L(".kbloop"); + generate_fma(_mtile, 1, reg_tmp1, reg_tmp4); + add(reg_matAptr, 1 * AKStepSize); + add(reg_matBptr, 1 * BKStepSize); + add(reg_tmp2, 1 * KTILE); + cmp(reg_tmp2, reg_tmp3); + jb(".kbloop"); + L(".kend"); + add(reg_iterk, reg_tmp2); + generate_f32_accumulate(_mtile); + generate_zp_correction(_mtile); + inc(reg_iterkb); + cmp(reg_iterk, reg_ksize); // k iteration variable + jb(".kloop"); + + outLocalLabel(); + } + + void generate_fma(int _mtile, int kunrll, Xbyak::Reg64& tmpreg, Xbyak::Reg64& tmpreg2) { + auto& reg_Bstride = tmpreg2; + mov(reg_Bstride, NTILE * 4); + int mtiles = _mtile / RegLen; + + for (int kk = 0; kk < kunrll; kk++) { + auto reg_Atmp = tmpreg; + if (mtiles == 1) { + reg_Atmp = reg_matAptr; + } else { + mov(reg_Atmp, reg_matAptr); + } + if (BTileCount == NRegs) { + for (int i = 0; i < NRegs; i++) { + tileloaddt1(Xbyak::Tmm(BTile + i), ptr[reg_matBptr + reg_Bstride + kk * BKStepSize + i * 64]); + } + for (int mm = 0; mm < mtiles; mm++) { + tileloadd(Xbyak::Tmm(ATile), ptr[reg_Atmp + reg_astride + kk * AKStepSize]); + for (int i = 0; i < NRegs; i++) { + _tdpb(Xbyak::Tmm(CTile + mm * NRegs + i), Xbyak::Tmm(ATile), Xbyak::Tmm(BTile + i)); + } + if (mm != mtiles - 1) { + lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); + lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); + } + } + } else { + if (ATileCount == mtiles) { + for (int mm = 0; mm < mtiles; mm++) { + tileloadd(Xbyak::Tmm(ATile + mm), ptr[reg_Atmp + reg_astride + kk * AKStepSize]); + if (mm != mtiles - 1) { + lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); + lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); + } + } + for (int i = 0; i < NRegs; i++) { + tileloaddt1(Xbyak::Tmm(BTile), ptr[reg_matBptr + reg_Bstride + kk * BKStepSize + i * 64]); + for (int mm = 0; mm < mtiles; mm++) { + _tdpb(Xbyak::Tmm(CTile + mm * NRegs + i), Xbyak::Tmm(ATile + mm), Xbyak::Tmm(BTile)); + } + } + } else { + for (int mm = 0; mm < mtiles; mm++) { + tileloadd(Xbyak::Tmm(ATile), ptr[reg_Atmp + reg_astride + kk * AKStepSize]); + for (int i = 0; i < NRegs; i++) { + tileloaddt1(Xbyak::Tmm(BTile), ptr[reg_matBptr + reg_Bstride + kk * BKStepSize + i * 64]); + _tdpb(Xbyak::Tmm(CTile + mm * NRegs + i), Xbyak::Tmm(ATile), Xbyak::Tmm(BTile)); + } + if (mm != mtiles - 1) { + lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); + lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); + } + } + } + } + } + } + + void init_regs(int _mtile) { + inLocalLabel(); + load32(reg_tmp, ptr[parambase + OFFSET(init)]); + cmp(reg_tmp, 0); + je(".end", T_NEAR); + mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); + load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); + lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(float)]); + int zunroll = TmpRegCount / NRegs; + for (int i = 0; i < _mtile; i += zunroll) { + int m_re = utils::remainsize(i, _mtile, zunroll); + for (int im = 0; im < m_re; im++) { + for (int j = 0; j < NRegs; j++) { + vxorps(vreg_t(TmpReg + im * NRegs + j), vreg_t(TmpReg + im * NRegs + j)); + vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(TmpReg + im * NRegs + j)); + } + add(reg_matCptr, reg_cstride); + } + } + L(".end"); + outLocalLabel(); + } + + void generate_f32_accumulate(int _mtile) { + mov(reg_tmp3, ptr[parambase + OFFSET(workspace)]); + mov(reg_tmp1, NTILE * 4); + for (int mm = 0; mm < MRegs; mm++) { + for (int i = 0; i < NRegs; i++) { + tilestored(ptr[reg_tmp3 + reg_tmp1 + i * 64 + mm * 16 * NTILE * 4], 
Xbyak::Tmm(CTile + mm * NRegs + i)); + } + } + load32(reg_tmp, ptr[parambase + OFFSET(ldsb)]); + imul(reg_tmp, reg_iterkb); + mov(reg_tmp2, ptr[parambase + OFFSET(scaleB)]); + lea(reg_tmp2, ptr[reg_tmp2 + reg_tmp * sizeof(float)]); + lea(reg_tmp2, ptr[reg_tmp2 + reg_itern * sizeof(float)]); + + mov(reg_tmp, ptr[parambase + OFFSET(scaleA)]); + lea(reg_tmp, ptr[reg_tmp + reg_iterkb * sizeof(float)]); + load32(reg_tmp1, ptr[parambase + OFFSET(ldsa)]); + int BReg = TmpReg; + int AReg = BReg + NRegs; + int SAReg = AReg + 1; + int CReg = SAReg + 1; + for (int i = 0; i < NRegs; i++) { + vmovups(Xbyak::Zmm(BReg + i), ptr[reg_tmp2 + i * VecBytes]); + } + mov(reg_tmp2, ptr[parambase + OFFSET(matC)]); + lea(reg_tmp2, ptr[reg_tmp2 + reg_itern * sizeof(float)]); + load32(reg_tmp4, dword[parambase + OFFSET(cstride)]); + for (int mm = 0; mm < _mtile; mm++) { + vbroadcastss(Xbyak::Zmm(SAReg), ptr[reg_tmp]); + lea(reg_tmp, ptr[reg_tmp + reg_tmp1 * sizeof(float)]); + for (int i = 0; i < NRegs; i++) { + vcvtdq2ps(Xbyak::Zmm(CReg + i), ptr[reg_tmp3 + i * 64 + mm * 4 * NTILE]); + vmulps(Xbyak::Zmm(AReg), Xbyak::Zmm(SAReg), Xbyak::Zmm(BReg + i)); + vmulps(Xbyak::Zmm(CReg + i), Xbyak::Zmm(AReg)); + vaddps(Xbyak::Zmm(CReg + i), ptr[reg_tmp2 + i * 64]); + vmovups(ptr[reg_tmp2 + i * 64], Xbyak::Zmm(CReg + i)); + } + add(reg_tmp2, reg_tmp4); + } + } + + void generate_zp_correction(int _mtile) { + inLocalLabel(); + mov(reg_tmp, ptr[parambase + OFFSET(zpA)]); + cmp(reg_tmp, 0); + je(".NOZP", T_NEAR); + lea(reg_tmp, ptr[reg_tmp + reg_iterkb * sizeof(AType)]); + auto& reg_zpA = reg_tmp; + + load32(reg_tmp1, ptr[parambase + OFFSET(ldsb)]); + imul(reg_tmp1, reg_iterkb); + mov(reg_tmp2, ptr[parambase + OFFSET(reduceB)]); + lea(reg_tmp2, ptr[reg_tmp2 + reg_tmp1 * sizeof(float)]); + lea(reg_tmp2, ptr[reg_tmp2 + reg_itern * sizeof(float)]); + auto& reg_redB = reg_tmp2; + + mov(reg_tmp1, ptr[parambase + OFFSET(scaleA)]); + lea(reg_tmp1, ptr[reg_tmp1 + reg_iterkb * sizeof(float)]); + auto& reg_scaleA = reg_tmp1; + + load32(reg_tmp3, ptr[parambase + OFFSET(ldsa)]); + auto& reg_ldsa = reg_tmp3; + int BReg = TmpReg; + int AReg = BReg + NRegs; + int SReg = AReg + 1; + int CReg = SReg + 1; + int CF32Reg = CReg + NRegs; + for (int i = 0; i < NRegs; i++) { + vmovups(Xbyak::Zmm(BReg + i), ptr[reg_redB + i * VecBytes]); + } + + vbroadcastss(vreg_t(SReg), ptr[parambase + OFFSET(kscale)]); + mov(reg_tmp2, ptr[parambase + OFFSET(matC)]); + lea(reg_tmp2, ptr[reg_tmp2 + reg_itern * sizeof(float)]); + load32(reg_tmp4, dword[parambase + OFFSET(cstride)]); + + for (int i = 0; i < _mtile; i++) { + vpbroadcastb(Xbyak::Xmm(AReg), ptr[reg_zpA]); + vpmovzxbd(Xbyak::Zmm(AReg), Xbyak::Xmm(AReg)); + vcvtdq2ps(Xbyak::Zmm(AReg), Xbyak::Zmm(AReg)); + vmulps(Xbyak::Zmm(AReg), Xbyak::Zmm(AReg), zword_b[reg_scaleA]); + vmulps(Xbyak::Zmm(AReg), Xbyak::Zmm(AReg), vreg_t(SReg)); + for (int j = 0; j < NRegs; j++) { + vmulps(Xbyak::Zmm(CReg + j), Xbyak::Zmm(AReg), Xbyak::Zmm(BReg + j)); + vmovups(Xbyak::Zmm(CF32Reg + j), ptr[reg_tmp2 + j * 64]); + vsubps(Xbyak::Zmm(CF32Reg + j), Xbyak::Zmm(CReg + j)); + vmovups(ptr[reg_tmp2 + j * 64], Xbyak::Zmm(CF32Reg + j)); + } + add(reg_tmp2, reg_tmp4); + lea(reg_zpA, ptr[reg_zpA + reg_ldsa * sizeof(AType)]); + lea(reg_scaleA, ptr[reg_scaleA + reg_ldsa * sizeof(float)]); + } + L(".NOZP"); + outLocalLabel(); + } + + void write_back(int _mtile) { (void)(_mtile); } +}; +template +using Amxint8N16P4US = kblock::Amxint8N16P4; + +template +using Amxint8N16P4SS = kblock::Amxint8N16P4; +} // namespace kblock +} // namespace code 
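Reference note for the kblock kernels above: generate_f32_accumulate() and generate_zp_correction() implement, in vector registers, the per-k-block epilogue sketched below in scalar C++. The sketch is illustrative only; the function name kblock_epilogue, the dense m_tile x n_tile output layout, and the assumption that all pointers are already offset to the current k-block and N tile are editorial simplifications, not part of the kernels.

#include <cstdint>

// Scalar reference for the per-k-block epilogue (illustrative, not part of the kernels above).
// acc_i32 holds the raw int32 dot-product sums of one k-block, i.e. the contents of the
// CReg/CTile accumulators before vcvtdq2ps; all pointers are assumed pre-offset to the
// current k-block and N tile, and the output tile is assumed dense (cstride folded away).
inline void kblock_epilogue(float* c_f32, const int32_t* acc_i32, int m_tile, int n_tile,
                            const float* scale_a, int ldsa,  // per-row A scales, row stride ldsa
                            const float* scale_b,            // per-column B scales
                            const uint8_t* zp_a,             // per-row A zero points, may be null
                            const float* reduce_b,           // per-column sums of B for this k-block
                            float kscale) {
  for (int i = 0; i < m_tile; ++i) {
    const float sa = scale_a[i * ldsa];
    for (int j = 0; j < n_tile; ++j) {
      // generate_f32_accumulate: dequantize the int32 partial sum and accumulate into fp32 C.
      c_f32[i * n_tile + j] += static_cast<float>(acc_i32[i * n_tile + j]) * sa * scale_b[j];
      // generate_zp_correction: sum_k (a_k - zp) * b_k equals sum_k a_k * b_k - zp * sum_k b_k,
      // so the zero-point contribution is removed with the precomputed column sums reduceB,
      // scaled by scaleA and by the kscale factor from params.
      if (zp_a != nullptr) {
        c_f32[i * n_tile + j] -= static_cast<float>(zp_a[i * ldsa]) * sa * kscale * reduce_b[j];
      }
    }
  }
}

The AMX variant performs the same arithmetic but first spills the int32 tile accumulators to the workspace buffer with tilestored and then reads and updates matC directly, instead of keeping the fp32 tile in registers.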
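The calling convention shared by these classes is also visible above: generate_code(_mtile) JIT-compiles a kernel body for one M tile, and the resulting mKernel pointer is invoked with a filled-in params struct. A minimal driver sketch follows; the template arguments, namespace qualification, stride units, and buffer setup are assumptions for illustration, since the dispatch code that instantiates these kernels is not part of this hunk.

// Hypothetical driver for one of the kblock JIT kernels above.
using Kernel = kblock::Avx2vnniN8P4<uint8_t, 24>;  // assumed: AT = uint8_t, NTILE = 24, default MTILE

long long run_kblock_gemm(uint8_t* a, int astride_bytes, int8_t* packed_b, int bstride_bytes,
                          float* c, int cstride_bytes, uint8_t* zp_a, float* scale_a, int ldsa,
                          float* scale_b, float* reduce_b, int ldsb, int k, int n, int kblock_size) {
  Kernel ker;
  ker.generate_code(Kernel::MTILE);    // emit code handling MTILE rows of A per call
  Kernel::params p{};                  // 'one' keeps its in-class initializer of 1
  p.matA = a;         p.astride = astride_bytes;
  p.matB = packed_b;  p.bstride = bstride_bytes;
  p.matC = c;         p.cstride = cstride_bytes;
  p.zpA = zp_a;       p.scaleA = scale_a;   p.ldsa = ldsa;
  p.scaleB = scale_b; p.reduceB = reduce_b; p.ldsb = ldsb;
  p.k = k; p.n = n; p.kblock = kblock_size;
  p.init = 1;         // non-zero: zero the fp32 accumulators instead of reloading C
  p.kscale = 1.0f;    // extra scale applied to the zero-point correction term
  return ker.mKernel(&p);  // the generated code iterates the full n range internally
}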
+template