From 8bf19d111ded56d8baf8b9a2b2eded2c63088b19 Mon Sep 17 00:00:00 2001 From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com> Date: Mon, 14 Oct 2024 17:27:12 -0700 Subject: [PATCH 01/22] MTK Android Llama Runner --- .../llm_helper/include/llama_runner_values.h | 32 ++ .../executor_runner/mtk_llama_runner.cpp | 333 ++++++++++++++++++ .../executor_runner/mtk_llama_runner.h | 69 ++++ 3 files changed, 434 insertions(+) create mode 100644 examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h create mode 100644 examples/mediatek/executor_runner/mtk_llama_runner.cpp create mode 100644 examples/mediatek/executor_runner/mtk_llama_runner.h diff --git a/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h b/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h new file mode 100644 index 00000000000..98cd8ab394e --- /dev/null +++ b/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h @@ -0,0 +1,32 @@ +#pragma once + +namespace torch::executor { + using llm_helper::LLMType; + + // Sizes + const size_t PROMPT_TOKEN_BATCH_SIZE = 128; + const size_t CACHE_SIZE = 512; + const size_t HIDDEN_SIZE = 4096; + const size_t NUM_HEAD = 32; + const size_t NUM_LAYER = 32; + const size_t MAX_TOKEN_LENGTH = 8192; + const double ROT_EMB_BASE = 500000; + + // Types + const LLMType MODEL_INPUT_TYPE = LLMType::FP32; + const LLMType MODEL_OUTPUT_TYPE = LLMType::FP32; + const LLMType CACHE_TYPE = LLMType::FP32; + const LLMType MASK_TYPE = LLMType::FP32; + const LLMType ROT_EMB_TYPE = LLMType::FP32; + + // Paths + const std::string TOKENIZER_PATH="/data/local/tmp/et-mtk/llama3/tokenizer.model"; + const std::string TOKEN_EMBEDDING_PATH="/data/local/tmp/et-mtk/llama3/embedding_llama3-8B-instruct_fp32.bin"; + + // Comma-Separated Paths + const std::string PROMPT_MODEL_PATHS="/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_0.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_1.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_2.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_3.pte,"; + + // Comma-Separated Paths + const std::string GEN_MODEL_PATHS="/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_0.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_1.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_2.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_3.pte,"; + +} // namespace torch::executor diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.cpp b/examples/mediatek/executor_runner/mtk_llama_runner.cpp new file mode 100644 index 00000000000..ea882cbb2f5 --- /dev/null +++ b/examples/mediatek/executor_runner/mtk_llama_runner.cpp @@ -0,0 +1,333 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 MediaTek Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* Copyright Statement: + * + * This software/firmware and related documentation ("MediaTek Software") are + * protected under relevant copyright laws. The information contained herein + * is confidential and proprietary to MediaTek Inc. and/or its licensors. + * Without the prior written permission of MediaTek inc. 
and/or its licensors, + * any reproduction, modification, use or disclosure of MediaTek Software, + * and information contained herein, in whole or in part, shall be strictly + * prohibited. + */ +/* MediaTek Inc. (C) 2024. All rights reserved. + * + * BY OPENING THIS FILE, RECEIVER HEREBY UNEQUIVOCALLY ACKNOWLEDGES AND AGREES + * THAT THE SOFTWARE/FIRMWARE AND ITS DOCUMENTATIONS ("MEDIATEK SOFTWARE") + * RECEIVED FROM MEDIATEK AND/OR ITS REPRESENTATIVES ARE PROVIDED TO RECEIVER ON + * AN "AS-IS" BASIS ONLY. MEDIATEK EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE OR NONINFRINGEMENT. + * NEITHER DOES MEDIATEK PROVIDE ANY WARRANTY WHATSOEVER WITH RESPECT TO THE + * SOFTWARE OF ANY THIRD PARTY WHICH MAY BE USED BY, INCORPORATED IN, OR + * SUPPLIED WITH THE MEDIATEK SOFTWARE, AND RECEIVER AGREES TO LOOK ONLY TO SUCH + * THIRD PARTY FOR ANY WARRANTY CLAIM RELATING THERETO. RECEIVER EXPRESSLY + * ACKNOWLEDGES THAT IT IS RECEIVER'S SOLE RESPONSIBILITY TO OBTAIN FROM ANY + * THIRD PARTY ALL PROPER LICENSES CONTAINED IN MEDIATEK SOFTWARE. MEDIATEK + * SHALL ALSO NOT BE RESPONSIBLE FOR ANY MEDIATEK SOFTWARE RELEASES MADE TO + * RECEIVER'S SPECIFICATION OR TO CONFORM TO A PARTICULAR STANDARD OR OPEN + * FORUM. RECEIVER'S SOLE AND EXCLUSIVE REMEDY AND MEDIATEK'S ENTIRE AND + * CUMULATIVE LIABILITY WITH RESPECT TO THE MEDIATEK SOFTWARE RELEASED HEREUNDER + * WILL BE, AT MEDIATEK'S OPTION, TO REVISE OR REPLACE THE MEDIATEK SOFTWARE AT + * ISSUE, OR REFUND ANY SOFTWARE LICENSE FEES OR SERVICE CHARGE PAID BY RECEIVER + * TO MEDIATEK FOR SUCH MEDIATEK SOFTWARE AT ISSUE. + * + * The following software/firmware and/or related documentation ("MediaTek + * Software") have been modified by MediaTek Inc. All revisions are subject to + * any receiver's applicable license agreements with MediaTek Inc. + */ + +#include "executorch/backends/mediatek/runtime/include/NeuronBufferAllocator.h" +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +// #include +#include +#include + +#include "llama_runner/ModelChunk.h" +#include "llama_runner/Utils.h" +#include "llama_runner/llm_helper/include/llm_types.h" +#include "llama_runner/llm_helper/include/llama_runner_values.h" + +static uint64_t MAX_RESPONSE = 50; // Maximum number of tokens to generate. +// Global BOS and EOS option for tokenization (encoding) +static constexpr int8_t kAddBos = 1; +static constexpr int8_t kAddEos = 0; + +using namespace torch::executor; +using namespace torch::executor::llm_helper; +using torch::executor::utils::Timer; + +MTKLlamaRunner::MTKLlamaRunner( + const std::string& model_path, + const std::string& tokenizer_path, + const float temperature) + : modeloptions_(get_model_options()), + modelpaths_(get_model_paths()) { + runtime_init(); + ET_LOG( + Info, + "Creating MTK Llama runner. Current it will self-load .pte, .bin, and .so files. 
Initiated runtime_init()."); +} + +Error MTKLlamaRunner::load() { + if (is_loaded()) { + return Error::Ok; + } + + // Load tokenizer + ET_LOG(Info, "Loading tokenizer."); + tokenizer_ = load_tokenizer(); + ET_LOG(Info, "Complete loading tokenizer."); + + // Load prompt model + runtime_ = std::make_unique(); + ET_LOG(Info, "Loading prompt model."); + runtime_->Initialize(modeloptions_, modelpaths_); + ET_LOG(Info, "Complete loading prompt model."); + + return Error::Ok; +} + +bool MTKLlamaRunner::is_loaded() const { + return tokenizer_ && runtime_; +} + +Error MTKLlamaRunner::generate( + const std::string& prompt, + int32_t seq_len, + std::function token_callback, + std::function stats_callback) { + + if (!is_loaded()) { + ET_CHECK_OK_OR_RETURN_ERROR(load()); + } + + // Wrap the token_callback with print function + std::function wrapped_callback = + [token_callback](const std::string& piece) { + util::safe_printf(piece.c_str()); + fflush(stdout); + if (token_callback) { + token_callback(piece); + } + }; + + ET_LOG(Info, "Starting inference from MTKLlamaRunner"); + inference(*runtime_.get(), tokenizer_, prompt, wrapped_callback); + ET_LOG(Info, "Completed inference from MTKLlamaRunner"); + + return Error::Ok; +} + +void MTKLlamaRunner::stop() { + if (is_loaded()) { + runtime_->Release(); + } else { + ET_LOG(Error, "Llama Runtime is not loaded, cannot stop"); + } +} + +LlamaModelOptions MTKLlamaRunner::get_model_options() { + LlamaModelOptions options = { + // Sizes + .prompt_token_batch_size = PROMPT_TOKEN_BATCH_SIZE, + .cache_size = CACHE_SIZE, + .hidden_size = HIDDEN_SIZE, + .num_head = NUM_HEAD, + .num_layer = NUM_LAYER, + .max_token_length = MAX_TOKEN_LENGTH, + .rot_emb_base = ROT_EMB_BASE, + + // Types + .model_input_type = MODEL_INPUT_TYPE, + .model_output_type = MODEL_OUTPUT_TYPE, + .cache_type = CACHE_TYPE, + .mask_type = MASK_TYPE, + .rot_emb_type = ROT_EMB_TYPE}; + ET_LOG(Info, "Completed get_model_options"); + return options; +} + +LlamaModelPaths MTKLlamaRunner::get_model_paths() { + LlamaModelPaths model_paths = { + .tokenizer_path = TOKENIZER_PATH, + .token_embedding_path = TOKEN_EMBEDDING_PATH, + .prompt_model_paths = utils::split(PROMPT_MODEL_PATHS, ','), + .gen_model_paths = utils::split(GEN_MODEL_PATHS, ',')}; + ET_LOG(Info, "Completed get_model_paths"); + return model_paths; +} + +Result MTKLlamaRunner::digest_prompt( + LlamaRuntime& llama_runtime, + const std::unique_ptr& tokenizer, + const std::vector input_tokens) { + const auto input_token_count = input_tokens.size(); + const auto prompt_token_batch_size = llama_runtime.GetTokenBatchSize(); + size_t cur_token_index = 0; + + Timer timer_digest_prompt([=](const auto elapsed_sec) { + // Ideal prompt size is a multiple of prompt batch size + const size_t ideal_prompt_size = + std::ceil(float(input_token_count) / prompt_token_batch_size) * + prompt_token_batch_size; + ET_LOG( + Info, + "Done analyzing prompt in %f sec (%f tok/s)", + elapsed_sec, + (float)ideal_prompt_size / elapsed_sec); + }); + + auto getNextTokens = [&]() { + const size_t num_tok_remain = input_token_count - cur_token_index; + const size_t remainder = num_tok_remain % prompt_token_batch_size; + const size_t num_new_tokens = + remainder ? 
remainder : prompt_token_batch_size; + const auto start = cur_token_index; + const auto end = start + num_new_tokens; + return std::vector( + input_tokens.begin() + start, input_tokens.begin() + end); + }; + + void* logits; + timer_digest_prompt.Start(); + while (cur_token_index < input_token_count) { + const auto next_tokens = getNextTokens(); + ET_LOG( + Debug, + "Digest next tokens (size=%zu), 1st tok=%lu", + next_tokens.size(), + next_tokens[0]); + logits = llama_runtime.Run(next_tokens); + cur_token_index += next_tokens.size(); + } + timer_digest_prompt.End(); + + const auto vocab_size = tokenizer->vocab_size(); + const auto logits_type = llama_runtime.GetModelOptions().model_output_type; + const auto first_output_token = + utils::argmax(logits_type, logits, vocab_size); + return first_output_token; +} + +Error MTKLlamaRunner::gen_response( + LlamaRuntime& llama_runtime, + const std::unique_ptr& tokenizer, + const uint64_t input_token, + std::function token_callback) { + Timer timer_model_swap( + [](const auto elapsed_sec) { ET_LOG(Info, "Model swapped."); }); + + // Swap to gen mode + timer_model_swap.Start(); + llama_runtime.SwapModel(1); + timer_model_swap.End(); + + size_t gen_tok_count = 0; + uint64_t prev_token = input_token; + uint64_t output_token = input_token; + + auto decode_res = tokenizer->decode(prev_token, output_token); + ET_CHECK_OR_RETURN_ERROR( + decode_res.ok(), + InvalidState, + "Tokenizer failed to decode first generated token: %lu", + output_token); + std::string full_response = std::move(decode_res.get()); + std::vector full_response_tokens = {input_token}; + + const auto vocab_size = tokenizer->vocab_size(); + const auto logits_type = llama_runtime.GetModelOptions().model_output_type; + + double gen_total_time_sec = 0; + Timer timer_gen_token( + [&](const auto elapsed_sec) { gen_total_time_sec += elapsed_sec; }); + + // Print first output token + token_callback(full_response); + + while (gen_tok_count++ < MAX_RESPONSE && + llama_runtime.GetTokenIndex() < modeloptions_.max_token_length) { + timer_gen_token.Start(); + void* logits = llama_runtime.Run({output_token}); + timer_gen_token.End(); + + prev_token = output_token; + output_token = utils::argmax(logits_type, logits, vocab_size); + full_response_tokens.push_back(output_token); + + // Stop when output is EOS + if (output_token == tokenizer->eos_tok()) { + token_callback(""); + break; + } + auto decode_res = tokenizer->decode(prev_token, output_token); + ET_CHECK_OR_RETURN_ERROR( + decode_res.ok(), + InvalidState, + "Tokenizer failed to decode generated token %lu", + output_token); + const std::string tok_str = std::move(decode_res.get()); + full_response += tok_str; + token_callback(tok_str); + } + + std::cout << "\n\n[Generated Tokens]\n" + << utils::to_string(full_response_tokens) << std::endl; + + ET_LOG( + Info, + "Token generation speed: %f tok/s", + gen_tok_count / gen_total_time_sec); + + return Error::Ok; +} + +Error MTKLlamaRunner::inference( + LlamaRuntime& llama_runtime, + const std::unique_ptr& tokenizer, + const std::string& prompt, + std::function token_callback) { + // Tokenize input prompt + auto encode_res = tokenizer->encode(prompt, kAddBos, kAddEos); + ET_CHECK_OR_RETURN_ERROR( + encode_res.ok(), InvalidState, "Tokenizer failed to encode prompt"); + const auto input_tokens = std::move(encode_res.get()); + + // Run prompt mode (pre-fill) + auto prefill_res = digest_prompt(llama_runtime, tokenizer, input_tokens); + ET_CHECK_OR_RETURN_ERROR( + prefill_res.ok(), InvalidState, "Failed to 
digest prompt"); + const auto first_output_token = prefill_res.get(); + + // run generation mode (decoding) + return gen_response(llama_runtime, tokenizer, first_output_token, token_callback); +} + +std::unique_ptr MTKLlamaRunner::load_tokenizer() { + std::unique_ptr tokenizer; + // Assumes that tokenizer type is Tiktoken + tokenizer = torch::executor::get_tiktoken_for_llama(); + tokenizer->load(modelpaths_.tokenizer_path); + return tokenizer; +} diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.h b/examples/mediatek/executor_runner/mtk_llama_runner.h new file mode 100644 index 00000000000..d9f85c20257 --- /dev/null +++ b/examples/mediatek/executor_runner/mtk_llama_runner.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// A simple llama2 runner that includes preprocessing and post processing logic. +// The module takes in a string as input and emits a string as output. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "llama_runner/LlamaConfig.h" +#include "llama_runner/LlamaRuntime.h" +using namespace torch::executor; +using Stats = ::executorch::llm::Stats; + +class MTKLlamaRunner { + public: + explicit MTKLlamaRunner( + const std::string& model_path, + const std::string& tokenizer_path, + const float temperature = 0.8f); + + bool is_loaded() const; + Error load(); + Error generate( + const std::string& prompt, + int32_t seq_len = 128, + std::function token_callback = {}, + std::function stats_callback = {}); + void stop(); + + LlamaModelOptions get_model_options(); + LlamaModelPaths get_model_paths(); + Result digest_prompt( + LlamaRuntime& llama_runtime, + const std::unique_ptr& tokenizer, + const std::vector input_tokens); + Error gen_response( + LlamaRuntime& llama_runtime, + const std::unique_ptr& tokenizer, + const uint64_t input_token, + std::function token_callback); + Error inference( + LlamaRuntime& llama_runtime, + const std::unique_ptr& tokenizer, + const std::string& prompt, + std::function token_callback); + std::unique_ptr load_tokenizer(); + + + private: + // model + const torch::executor::LlamaModelOptions modeloptions_; + const torch::executor::LlamaModelPaths modelpaths_; + std::unique_ptr tokenizer_; + std::unique_ptr runtime_; +}; From 826d59dfa5e01df94cc42d5c2934432b61073c18 Mon Sep 17 00:00:00 2001 From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com> Date: Tue, 15 Oct 2024 09:19:07 -0700 Subject: [PATCH 02/22] Enable JNI with MTK Llama Runner core functions --- extension/android/jni/jni_layer_llama.cpp | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index 1049b9da308..e6b5807a086 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -113,6 +114,7 @@ class ExecuTorchLlamaJni int model_type_category_; std::unique_ptr runner_; std::unique_ptr multi_modal_runner_; + std::unique_ptr mtk_llama_runner_; public: constexpr static auto kJavaDescriptor = @@ -120,6 +122,7 @@ class ExecuTorchLlamaJni constexpr static int MODEL_TYPE_CATEGORY_LLM = 1; constexpr static int MODEL_TYPE_CATEGORY_MULTIMODAL = 2; + constexpr static int MODEL_TYPE_MEDIATEK_LLAMA = 3; static 
facebook::jni::local_ref initHybrid(
       facebook::jni::alias_ref,
 @@ -158,6 +161,11 @@ class ExecuTorchLlamaJni
           model_path->toStdString().c_str(),
           tokenizer_path->toStdString().c_str(),
           temperature);
+    } else if (model_type_category == MODEL_TYPE_MEDIATEK_LLAMA) {
+      mtk_llama_runner_ = std::make_unique(
+          model_path->toStdString().c_str(),
+          tokenizer_path->toStdString().c_str(),
+          temperature);
     }
   }
 
 @@ -197,6 +205,12 @@ class ExecuTorchLlamaJni
           [callback](std::string result) { callback->onResult(result); },
           [callback](const llm::Stats& result) { callback->onStats(result); },
           echo);
+    } else if (model_type_category_ == MODEL_TYPE_MEDIATEK_LLAMA) {
+      mtk_llama_runner_->generate(
+          prompt->toStdString(),
+          seq_len,
+          [callback](std::string result) { callback->onResult(result); },
+          [callback](const Stats& result) { callback->onStats(result); });
     }
     return 0;
   }
 
 @@ -286,6 +300,8 @@ class ExecuTorchLlamaJni
       multi_modal_runner_->stop();
     } else if (model_type_category_ == MODEL_TYPE_CATEGORY_LLM) {
       runner_->stop();
+    } else if (model_type_category_ == MODEL_TYPE_MEDIATEK_LLAMA) {
+      mtk_llama_runner_->stop();
     }
   }
 
 @@ -294,6 +310,8 @@ class ExecuTorchLlamaJni
       return static_cast(multi_modal_runner_->load());
     } else if (model_type_category_ == MODEL_TYPE_CATEGORY_LLM) {
       return static_cast(runner_->load());
+    } else if (model_type_category_ == MODEL_TYPE_MEDIATEK_LLAMA) {
+      return static_cast(mtk_llama_runner_->load());
     }
     return static_cast(Error::InvalidArgument);
   }

From 38e88df3975c2072f00a98a394e8f61e9032ec7c Mon Sep 17 00:00:00 2001
From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com>
Date: Tue, 15 Oct 2024 09:29:49 -0700
Subject: [PATCH 03/22] CMake to include mtk target source

---
 extension/android/CMakeLists.txt | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt
index 8f0e67900c1..21c25e1c9bb 100644
--- a/extension/android/CMakeLists.txt
+++ b/extension/android/CMakeLists.txt
@@ -158,6 +158,26 @@ if(EXECUTORCH_BUILD_LLAMA_JNI)
     ${EXECUTORCH_ROOT}/examples/models/llama/runner
     ${CMAKE_CURRENT_BINARY_DIR}/../../examples/models/llama/runner
   )
+
+  target_sources(
+    executorch_jni PRIVATE
+    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/mtk_llama_runner.cpp
+    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp
+    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp
+    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp
+    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/MultiModelLoader.cpp
+    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/llm_helper/mask_builder.cpp
+    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/llm_helper/rotary_embedding.cpp
+    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/llm_helper/token_embedding.cpp
+  )
+  target_include_directories(
+    executorch_jni PRIVATE
+    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/
+    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner
+  )
+  ADD_LIBRARY(libneuron_buffer_allocator SHARED IMPORTED)
+  SET_PROPERTY(TARGET libneuron_buffer_allocator PROPERTY IMPORTED_LOCATION ${NEURON_BUFFER_ALLOCATOR_LIB}/libneuron_buffer_allocator.so)
+  list(APPEND link_libraries neuron_backend libneuron_buffer_allocator)
 endif()
 
 target_include_directories(
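Patches 01-03 wire the MediaTek runner into the JNI build. Before the series moves on, here is a short editorial sketch, not part of the patches, of driving the new runner directly. The call shapes come from mtk_llama_runner.h in PATCH 01; the placeholder paths are assumptions, since at this point the runner ignores its constructor arguments and self-loads the paths hard-coded in llama_runner_values.h.

```cpp
// Hedged sketch: exercising MTKLlamaRunner the same way the JNI layer does.
// Assumes the PATCH 01 sources are built and the model/tokenizer files named
// in llama_runner_values.h are present on the device.
#include <string>

#include "mtk_llama_runner.h"

int main() {
  // Arguments are placeholders: the runner currently self-loads the
  // .pte/.bin/tokenizer paths compiled into llama_runner_values.h.
  MTKLlamaRunner runner("unused.pte", "unused.model", /*temperature=*/0.8f);

  if (runner.load() != Error::Ok) { // loads the tokenizer and prompt model
    return 1;
  }
  // Generated pieces stream through the callback (the runner prints them too).
  runner.generate(
      "What is the capital of France?",
      /*seq_len=*/128,
      [](const std::string& piece) { /* forward to UI or logs */ });
  runner.stop(); // releases the MTK Llama runtime
  return 0;
}
```

From 5f4e4a97a0eb12f56479d8d1628e9352785357ca Mon Sep 17 00:00:00 2001
From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com>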
Date: Wed, 16 Oct 2024 01:30:48 -0700 Subject: [PATCH 04/22] namespace changes to runner and jni layer --- .../llm_helper/include/llama_runner_values.h | 16 +++++++++-- .../executor_runner/mtk_llama_runner.cpp | 28 +++++++++++-------- .../executor_runner/mtk_llama_runner.h | 12 ++++++-- extension/android/CMakeLists.txt | 2 +- 4 files changed, 39 insertions(+), 19 deletions(-) diff --git a/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h b/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h index 98cd8ab394e..bef4335e8e5 100644 --- a/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h +++ b/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h @@ -1,7 +1,17 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Contains values that are used by the mtk_llama_runner.cpp + #pragma once -namespace torch::executor { - using llm_helper::LLMType; +namespace mtk::vars { + using example::llm_helper::LLMType; // Sizes const size_t PROMPT_TOKEN_BATCH_SIZE = 128; @@ -29,4 +39,4 @@ namespace torch::executor { // Comma-Separated Paths const std::string GEN_MODEL_PATHS="/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_0.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_1.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_2.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_3.pte,"; -} // namespace torch::executor +} // namespace mtk:vars diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.cpp b/examples/mediatek/executor_runner/mtk_llama_runner.cpp index ea882cbb2f5..695812eb308 100644 --- a/examples/mediatek/executor_runner/mtk_llama_runner.cpp +++ b/examples/mediatek/executor_runner/mtk_llama_runner.cpp @@ -73,9 +73,14 @@ static uint64_t MAX_RESPONSE = 50; // Maximum number of tokens to generate. static constexpr int8_t kAddBos = 1; static constexpr int8_t kAddEos = 0; -using namespace torch::executor; -using namespace torch::executor::llm_helper; -using torch::executor::utils::Timer; +using namespace example::llm_helper; +using example::utils::argmax; +using example::utils::split; +using example::utils::Timer; +using example::utils::to_string; +using namespace mtk::vars; + +namespace llm = ::executorch::extension::llm; MTKLlamaRunner::MTKLlamaRunner( const std::string& model_path, @@ -83,7 +88,7 @@ MTKLlamaRunner::MTKLlamaRunner( const float temperature) : modeloptions_(get_model_options()), modelpaths_(get_model_paths()) { - runtime_init(); + executorch::runtime::runtime_init(); ET_LOG( Info, "Creating MTK Llama runner. Current it will self-load .pte, .bin, and .so files. 
Initiated runtime_init()."); @@ -125,7 +130,7 @@ Error MTKLlamaRunner::generate( // Wrap the token_callback with print function std::function wrapped_callback = [token_callback](const std::string& piece) { - util::safe_printf(piece.c_str()); + llm::safe_printf(piece.c_str()); fflush(stdout); if (token_callback) { token_callback(piece); @@ -172,8 +177,8 @@ LlamaModelPaths MTKLlamaRunner::get_model_paths() { LlamaModelPaths model_paths = { .tokenizer_path = TOKENIZER_PATH, .token_embedding_path = TOKEN_EMBEDDING_PATH, - .prompt_model_paths = utils::split(PROMPT_MODEL_PATHS, ','), - .gen_model_paths = utils::split(GEN_MODEL_PATHS, ',')}; + .prompt_model_paths = split(PROMPT_MODEL_PATHS, ','), + .gen_model_paths = split(GEN_MODEL_PATHS, ',')}; ET_LOG(Info, "Completed get_model_paths"); return model_paths; } @@ -225,8 +230,7 @@ Result MTKLlamaRunner::digest_prompt( const auto vocab_size = tokenizer->vocab_size(); const auto logits_type = llama_runtime.GetModelOptions().model_output_type; - const auto first_output_token = - utils::argmax(logits_type, logits, vocab_size); + const auto first_output_token = argmax(logits_type, logits, vocab_size); return first_output_token; } @@ -273,7 +277,7 @@ Error MTKLlamaRunner::gen_response( timer_gen_token.End(); prev_token = output_token; - output_token = utils::argmax(logits_type, logits, vocab_size); + output_token = argmax(logits_type, logits, vocab_size); full_response_tokens.push_back(output_token); // Stop when output is EOS @@ -293,7 +297,7 @@ Error MTKLlamaRunner::gen_response( } std::cout << "\n\n[Generated Tokens]\n" - << utils::to_string(full_response_tokens) << std::endl; + << to_string(full_response_tokens) << std::endl; ET_LOG( Info, @@ -327,7 +331,7 @@ Error MTKLlamaRunner::inference( std::unique_ptr MTKLlamaRunner::load_tokenizer() { std::unique_ptr tokenizer; // Assumes that tokenizer type is Tiktoken - tokenizer = torch::executor::get_tiktoken_for_llama(); + tokenizer = example::get_tiktoken_for_llama(); tokenizer->load(modelpaths_.tokenizer_path); return tokenizer; } diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.h b/examples/mediatek/executor_runner/mtk_llama_runner.h index d9f85c20257..e79a3b02adb 100644 --- a/examples/mediatek/executor_runner/mtk_llama_runner.h +++ b/examples/mediatek/executor_runner/mtk_llama_runner.h @@ -22,9 +22,15 @@ #include "llama_runner/LlamaConfig.h" #include "llama_runner/LlamaRuntime.h" -using namespace torch::executor; using Stats = ::executorch::llm::Stats; +using example::LlamaModelOptions; +using example::LlamaModelPaths; +using example::LlamaRuntime; +using executorch::extension::llm::Tokenizer; +using executorch::runtime::Error; +using executorch::runtime::Result; + class MTKLlamaRunner { public: explicit MTKLlamaRunner( @@ -62,8 +68,8 @@ class MTKLlamaRunner { private: // model - const torch::executor::LlamaModelOptions modeloptions_; - const torch::executor::LlamaModelPaths modelpaths_; + const LlamaModelOptions modeloptions_; + const LlamaModelPaths modelpaths_; std::unique_ptr tokenizer_; std::unique_ptr runtime_; }; diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index 21c25e1c9bb..c7f61ff59be 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -176,7 +176,7 @@ if(EXECUTORCH_BUILD_LLAMA_JNI) ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner ) ADD_LIBRARY(libneuron_buffer_allocator SHARED IMPORTED) - SET_PROPERTY(TARGET libneuron_buffer_allocator PROPERTY IMPORTED_LOCATION 
${NEURON_BUFFER_ALLOCATOR_LIB}/libneuron_buffer_allocator.so) + SET_PROPERTY(TARGET libneuron_buffer_allocator PROPERTY IMPORTED_LOCATION ${NEURON_BUFFER_ALLOCATOR_LIB}) list(APPEND link_libraries neuron_backend libneuron_buffer_allocator) endif() From 26acfc29e41d6ec5064bb77ed506b3c22750e51c Mon Sep 17 00:00:00 2001 From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com> Date: Wed, 16 Oct 2024 11:38:04 -0700 Subject: [PATCH 05/22] lintrunner formatting --- .../llm_helper/include/llama_runner_values.h | 62 ++++++++++--------- .../executor_runner/mtk_llama_runner.cpp | 29 +++++---- .../executor_runner/mtk_llama_runner.h | 9 ++- extension/android/jni/jni_layer_llama.cpp | 8 +-- 4 files changed, 55 insertions(+), 53 deletions(-) diff --git a/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h b/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h index bef4335e8e5..098898f5c27 100644 --- a/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h +++ b/examples/mediatek/executor_runner/llama_runner/llm_helper/include/llama_runner_values.h @@ -11,32 +11,36 @@ #pragma once namespace mtk::vars { - using example::llm_helper::LLMType; - - // Sizes - const size_t PROMPT_TOKEN_BATCH_SIZE = 128; - const size_t CACHE_SIZE = 512; - const size_t HIDDEN_SIZE = 4096; - const size_t NUM_HEAD = 32; - const size_t NUM_LAYER = 32; - const size_t MAX_TOKEN_LENGTH = 8192; - const double ROT_EMB_BASE = 500000; - - // Types - const LLMType MODEL_INPUT_TYPE = LLMType::FP32; - const LLMType MODEL_OUTPUT_TYPE = LLMType::FP32; - const LLMType CACHE_TYPE = LLMType::FP32; - const LLMType MASK_TYPE = LLMType::FP32; - const LLMType ROT_EMB_TYPE = LLMType::FP32; - - // Paths - const std::string TOKENIZER_PATH="/data/local/tmp/et-mtk/llama3/tokenizer.model"; - const std::string TOKEN_EMBEDDING_PATH="/data/local/tmp/et-mtk/llama3/embedding_llama3-8B-instruct_fp32.bin"; - - // Comma-Separated Paths - const std::string PROMPT_MODEL_PATHS="/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_0.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_1.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_2.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_3.pte,"; - - // Comma-Separated Paths - const std::string GEN_MODEL_PATHS="/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_0.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_1.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_2.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_3.pte,"; - -} // namespace mtk:vars +using example::llm_helper::LLMType; + +// Sizes +const size_t PROMPT_TOKEN_BATCH_SIZE = 128; +const size_t CACHE_SIZE = 512; +const size_t HIDDEN_SIZE = 4096; +const size_t NUM_HEAD = 32; +const size_t NUM_LAYER = 32; +const size_t MAX_TOKEN_LENGTH = 8192; +const double ROT_EMB_BASE = 500000; + +// Types +const LLMType MODEL_INPUT_TYPE = LLMType::FP32; +const LLMType MODEL_OUTPUT_TYPE = LLMType::FP32; +const LLMType CACHE_TYPE = LLMType::FP32; +const LLMType MASK_TYPE = LLMType::FP32; +const LLMType ROT_EMB_TYPE = LLMType::FP32; + +// Paths +const std::string TOKENIZER_PATH = + "/data/local/tmp/et-mtk/llama3/tokenizer.model"; +const std::string TOKEN_EMBEDDING_PATH = + "/data/local/tmp/et-mtk/llama3/embedding_llama3-8B-instruct_fp32.bin"; + +// Comma-Separated Paths +const 
std::string PROMPT_MODEL_PATHS = + "/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_0.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_1.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_2.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_128t512c_3.pte,"; + +// Comma-Separated Paths +const std::string GEN_MODEL_PATHS = + "/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_0.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_1.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_2.pte,/data/local/tmp/et-mtk/llama3/llama3-8B-instruct_A16W4_4_chunks_1t512c_3.pte,"; + +} // namespace mtk::vars diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.cpp b/examples/mediatek/executor_runner/mtk_llama_runner.cpp index 695812eb308..824bd8f3c8f 100644 --- a/examples/mediatek/executor_runner/mtk_llama_runner.cpp +++ b/examples/mediatek/executor_runner/mtk_llama_runner.cpp @@ -44,8 +44,8 @@ * any receiver's applicable license agreements with MediaTek Inc. */ -#include "executorch/backends/mediatek/runtime/include/NeuronBufferAllocator.h" #include +#include "executorch/backends/mediatek/runtime/include/NeuronBufferAllocator.h" #include #include @@ -65,8 +65,8 @@ #include "llama_runner/ModelChunk.h" #include "llama_runner/Utils.h" -#include "llama_runner/llm_helper/include/llm_types.h" #include "llama_runner/llm_helper/include/llama_runner_values.h" +#include "llama_runner/llm_helper/include/llm_types.h" static uint64_t MAX_RESPONSE = 50; // Maximum number of tokens to generate. // Global BOS and EOS option for tokenization (encoding) @@ -83,15 +83,14 @@ using namespace mtk::vars; namespace llm = ::executorch::extension::llm; MTKLlamaRunner::MTKLlamaRunner( - const std::string& model_path, - const std::string& tokenizer_path, - const float temperature) - : modeloptions_(get_model_options()), - modelpaths_(get_model_paths()) { + const std::string& model_path, + const std::string& tokenizer_path, + const float temperature) + : modeloptions_(get_model_options()), modelpaths_(get_model_paths()) { executorch::runtime::runtime_init(); ET_LOG( - Info, - "Creating MTK Llama runner. Current it will self-load .pte, .bin, and .so files. Initiated runtime_init()."); + Info, + "Creating MTK Llama runner. Current it will self-load .pte, .bin, and .so files. 
Initiated runtime_init()."); } Error MTKLlamaRunner::load() { @@ -122,7 +121,6 @@ Error MTKLlamaRunner::generate( int32_t seq_len, std::function token_callback, std::function stats_callback) { - if (!is_loaded()) { ET_CHECK_OK_OR_RETURN_ERROR(load()); } @@ -137,9 +135,9 @@ Error MTKLlamaRunner::generate( } }; - ET_LOG(Info, "Starting inference from MTKLlamaRunner"); + ET_LOG(Info, "Starting inference from MTKLlamaRunner"); inference(*runtime_.get(), tokenizer_, prompt, wrapped_callback); - ET_LOG(Info, "Completed inference from MTKLlamaRunner"); + ET_LOG(Info, "Completed inference from MTKLlamaRunner"); return Error::Ok; } @@ -169,7 +167,7 @@ LlamaModelOptions MTKLlamaRunner::get_model_options() { .cache_type = CACHE_TYPE, .mask_type = MASK_TYPE, .rot_emb_type = ROT_EMB_TYPE}; - ET_LOG(Info, "Completed get_model_options"); + ET_LOG(Info, "Completed get_model_options"); return options; } @@ -179,7 +177,7 @@ LlamaModelPaths MTKLlamaRunner::get_model_paths() { .token_embedding_path = TOKEN_EMBEDDING_PATH, .prompt_model_paths = split(PROMPT_MODEL_PATHS, ','), .gen_model_paths = split(GEN_MODEL_PATHS, ',')}; - ET_LOG(Info, "Completed get_model_paths"); + ET_LOG(Info, "Completed get_model_paths"); return model_paths; } @@ -325,7 +323,8 @@ Error MTKLlamaRunner::inference( const auto first_output_token = prefill_res.get(); // run generation mode (decoding) - return gen_response(llama_runtime, tokenizer, first_output_token, token_callback); + return gen_response( + llama_runtime, tokenizer, first_output_token, token_callback); } std::unique_ptr MTKLlamaRunner::load_tokenizer() { diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.h b/examples/mediatek/executor_runner/mtk_llama_runner.h index e79a3b02adb..292a91fe873 100644 --- a/examples/mediatek/executor_runner/mtk_llama_runner.h +++ b/examples/mediatek/executor_runner/mtk_llama_runner.h @@ -11,14 +11,14 @@ #pragma once +#include +#include +#include +#include #include #include #include #include -#include -#include -#include -#include #include "llama_runner/LlamaConfig.h" #include "llama_runner/LlamaRuntime.h" @@ -65,7 +65,6 @@ class MTKLlamaRunner { std::function token_callback); std::unique_ptr load_tokenizer(); - private: // model const LlamaModelOptions modeloptions_; diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index e6b5807a086..db3dbd89f24 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -13,9 +13,9 @@ #include #include +#include #include #include -#include #include #include #include @@ -163,9 +163,9 @@ class ExecuTorchLlamaJni temperature); } else if (model_type_category == MODEL_TYPE_MEDIATEK_LLAMA) { mtk_llama_runner_ = std::make_unique( - model_path->toStdString().c_str(), - tokenizer_path->toStdString().c_str(), - temperature); + model_path->toStdString().c_str(), + tokenizer_path->toStdString().c_str(), + temperature); } } From 8a29b1a81fc7bd5fe21b7670124aad5bff308a63 Mon Sep 17 00:00:00 2001 From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com> Date: Thu, 17 Oct 2024 17:06:22 -0700 Subject: [PATCH 06/22] protect cmakelist for extension under NEURON_BUFFER_ALLOCATOR_LIB flag --- extension/android/CMakeLists.txt | 40 +++++++++++++++++--------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index c7f61ff59be..9dd155db00f 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -159,25 
+159,27 @@ if(EXECUTORCH_BUILD_LLAMA_JNI)
     ${CMAKE_CURRENT_BINARY_DIR}/../../examples/models/llama/runner
   )
 
-  target_sources(
-    executorch_jni PRIVATE
-    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/mtk_llama_runner.cpp
-    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp
-    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp
-    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp
-    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/MultiModelLoader.cpp
-    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/llm_helper/mask_builder.cpp
-    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/llm_helper/rotary_embedding.cpp
-    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/llm_helper/token_embedding.cpp
-  )
-  target_include_directories(
-    executorch_jni PRIVATE
-    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/
-    ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner
-  )
-  ADD_LIBRARY(libneuron_buffer_allocator SHARED IMPORTED)
-  SET_PROPERTY(TARGET libneuron_buffer_allocator PROPERTY IMPORTED_LOCATION ${NEURON_BUFFER_ALLOCATOR_LIB})
-  list(APPEND link_libraries neuron_backend libneuron_buffer_allocator)
+  if(NEURON_BUFFER_ALLOCATOR_LIB)
+    target_sources(
+      executorch_jni PRIVATE
+      ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/mtk_llama_runner.cpp
+      ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp
+      ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/LlamaRuntime.cpp
+      ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/ModelChunk.cpp
+      ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/MultiModelLoader.cpp
+      ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/llm_helper/mask_builder.cpp
+      ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/llm_helper/rotary_embedding.cpp
+      ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner/llm_helper/token_embedding.cpp
+    )
+    target_include_directories(
+      executorch_jni PRIVATE
+      ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/
+      ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner
+    )
+    ADD_LIBRARY(libneuron_buffer_allocator SHARED IMPORTED)
+    SET_PROPERTY(TARGET libneuron_buffer_allocator PROPERTY IMPORTED_LOCATION ${NEURON_BUFFER_ALLOCATOR_LIB})
+    list(APPEND link_libraries neuron_backend libneuron_buffer_allocator)
+  endif()
 endif()
 
 target_include_directories(

From 66f550aff1150ff7e04c27cf86123bbc6342eb76 Mon Sep 17 00:00:00 2001
From: Hansong Zhang
Date: Thu, 17 Oct 2024 20:54:00 -0700
Subject: [PATCH 07/22] llama2 -> llama

---
 examples/mediatek/executor_runner/mtk_llama_runner.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.h b/examples/mediatek/executor_runner/mtk_llama_runner.h
index 292a91fe873..2123818f09b 100644
--- a/examples/mediatek/executor_runner/mtk_llama_runner.h
+++ b/examples/mediatek/executor_runner/mtk_llama_runner.h
@@ -11,7 +11,7 @@
 
 #pragma once
 
-#include
+#include
 #include
 #include
 #include
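PATCH 08 below puts MTKLlamaRunner behind ExecuTorch's common LLM runner interface, so the JNI layer can hold every text-generation backend through a single pointer. A hedged sketch of the pattern this enables, not taken from the patches; it uses the RunnerInterface name this patch introduces, which PATCH 09 later renames to IRunner:

```cpp
// Editorial sketch: one factory returning the shared interface type.
// Assumes the PATCH 08 state of mtk_llama_runner.h, which pulls in the
// executorch::extension::llm::RunnerInterface base class.
#include <memory>
#include <string>

#include "mtk_llama_runner.h"

std::unique_ptr<executorch::extension::llm::RunnerInterface> make_runner(
    const std::string& model_path,
    const std::string& tokenizer_path,
    float temperature) {
  // The JNI layer keeps a single runner_ field for both the XNNPACK and the
  // MediaTek backends; only the construction site differs.
  return std::make_unique<MTKLlamaRunner>(
      model_path, tokenizer_path, temperature);
}
```

From 03c12c840f013d4710097cb5c5ca9f7f7238b271 Mon Sep 17 00:00:00 2001
From: Hansong Zhang
Date: Fri, 18 Oct 2024 12:34:19 -0700
Subject: [PATCH 08/22] Use common LLM interface

---
 .../executor_runner/mtk_llama_runner.cpp | 4 ++-
 .../executor_runner/mtk_llama_runner.h | 8 ++++--
 extension/android/CMakeLists.txt | 1 +
 extension/android/jni/jni_layer_llama.cpp | 25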
++++++++----------- 4 files changed, 21 insertions(+), 17 deletions(-) diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.cpp b/examples/mediatek/executor_runner/mtk_llama_runner.cpp index 824bd8f3c8f..713e6679e49 100644 --- a/examples/mediatek/executor_runner/mtk_llama_runner.cpp +++ b/examples/mediatek/executor_runner/mtk_llama_runner.cpp @@ -120,7 +120,9 @@ Error MTKLlamaRunner::generate( const std::string& prompt, int32_t seq_len, std::function token_callback, - std::function stats_callback) { + std::function stats_callback + bool, + bool) { if (!is_loaded()) { ET_CHECK_OK_OR_RETURN_ERROR(load()); } diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.h b/examples/mediatek/executor_runner/mtk_llama_runner.h index 2123818f09b..8240a6a45c1 100644 --- a/examples/mediatek/executor_runner/mtk_llama_runner.h +++ b/examples/mediatek/executor_runner/mtk_llama_runner.h @@ -12,6 +12,7 @@ #pragma once #include +#include #include #include #include @@ -31,7 +32,8 @@ using executorch::extension::llm::Tokenizer; using executorch::runtime::Error; using executorch::runtime::Result; -class MTKLlamaRunner { +class MTKLlamaRunner + : public executorch::extension::llm::RunnerInterface { public: explicit MTKLlamaRunner( const std::string& model_path, @@ -44,7 +46,9 @@ class MTKLlamaRunner { const std::string& prompt, int32_t seq_len = 128, std::function token_callback = {}, - std::function stats_callback = {}); + std::function stats_callback = {}, + bool echo = true, + bool warming = false); void stop(); LlamaModelOptions get_model_options(); diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index 9dd155db00f..1fe2852c97d 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -179,6 +179,7 @@ if(EXECUTORCH_BUILD_LLAMA_JNI) ADD_LIBRARY(libneuron_buffer_allocator SHARED IMPORTED) SET_PROPERTY(TARGET libneuron_buffer_allocator PROPERTY IMPORTED_LOCATION ${NEURON_BUFFER_ALLOCATOR_LIB}) list(APPEND link_libraries neuron_backend libneuron_buffer_allocator) + target_compile_definitions(executorch_jni PRIVATE EXECUTORCH_BUILD_MEDIATEK=1) endif() endif() diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index db3dbd89f24..54a2a5dba20 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -13,10 +13,10 @@ #include #include -#include #include #include #include +#include #include #include #include @@ -29,6 +29,10 @@ #include #include +#if defined(EXECUTORCH_BUILD_MEDIATEK) +#include +#endif + namespace llm = ::executorch::extension::llm; using ::executorch::runtime::Error; @@ -112,9 +116,8 @@ class ExecuTorchLlamaJni private: friend HybridBase; int model_type_category_; - std::unique_ptr runner_; + std::unique_ptr runner_; std::unique_ptr multi_modal_runner_; - std::unique_ptr mtk_llama_runner_; public: constexpr static auto kJavaDescriptor = @@ -161,11 +164,15 @@ class ExecuTorchLlamaJni model_path->toStdString().c_str(), tokenizer_path->toStdString().c_str(), temperature); +#if defined(EXECUTORCH_BUILD_MEDIATEK) } else if (model_type_category == MODEL_TYPE_MEDIATEK_LLAMA) { - mtk_llama_runner_ = std::make_unique( + runner_ = std::make_unique( model_path->toStdString().c_str(), tokenizer_path->toStdString().c_str(), temperature); + // Interpret the model type as LLM + model_type_category_ = MODEL_TYPE_CATEGORY_LLM; +#endif } } @@ -205,12 +212,6 @@ class ExecuTorchLlamaJni [callback](std::string result) { callback->onResult(result); 
}, [callback](const llm::Stats& result) { callback->onStats(result); }, echo); - } else if (model_type_category_ == MODEL_TYPE_MEDIATEK_LLAMA) { - mtk_llama_runner_->generate( - prompt->toStdString(), - seq_len, - [callback](std::string result) { callback->onResult(result); }, - [callback](const Stats& result) { callback->onStats(result); }); } return 0; } @@ -300,8 +301,6 @@ class ExecuTorchLlamaJni multi_modal_runner_->stop(); } else if (model_type_category_ == MODEL_TYPE_CATEGORY_LLM) { runner_->stop(); - } else if (model_type_category_ == MODEL_TYPE_MEDIATEK_LLAMA) { - mtk_llama_runner_->stop(); } } @@ -310,8 +309,6 @@ class ExecuTorchLlamaJni return static_cast(multi_modal_runner_->load()); } else if (model_type_category_ == MODEL_TYPE_CATEGORY_LLM) { return static_cast(runner_->load()); - } else if (model_type_category_ == MODEL_TYPE_MEDIATEK_LLAMA) { - return static_cast(mtk_llama_runner_->load()); } return static_cast(Error::InvalidArgument); } From 6bd7c295b568292bbb2868e3e0974d9936b3b504 Mon Sep 17 00:00:00 2001 From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com> Date: Mon, 21 Oct 2024 15:02:30 -0700 Subject: [PATCH 09/22] Add android-26 and rename runner_inferface to irunner --- build/build_android_llm_demo.sh | 2 ++ examples/mediatek/executor_runner/mtk_llama_runner.cpp | 6 +++--- examples/mediatek/executor_runner/mtk_llama_runner.h | 4 ++-- extension/android/jni/jni_layer_llama.cpp | 4 ++-- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index 4eb47c7d05a..4ad7c70c393 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -37,6 +37,7 @@ build_android_native_library() { cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \ -DANDROID_ABI="${ANDROID_ABI}" \ + -DANDROID_PLATFORM=android-26 \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_LOG_LEVEL=Info \ -DEXECUTORCH_BUILD_XNNPACK=ON \ @@ -65,6 +66,7 @@ build_android_native_library() { cmake extension/android \ -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \ -DANDROID_ABI="${ANDROID_ABI}" \ + -DANDROID_PLATFORM=android-26 \ -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_LOG_LEVEL=Info \ diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.cpp b/examples/mediatek/executor_runner/mtk_llama_runner.cpp index 713e6679e49..de22171d179 100644 --- a/examples/mediatek/executor_runner/mtk_llama_runner.cpp +++ b/examples/mediatek/executor_runner/mtk_llama_runner.cpp @@ -120,9 +120,9 @@ Error MTKLlamaRunner::generate( const std::string& prompt, int32_t seq_len, std::function token_callback, - std::function stats_callback - bool, - bool) { + std::function stats_callback, + bool echo, + bool warming) { if (!is_loaded()) { ET_CHECK_OK_OR_RETURN_ERROR(load()); } diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.h b/examples/mediatek/executor_runner/mtk_llama_runner.h index 8240a6a45c1..a5b9d017933 100644 --- a/examples/mediatek/executor_runner/mtk_llama_runner.h +++ b/examples/mediatek/executor_runner/mtk_llama_runner.h @@ -12,7 +12,7 @@ #pragma once #include -#include +#include #include #include #include @@ -33,7 +33,7 @@ using executorch::runtime::Error; using executorch::runtime::Result; class MTKLlamaRunner - : public executorch::extension::llm::RunnerInterface { + : public executorch::extension::llm::IRunner { public: explicit MTKLlamaRunner( const std::string& 
model_path,
 diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp
index 54a2a5dba20..a22e0b7e41f 100644
--- a/extension/android/jni/jni_layer_llama.cpp
+++ b/extension/android/jni/jni_layer_llama.cpp
@@ -16,7 +16,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
 #include
@@ -116,7 +116,7 @@ class ExecuTorchLlamaJni
  private:
   friend HybridBase;
   int model_type_category_;
-  std::unique_ptr runner_;
+  std::unique_ptr runner_;
   std::unique_ptr multi_modal_runner_;
 
  public:

From b2bca6e0416e9d401ae99a0632333d135937c36c Mon Sep 17 00:00:00 2001
From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com>
Date: Mon, 21 Oct 2024 16:00:06 -0700
Subject: [PATCH 10/22] lint fix

---
 examples/mediatek/executor_runner/mtk_llama_runner.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.h b/examples/mediatek/executor_runner/mtk_llama_runner.h
index a5b9d017933..4c7b35d1a88 100644
--- a/examples/mediatek/executor_runner/mtk_llama_runner.h
+++ b/examples/mediatek/executor_runner/mtk_llama_runner.h
@@ -32,8 +32,7 @@ using executorch::extension::llm::Tokenizer;
 using executorch::runtime::Error;
 using executorch::runtime::Result;
 
-class MTKLlamaRunner
-    : public executorch::extension::llm::IRunner {
+class MTKLlamaRunner : public executorch::extension::llm::IRunner {
  public:
   explicit MTKLlamaRunner(
       const std::string& model_path,

From a4ff6809b356ee2373f540a068892167b43abc16 Mon Sep 17 00:00:00 2001
From: Hansong Zhang
Date: Mon, 21 Oct 2024 16:00:18 -0700
Subject: [PATCH 11/22] linter

---
 extension/android/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt
index 1fe2852c97d..31f24b39793 100644
--- a/extension/android/CMakeLists.txt
+++ b/extension/android/CMakeLists.txt
@@ -176,8 +176,8 @@ if(EXECUTORCH_BUILD_LLAMA_JNI)
       ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/
       ${EXECUTORCH_ROOT}/examples/mediatek/executor_runner/llama_runner
     )
-    ADD_LIBRARY(libneuron_buffer_allocator SHARED IMPORTED)
-    SET_PROPERTY(TARGET libneuron_buffer_allocator PROPERTY IMPORTED_LOCATION ${NEURON_BUFFER_ALLOCATOR_LIB})
+    add_library(libneuron_buffer_allocator SHARED IMPORTED)
+    set_property(TARGET libneuron_buffer_allocator PROPERTY IMPORTED_LOCATION ${NEURON_BUFFER_ALLOCATOR_LIB})
     list(APPEND link_libraries neuron_backend libneuron_buffer_allocator)
     target_compile_definitions(executorch_jni PRIVATE EXECUTORCH_BUILD_MEDIATEK=1)
   endif()
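With PATCHes 08-11 in place, MTKLlamaRunner conforms to the shared llm::IRunner contract, including the echo and warming flags added for parity with the other runners. An editorial sketch of the full call shape follows; it is not from the patches, and the Stats alias follows mtk_llama_runner.h:

```cpp
// Hedged sketch: the IRunner-conforming generate() call available after
// PATCHes 08-10, mirroring how jni_layer_llama.cpp drives the runner.
#include <cstdio>
#include <string>

#include "mtk_llama_runner.h"

void generate_once(MTKLlamaRunner& runner, const std::string& prompt) {
  runner.generate(
      prompt,
      /*seq_len=*/128,
      [](const std::string& piece) { printf("%s", piece.c_str()); },
      [](const Stats& stats) { /* report token timings to the app */ },
      /*echo=*/true,      // echo the prompt in the output, per the llm runner convention
      /*warming=*/false); // not a warm-up run; accepted here for interface parity
}
```

From 9bba42d0daf39d6e0a702ce5c8a70e39fd21e107 Mon Sep 17 00:00:00 2001
From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com>
Date: Wed, 16 Oct 2024 01:27:09 -0700
Subject: [PATCH 12/22] MTK Android app changes

---
 .../executorchllamademo/MainActivity.java | 18 ++++++
 .../executorchllamademo/SettingsActivity.java | 62 +++++++++++++++++++
 .../executorchllamademo/SettingsFields.java | 11 +++-
 .../src/main/res/layout/activity_settings.xml | 45 +++++++++++++-
 4 files changed, 134 insertions(+), 2 deletions(-)

diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java
index e9560be2871..169ebb1e02a 100644
--- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java
+++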
b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java @@ -285,6 +285,7 @@ protected void onResume() { } boolean isUpdated = !mCurrentSettingsFields.equals(updatedSettingsFields); boolean isLoadModel = updatedSettingsFields.getIsLoadModel(); + setBackendMode(updatedSettingsFields.getBackendType()); if (isUpdated) { if (isLoadModel) { // If users change the model file, but not pressing loadModelButton, we won't load the new @@ -293,6 +294,7 @@ protected void onResume() { } else { askUserToSelectModel(); } + checkForClearChatHistory(updatedSettingsFields); // Update current to point to the latest mCurrentSettingsFields = new SettingsFields(updatedSettingsFields); @@ -302,6 +304,22 @@ protected void onResume() { } } + private void setBackendMode(BackendType backendType) { + if(backendType.equals(BackendType.XNNPACK)) { + setXNNPACKMode(); + } else if(backendType.equals(BackendType.MEDIATEK)) { + setMediaTekMode(); + } + } + + private void setXNNPACKMode() { + requireViewById(R.id.addMediaButton).setVisibility(View.VISIBLE); + } + + private void setMediaTekMode() { + requireViewById(R.id.addMediaButton).setVisibility(View.GONE); + } + private void checkForClearChatHistory(SettingsFields updatedSettingsFields) { if (updatedSettingsFields.getIsClearChatHistory()) { mMessageAdapter.clear(); diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java index 71f5ec4733a..5b7554264d2 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java @@ -14,11 +14,13 @@ import android.os.Bundle; import android.text.Editable; import android.text.TextWatcher; +import android.view.View; import android.widget.Button; import android.widget.EditText; import android.widget.ImageButton; import android.widget.TextView; import androidx.appcompat.app.AppCompatActivity; +import androidx.compose.foundation.BackgroundKt; import androidx.core.content.ContextCompat; import androidx.core.graphics.Insets; import androidx.core.view.ViewCompat; @@ -32,6 +34,7 @@ public class SettingsActivity extends AppCompatActivity { private String mModelFilePath = ""; private String mTokenizerFilePath = ""; + private TextView mBackendTextView; private TextView mModelTextView; private TextView mTokenizerTextView; private TextView mModelTypeTextView; @@ -41,6 +44,7 @@ public class SettingsActivity extends AppCompatActivity { private double mSetTemperature; private String mSystemPrompt; private String mUserPrompt; + private BackendType mBackendType; private ModelType mModelType; public SettingsFields mSettingsFields; @@ -68,9 +72,11 @@ protected void onCreate(Bundle savedInstanceState) { } private void setupSettings() { + mBackendTextView = requireViewById(R.id.backendTextView); mModelTextView = requireViewById(R.id.modelTextView); mTokenizerTextView = requireViewById(R.id.tokenizerTextView); mModelTypeTextView = requireViewById(R.id.modelTypeTextView); + ImageButton backendImageButton = requireViewById(R.id.backendImageButton); ImageButton modelImageButton = requireViewById(R.id.modelImageButton); ImageButton tokenizerImageButton = requireViewById(R.id.tokenizerImageButton); ImageButton modelTypeImageButton = 
requireViewById(R.id.modelTypeImageButton); @@ -79,6 +85,10 @@ private void setupSettings() { loadSettings(); // TODO: The two setOnClickListeners will be removed after file path issue is resolved + backendImageButton.setOnClickListener( + view -> { + setupBackendSelectorDialog(); + }); modelImageButton.setOnClickListener( view -> { setupModelSelectorDialog(); @@ -104,6 +114,12 @@ private void setupSettings() { if (mModelType != null) { mModelTypeTextView.setText(mModelType.toString()); } + mBackendType = mSettingsFields.getBackendType(); + ETLogging.getInstance().log("mBackendType from settings " + mBackendType); + if (mBackendType != null) { + mBackendTextView.setText(mBackendType.toString()); + setBackendSettingMode(); + } setupParameterSettings(); setupPromptSettings(); @@ -285,6 +301,29 @@ private void showInvalidPromptDialog() { .show(); } + private void setupBackendSelectorDialog() { + // Convert enum to list + List backendTypesList = new ArrayList<>(); + for (BackendType backendType : BackendType.values()) { + backendTypesList.add(backendType.toString()); + } + // Alert dialog builder takes in arr of string instead of list + String[] backendTypes = backendTypesList.toArray(new String[0]); + AlertDialog.Builder backendTypeBuilder = new AlertDialog.Builder(this); + backendTypeBuilder.setTitle("Select backend type"); + backendTypeBuilder.setSingleChoiceItems( + backendTypes, + -1, + (dialog, item) -> { + mBackendTextView.setText(backendTypes[item]); + mBackendType = BackendType.valueOf(backendTypes[item]); + setBackendSettingMode(); + dialog.dismiss(); + }); + + backendTypeBuilder.create().show(); + } + private void setupModelSelectorDialog() { String[] pteFiles = listLocalFile("/data/local/tmp/llama/", ".pte"); AlertDialog.Builder modelPathBuilder = new AlertDialog.Builder(this); @@ -370,6 +409,28 @@ private String getFilenameFromPath(String uriFilePath) { return ""; } + private void setBackendSettingMode() { + if(mBackendType.equals(BackendType.XNNPACK)) { + setXNNPACKSettingMode(); + } else if(mBackendType.equals(BackendType.MEDIATEK)) { + setMediaTekSettingMode(); + } + } + + private void setXNNPACKSettingMode() { + requireViewById(R.id.modelLayout).setVisibility(View.VISIBLE); + requireViewById(R.id.tokenizerLayout).setVisibility(View.VISIBLE); + requireViewById(R.id.parametersView).setVisibility(View.VISIBLE); + requireViewById(R.id.temperatureLayout).setVisibility(View.VISIBLE); + } + + private void setMediaTekSettingMode() { + requireViewById(R.id.modelLayout).setVisibility(View.GONE); + requireViewById(R.id.tokenizerLayout).setVisibility(View.GONE); + requireViewById(R.id.parametersView).setVisibility(View.GONE); + requireViewById(R.id.temperatureLayout).setVisibility(View.GONE); + } + private void loadSettings() { Gson gson = new Gson(); String settingsFieldsJSON = mDemoSharedPreferences.getSettings(); @@ -384,6 +445,7 @@ private void saveSettings() { mSettingsFields.saveParameters(mSetTemperature); mSettingsFields.savePrompts(mSystemPrompt, mUserPrompt); mSettingsFields.saveModelType(mModelType); + mSettingsFields.saveBackendType(mBackendType); mDemoSharedPreferences.addSettings(mSettingsFields); } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java index b71799981b2..a63c6112227 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java 
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java
@@ -30,6 +30,8 @@ public ModelType getModelType() {
     return modelType;
   }
 
+  public BackendType getBackendType(){ return backendType; }
+
   public String getUserPrompt() {
     return userPrompt;
   }
@@ -63,9 +65,11 @@ public boolean getIsLoadModel() {
   private boolean isClearChatHistory;
   private boolean isLoadModel;
   private ModelType modelType;
+  private BackendType backendType;
 
   public SettingsFields() {
     ModelType DEFAULT_MODEL = ModelType.LLAMA_3;
+    BackendType DEFAULT_BACKEND= BackendType.XNNPACK;
 
     modelFilePath = "";
     tokenizerFilePath = "";
@@ -75,6 +79,7 @@ public SettingsFields() {
     isClearChatHistory = false;
     isLoadModel = false;
     modelType = DEFAULT_MODEL;
+    backendType = DEFAULT_BACKEND;
   }
 
   public SettingsFields(SettingsFields settingsFields) {
@@ -86,6 +91,7 @@ public SettingsFields(SettingsFields settingsFields) {
     this.isClearChatHistory = settingsFields.getIsClearChatHistory();
     this.isLoadModel = settingsFields.getIsLoadModel();
     this.modelType = settingsFields.modelType;
+    this.backendType = settingsFields.backendType;
   }
 
   public void saveModelPath(String modelFilePath) {
@@ -100,6 +106,8 @@ public void saveModelType(ModelType modelType) {
     this.modelType = modelType;
   }
 
+  public void saveBackendType(BackendType backendType) { this.backendType = backendType;}
+
   public void saveParameters(Double temperature) {
     this.temperature = temperature;
   }
@@ -126,6 +134,7 @@ public boolean equals(SettingsFields anotherSettingsFields) {
         && userPrompt.equals(anotherSettingsFields.userPrompt)
         && isClearChatHistory == anotherSettingsFields.isClearChatHistory
         && isLoadModel == anotherSettingsFields.isLoadModel
-        && modelType == anotherSettingsFields.modelType;
+        && modelType == anotherSettingsFields.modelType
+        && backendType == anotherSettingsFields.backendType;
   }
 }
diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_settings.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_settings.xml
index 7d5c3b1b6df..0ec551ae364 100644
--- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_settings.xml
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_settings.xml
@@ -28,11 +28,51 @@
         android:translationY="5dp" />
+
+
+
+
+
+
+
+
+
+
Date: Wed, 16 Oct 2024 14:06:05 -0700
Subject: [PATCH 13/22] include libs in manifest and some modeltype
 placeholders (todos)
---
 .../app/src/main/AndroidManifest.xml          | 24 +++++++++++++++++++
 .../executorchllamademo/MainActivity.java     |  4 +++-
 .../executorchllamademo/ModelUtils.java       |  1 +
 3 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml
index 836c7f7f8be..7096a7d4e76 100644
--- a/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml
@@ -36,6 +36,30 @@
         android:name="libcdsprpc.so"
         android:required="false" />
+
+
+
+
+
+
+
+
+
+
+
+
Date: Thu, 17 Oct 2024 16:32:50 -0700
Subject: [PATCH 14/22] adding modelpath and tokenizerpath placeholder to load
 models
---
 .../com/example/executorchllamademo/SettingsActivity.java | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java
index 5b7554264d2..e40edc01951 100644
--- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java
@@ -422,6 +422,8 @@ private void setXNNPACKSettingMode() {
     requireViewById(R.id.tokenizerLayout).setVisibility(View.VISIBLE);
     requireViewById(R.id.parametersView).setVisibility(View.VISIBLE);
     requireViewById(R.id.temperatureLayout).setVisibility(View.VISIBLE);
+    mModelFilePath="";
+    mTokenizerFilePath="";
   }
 
   private void setMediaTekSettingMode() {
@@ -429,6 +431,8 @@ private void setMediaTekSettingMode() {
     requireViewById(R.id.tokenizerLayout).setVisibility(View.GONE);
     requireViewById(R.id.parametersView).setVisibility(View.GONE);
     requireViewById(R.id.temperatureLayout).setVisibility(View.GONE);
+    mModelFilePath="/in/mtk/llama/runner";
+    mTokenizerFilePath="/in/mtk/llama/runner";
   }
 
   private void loadSettings() {

From 4944842fe3457e001ff1eb40d9a213b5045c670b Mon Sep 17 00:00:00 2001
From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com>
Date: Mon, 21 Oct 2024 15:35:54 -0700
Subject: [PATCH 15/22] Add backend parameter to ModelUtil
---
 .../executorchllamademo/MainActivity.java     | 11 ++++---
 .../executorchllamademo/ModelUtils.java       | 31 +++++++++++++------
 2 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java
index eb8a019538c..18c1964ec50 100644
--- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java
@@ -125,8 +125,7 @@ private void setLocalModel(String modelPath, String tokenizerPath, float tempera
     long runStartTime = System.currentTimeMillis();
     mModule =
         new LlamaModule(
-            //ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType()),
-            3, //TODO: Modify this based on JNI change for how to select MTK backend
+            ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType(), mCurrentSettingsFields.getBackendType()),
             modelPath,
             tokenizerPath,
             temperature);
@@ -175,6 +174,10 @@ private void setLocalModel(String modelPath, String tokenizerPath, float tempera
             + modelPath
             + "\nTokenizer path: "
             + tokenizerPath
+            + "\nBackend: "
+            + mCurrentSettingsFields.getBackendType().toString()
+            + "\nModelType: "
+            + ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType(), mCurrentSettingsFields.getBackendType())
             + "\nTemperature: "
             + temperature
             + "\nModel loaded time: "
@@ -692,7 +695,7 @@ private void onModelRunStopped() {
           addSelectedImagesToChatThread(mSelectedImageUri);
           String finalPrompt;
           String rawPrompt = mEditTextMessage.getText().toString();
-          if (ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType())
+          if (ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType(), mCurrentSettingsFields.getBackendType())
              == ModelUtils.VISION_MODEL) {
             finalPrompt = mCurrentSettingsFields.getFormattedSystemAndUserPrompt(rawPrompt);
           } else {
@@ -725,7 +728,7 @@ public void run() {
                     }
                   });
               long generateStartTime = System.currentTimeMillis();
-              if (ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType())
+              if (ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType(), mCurrentSettingsFields.getBackendType())
                  == ModelUtils.VISION_MODEL) {
                 mModule.generateFromPos(
                     finalPrompt,
diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelUtils.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelUtils.java
index 0cccb70f5a0..29878c7e4de 100644
--- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelUtils.java
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelUtils.java
@@ -9,22 +9,33 @@
 package com.example.executorchllamademo;
 
 public class ModelUtils {
+  // XNNPACK or QNN
   static final int TEXT_MODEL = 1;
+
+  // XNNPACK
   static final int VISION_MODEL = 2;
   static final int VISION_MODEL_IMAGE_CHANNELS = 3;
-  //TODO: Make change here based on JNI change on how to indicate MTK backend
   static final int VISION_MODEL_SEQ_LEN = 768;
   static final int TEXT_MODEL_SEQ_LEN = 256;
 
-  public static int getModelCategory(ModelType modelType) {
-    switch (modelType) {
-      case LLAVA_1_5:
-        return VISION_MODEL;
-      case LLAMA_3:
-      case LLAMA_3_1:
-      case LLAMA_3_2:
-      default:
-        return TEXT_MODEL;
+  // MediaTek
+  static final int MEDIATEK_TEXT_MODEL = 3;
+
+  public static int getModelCategory(ModelType modelType, BackendType backendType) {
+    if (backendType.equals(BackendType.XNNPACK)) {
+      switch (modelType) {
+        case LLAVA_1_5:
+          return VISION_MODEL;
+        case LLAMA_3:
+        case LLAMA_3_1:
+        case LLAMA_3_2:
+        default:
+          return TEXT_MODEL;
+      }
+    } else if (backendType.equals(BackendType.MEDIATEK)) {
+      return MEDIATEK_TEXT_MODEL;
     }
+
+    return TEXT_MODEL; // default
   }
 }

From 09ef4f867732eaa8909295a5f6992e087f5170b1 Mon Sep 17 00:00:00 2001
From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com>
Date: Mon, 21 Oct 2024 15:58:41 -0700
Subject: [PATCH 16/22] lint
---
 extension/android/jni/jni_layer_llama.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp
index a22e0b7e41f..b9811d6d48b 100644
--- a/extension/android/jni/jni_layer_llama.cpp
+++ b/extension/android/jni/jni_layer_llama.cpp
@@ -13,6 +13,7 @@
 #include
 #include
+#include
 #include
 #include
 #include

From 7252d671550477af6d10481240c2c5d22c3eb154 Mon Sep 17 00:00:00 2001
From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com>
Date: Mon, 21 Oct 2024 16:51:51 -0700
Subject: [PATCH 17/22] rebase + lint fix
---
 .../executorchllamademo/MainActivity.java     | 17 +++++++----
 .../executorchllamademo/SettingsActivity.java | 29 +++++++++----------
 .../executorchllamademo/SettingsFields.java   | 10 +++++--
 3 files changed, 32 insertions(+), 24 deletions(-)

diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java
index 18c1964ec50..632ea80c8f6 100644
--- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java
@@ -125,7 +125,8 @@ private void setLocalModel(String modelPath, String tokenizerPath, float tempera
     long runStartTime = System.currentTimeMillis();
     mModule =
         new LlamaModule(
-            ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType(), mCurrentSettingsFields.getBackendType()),
+            ModelUtils.getModelCategory(
+                mCurrentSettingsFields.getModelType(), mCurrentSettingsFields.getBackendType()),
             modelPath,
             tokenizerPath,
             temperature);
@@ -177,7 +178,8 @@ private void setLocalModel(String modelPath, String tokenizerPath, float tempera
             + "\nBackend: "
             + mCurrentSettingsFields.getBackendType().toString()
             + "\nModelType: "
-            + ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType(), mCurrentSettingsFields.getBackendType())
+            + ModelUtils.getModelCategory(
+                mCurrentSettingsFields.getModelType(), mCurrentSettingsFields.getBackendType())
             + "\nTemperature: "
             + temperature
             + "\nModel loaded time: "
@@ -310,9 +312,9 @@ protected void onResume() {
   }
 
   private void setBackendMode(BackendType backendType) {
-    if(backendType.equals(BackendType.XNNPACK)) {
+    if (backendType.equals(BackendType.XNNPACK)) {
       setXNNPACKMode();
-    } else if(backendType.equals(BackendType.MEDIATEK)) {
+    } else if (backendType.equals(BackendType.MEDIATEK)) {
       setMediaTekMode();
     }
   }
@@ -695,7 +697,8 @@ private void onModelRunStopped() {
           addSelectedImagesToChatThread(mSelectedImageUri);
           String finalPrompt;
           String rawPrompt = mEditTextMessage.getText().toString();
-          if (ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType(), mCurrentSettingsFields.getBackendType())
+          if (ModelUtils.getModelCategory(
+                  mCurrentSettingsFields.getModelType(), mCurrentSettingsFields.getBackendType())
              == ModelUtils.VISION_MODEL) {
             finalPrompt = mCurrentSettingsFields.getFormattedSystemAndUserPrompt(rawPrompt);
           } else {
@@ -728,7 +731,9 @@ public void run() {
                     }
                   });
               long generateStartTime = System.currentTimeMillis();
-              if (ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType(), mCurrentSettingsFields.getBackendType())
+              if (ModelUtils.getModelCategory(
+                      mCurrentSettingsFields.getModelType(),
+                      mCurrentSettingsFields.getBackendType())
                  == ModelUtils.VISION_MODEL) {
                 mModule.generateFromPos(
                     finalPrompt,
diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java
index e40edc01951..fbea72a5876 100644
--- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java
@@ -20,7 +20,6 @@
 import android.widget.ImageButton;
 import android.widget.TextView;
 import androidx.appcompat.app.AppCompatActivity;
-import androidx.compose.foundation.BackgroundKt;
 import androidx.core.content.ContextCompat;
 import androidx.core.graphics.Insets;
 import androidx.core.view.ViewCompat;
@@ -312,14 +311,14 @@ private void setupBackendSelectorDialog() {
     AlertDialog.Builder backendTypeBuilder = new AlertDialog.Builder(this);
     backendTypeBuilder.setTitle("Select backend type");
     backendTypeBuilder.setSingleChoiceItems(
-      backendTypes,
-      -1,
-      (dialog, item) -> {
-        mBackendTextView.setText(backendTypes[item]);
-        mBackendType = BackendType.valueOf(backendTypes[item]);
-        setBackendSettingMode();
-        dialog.dismiss();
-      });
+        backendTypes,
+        -1,
+        (dialog, item) -> {
+          mBackendTextView.setText(backendTypes[item]);
+          mBackendType = BackendType.valueOf(backendTypes[item]);
+          setBackendSettingMode();
+          dialog.dismiss();
+        });
 
     backendTypeBuilder.create().show();
   }
@@ -410,9 +409,9 @@ private String getFilenameFromPath(String uriFilePath) {
   }
 
   private void setBackendSettingMode() {
-    if(mBackendType.equals(BackendType.XNNPACK)) {
+    if (mBackendType.equals(BackendType.XNNPACK)) {
       setXNNPACKSettingMode();
-    } else if(mBackendType.equals(BackendType.MEDIATEK)) {
+    } else if (mBackendType.equals(BackendType.MEDIATEK)) {
       setMediaTekSettingMode();
     }
   }
@@ -422,8 +421,8 @@ private void setXNNPACKSettingMode() {
     requireViewById(R.id.tokenizerLayout).setVisibility(View.VISIBLE);
     requireViewById(R.id.parametersView).setVisibility(View.VISIBLE);
     requireViewById(R.id.temperatureLayout).setVisibility(View.VISIBLE);
-    mModelFilePath="";
-    mTokenizerFilePath="";
+    mModelFilePath = "";
+    mTokenizerFilePath = "";
   }
 
   private void setMediaTekSettingMode() {
@@ -431,8 +430,8 @@ private void setMediaTekSettingMode() {
     requireViewById(R.id.tokenizerLayout).setVisibility(View.GONE);
     requireViewById(R.id.parametersView).setVisibility(View.GONE);
     requireViewById(R.id.temperatureLayout).setVisibility(View.GONE);
-    mModelFilePath="/in/mtk/llama/runner";
-    mTokenizerFilePath="/in/mtk/llama/runner";
+    mModelFilePath = "/in/mtk/llama/runner";
+    mTokenizerFilePath = "/in/mtk/llama/runner";
   }
 
   private void loadSettings() {
diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java
index a63c6112227..3adadf574da 100644
--- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java
@@ -30,7 +30,9 @@ public ModelType getModelType() {
     return modelType;
   }
 
-  public BackendType getBackendType(){ return backendType; }
+  public BackendType getBackendType() {
+    return backendType;
+  }
 
   public String getUserPrompt() {
     return userPrompt;
@@ -69,7 +71,7 @@ public boolean getIsLoadModel() {
 
   public SettingsFields() {
     ModelType DEFAULT_MODEL = ModelType.LLAMA_3;
-    BackendType DEFAULT_BACKEND= BackendType.XNNPACK;
+    BackendType DEFAULT_BACKEND = BackendType.XNNPACK;
 
     modelFilePath = "";
     tokenizerFilePath = "";
@@ -106,7 +108,9 @@ public void saveModelType(ModelType modelType) {
     this.modelType = modelType;
   }
 
-  public void saveBackendType(BackendType backendType) { this.backendType = backendType;}
+  public void saveBackendType(BackendType backendType) {
+    this.backendType = backendType;
+  }
 
   public void saveParameters(Double temperature) {
     this.temperature = temperature;

From 0e9ab6c8faafbdd7542ae00513f4ad61fe3eb37d Mon Sep 17 00:00:00 2001
From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com>
Date: Tue, 22 Oct 2024 12:10:31 -0700
Subject: [PATCH 18/22] Add backendtype.java
---
 .../java/com/example/executorchllamademo/BackendType.java | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/BackendType.java

diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/BackendType.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/BackendType.java
new file mode 100644
index 00000000000..639d7d4378c
--- /dev/null
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/BackendType.java
@@ -0,0 +1,6 @@
+package com.example.executorchllamademo;
+
+public enum BackendType {
+  XNNPACK,
+  MEDIATEK
+}
\ No newline at end of file

From b654008ab02de4bffa157b4d6ef78c94da3f9f3e Mon Sep 17 00:00:00 2001
From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com>
Date: Tue, 22 Oct 2024 12:18:01 -0700
Subject: [PATCH 19/22] lint fix
---
 .../main/java/com/example/executorchllamademo/BackendType.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/BackendType.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/BackendType.java
index 639d7d4378c..df326981bfb 100644
--- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/BackendType.java
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/BackendType.java
@@ -3,4 +3,4 @@
 public enum BackendType {
   XNNPACK,
   MEDIATEK
-}
\ No newline at end of file
+}

From 6b5a5108332539ad343f6bc36ff7fd8caa10df7c Mon Sep 17 00:00:00 2001
From: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com>
Date: Tue, 29 Oct 2024 11:16:43 -0700
Subject: [PATCH 20/22] completing all supported backend fields
---
 .../main/java/com/example/executorchllamademo/BackendType.java  | 1 +
 .../main/java/com/example/executorchllamademo/MainActivity.java | 2 +-
 .../java/com/example/executorchllamademo/SettingsActivity.java  | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/BackendType.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/BackendType.java
index df326981bfb..7c84799795f 100644
--- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/BackendType.java
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/BackendType.java
@@ -2,5 +2,6 @@
 
 public enum BackendType {
   XNNPACK,
+  QUALCOMM,
   MEDIATEK
 }
diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java
index 632ea80c8f6..7b88d16d708 100644
--- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java
@@ -312,7 +312,7 @@ protected void onResume() {
   }
 
   private void setBackendMode(BackendType backendType) {
-    if (backendType.equals(BackendType.XNNPACK)) {
+    if (backendType.equals(BackendType.XNNPACK) || backendType.equals(BackendType.QUALCOMM)) {
       setXNNPACKMode();
     } else if (backendType.equals(BackendType.MEDIATEK)) {
       setMediaTekMode();
diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java
index fbea72a5876..c922e32e761 100644
--- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java
@@ -409,7 +409,7 @@ private String getFilenameFromPath(String uriFilePath) {
   }
 
   private void setBackendSettingMode() {
-    if (mBackendType.equals(BackendType.XNNPACK)) {
+    if (mBackendType.equals(BackendType.XNNPACK) || mBackendType.equals(BackendType.QUALCOMM)) {
       setXNNPACKSettingMode();
     } else if (mBackendType.equals(BackendType.MEDIATEK)) {
       setMediaTekSettingMode();

From 0b128dd3812254909efa6cbb743c9f9a900314d7 Mon Sep 17 00:00:00 2001
From: Hansong Zhang
Date: Tue, 29 Oct 2024 13:46:37 -0700
Subject: [PATCH 21/22] Add to BUCK
---
 examples/demo-apps/android/LlamaDemo/app/src/main/BUCK | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/BUCK b/examples/demo-apps/android/LlamaDemo/app/src/main/BUCK
index 80315c4104b..afbd3697661 100644
--- a/examples/demo-apps/android/LlamaDemo/app/src/main/BUCK
+++ b/examples/demo-apps/android/LlamaDemo/app/src/main/BUCK
@@ -14,6 +14,7 @@ fb_android_library(
     name = "app_lib",
     srcs = [
         "java/com/example/executorchllamademo/AppLog.java",
+        "java/com/example/executorchllamademo/BackendType.java",
         "java/com/example/executorchllamademo/DemoSharedPreferences.java",
         "java/com/example/executorchllamademo/ETImage.java",
         "java/com/example/executorchllamademo/ETLogging.java",

From 85f3304d300898d3a75183e0999a6e15f9d6e138 Mon Sep 17 00:00:00 2001
From: Hansong Zhang
Date: Tue, 29 Oct 2024 13:47:21 -0700
Subject: [PATCH 22/22] Remove mtk part in jni
---
 extension/android/jni/jni_layer_llama.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp
index f44897c0f76..5c58c89ee91 100644
--- a/extension/android/jni/jni_layer_llama.cpp
+++ b/extension/android/jni/jni_layer_llama.cpp
@@ -13,7 +13,6 @@
 #include
 #include
-#include
 #include
 #include
 #include
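
Note on the net behavior of this series: backend selection now flows through the BackendType enum and ModelUtils.getModelCategory(ModelType, BackendType). On XNNPACK the original text/vision split is preserved; on MEDIATEK every model type maps to MEDIATEK_TEXT_MODEL (3), which the JNI layer uses to route to the MTK llama runner; QUALCOMM is not special-cased in ModelUtils and falls through to the TEXT_MODEL default. A minimal sketch of that mapping as a caller would see it -- the class and enum names come from the patches above, but the wrapper class and printed values are illustrative assumptions, not part of the series:

import com.example.executorchllamademo.BackendType;
import com.example.executorchllamademo.ModelType;
import com.example.executorchllamademo.ModelUtils;

public class BackendCategoryExample {
  public static void main(String[] args) {
    // XNNPACK keeps the per-model split: LLaVA resolves to the vision category.
    int vision = ModelUtils.getModelCategory(ModelType.LLAVA_1_5, BackendType.XNNPACK);

    // Llama variants on XNNPACK stay in the plain text category.
    int text = ModelUtils.getModelCategory(ModelType.LLAMA_3, BackendType.XNNPACK);

    // MediaTek ignores the model type and always selects the MTK runner category,
    // which the JNI layer interprets as "use the MTK llama runner".
    int mtk = ModelUtils.getModelCategory(ModelType.LLAMA_3_2, BackendType.MEDIATEK);

    System.out.println("vision=" + vision + ", text=" + text + ", mtk=" + mtk);
    // Expected output: vision=2, text=1, mtk=3
  }
}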