From 50ce29667f9ec50a4fd730fb0b76a28267b4b587 Mon Sep 17 00:00:00 2001 From: ningshanwutuobang Date: Sat, 3 Jun 2023 18:51:58 +0800 Subject: [PATCH 01/17] add interface for float input --- llama.cpp | 224 +++++++++++++++++++++++++++++++++++++++++++++--------- llama.h | 7 ++ 2 files changed, 197 insertions(+), 34 deletions(-) diff --git a/llama.cpp b/llama.cpp index 47b4c8dd7ffb2..fff90a1432bd3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1194,27 +1194,14 @@ static bool llama_model_load( } } -// evaluate the transformer -// -// - lctx: llama context -// - tokens: new batch of tokens to process -// - n_past: the context size so far -// - n_threads: number of threads to use -// -static bool llama_eval_internal( - llama_context & lctx, - const llama_token * tokens, +static bool llama_eval_internal_tensor( + llama_context& lctx, + ggml_context* ctx0, + ggml_tensor* inpL, const int n_tokens, const int n_past, - const int n_threads) { - - // enforce that the first token is BOS - if (n_past == 0 && tokens[0] != llama_token_bos()) { - fprintf(stderr, "%s: first token must be BOS\n", __func__); - return false; - } - - const int64_t t_start_us = ggml_time_us(); + const int n_threads, + const int64_t t_start_us) { const int N = n_tokens; @@ -1223,8 +1210,6 @@ static bool llama_eval_internal( const auto & kv_self = model.kv_self; - LLAMA_ASSERT(!!kv_self.ctx); - const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; const int n_ctx = hparams.n_ctx; @@ -1233,26 +1218,14 @@ static bool llama_eval_internal( const int n_rot = hparams.n_embd/hparams.n_head; auto & mem_per_token = lctx.mem_per_token; - auto & buf_compute = lctx.buf_compute; - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.addr, - /*.no_alloc =*/ false, - }; - - struct ggml_context * ctx0 = ggml_init(params); + LLAMA_ASSERT(!!kv_self.ctx); // for big prompts, if BLAS is enabled, it is better to use only one thread // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance ggml_cgraph gf = {}; gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 
1 : n_threads; - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_set_name(embd, "embd"); - memcpy(embd->data, tokens, N*ggml_element_size(embd)); - - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -1494,6 +1467,52 @@ static bool llama_eval_internal( return true; } + +// evaluate the transformer +// +// - lctx: llama context +// - tokens: new batch of tokens to process +// - n_past: the context size so far +// - n_threads: number of threads to use +// +static bool llama_eval_internal( + llama_context & lctx, + const llama_token * tokens, + const int n_tokens, + const int n_past, + const int n_threads) { + + // enforce that the first token is BOS + if (n_past == 0 && tokens[0] != llama_token_bos()) { + fprintf(stderr, "%s: first token must be BOS\n", __func__); + return false; + } + + const auto & model = lctx.model; + + const int64_t t_start_us = ggml_time_us(); + + const int N = n_tokens; + + auto & buf_compute = lctx.buf_compute; + + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute.size, + /*.mem_buffer =*/ buf_compute.addr, + /*.no_alloc =*/ false, + }; + + struct ggml_context * ctx0 = ggml_init(params); + + + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_set_name(embd, "embd"); + memcpy(embd->data, tokens, N*ggml_element_size(embd)); + + struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd); + return llama_eval_internal_tensor(lctx, ctx0, inpL, N, n_past, n_threads, t_start_us); +} + // // tokenizer // @@ -2214,6 +2233,97 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } + +ggml_tensor *quantize_float_tensor(ggml_context *ctx0, ggml_tensor* tensor, + llama_ftype ftype, int nthread) { + + ggml_type quantized_type; + switch (ftype) { + case LLAMA_FTYPE_MOSTLY_Q4_0: + quantized_type = GGML_TYPE_Q4_0; + break; + case LLAMA_FTYPE_MOSTLY_Q4_1: + quantized_type = GGML_TYPE_Q4_1; + break; + case LLAMA_FTYPE_MOSTLY_Q5_0: + quantized_type = GGML_TYPE_Q5_0; + break; + case LLAMA_FTYPE_MOSTLY_Q5_1: + quantized_type = GGML_TYPE_Q5_1; + break; + case LLAMA_FTYPE_MOSTLY_Q8_0: + quantized_type = GGML_TYPE_Q8_0; + break; + default: + throw format("invalid output file type %d\n", ftype); + }; + void *new_data; + size_t new_size; + llama_buffer work; + float *f32_data; + size_t nelements = tensor->ne[0] * tensor->ne[1]; + llama_buffer f32_conv_buf; + f32_data = (float *)tensor->data; + work.resize(nelements * 4); + new_data = work.addr; + std::vector hist_cur(1 << 4, 0); + std::vector workers; + std::mutex mutex; + enum ggml_type new_type = quantized_type; + + int chunk_size = 32 * 512; + const int nchunk = (nelements + chunk_size - 1) / chunk_size; + const int nthread_use = + nthread > 1 ? 
std::max(1, std::min(nthread, nchunk)) : 1; + if (nthread_use < 2) { + new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, + hist_cur.data()); + } else { + size_t counter = 0; + new_size = 0; + auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, + new_data, nelements, chunk_size]() { + std::vector local_hist; + size_t local_size = 0; + while (true) { + std::unique_lock lock(mutex); + size_t first = counter; + counter += chunk_size; + if (first >= nelements) { + if (!local_hist.empty()) { + for (int j = 0; j < int(local_hist.size()); ++j) { + hist_cur[j] += local_hist[j]; + } + new_size += local_size; + } + break; + } + lock.unlock(); + size_t last = std::min(nelements, first + chunk_size); + if (local_hist.empty()) { + local_hist.resize(hist_cur.size(), 0); + } + local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, + last - first, local_hist.data()); + } + }; + if ((int)workers.size() < nthread_use - 1) { + workers.resize(nthread_use - 1); + } + for (int it = 0; it < nthread_use - 1; ++it) { + workers[it] = std::thread(compute); + } + compute(); + for (int it = 0; it < nthread_use - 1; ++it) { + workers[it].join(); + } + } + ggml_tensor *ret = + ggml_new_tensor_2d(ctx0, new_type, tensor->ne[0], tensor->ne[1]); + memcpy(ret->data, new_data, new_size); + return ret; +} + // // interface implementation // @@ -2921,6 +3031,52 @@ int llama_eval( return 0; } +int llama_eval_float( + struct llama_context * ctx, + const float * input, + int n_tokens, + int n_past, + int n_threads) { + const auto & model = ctx->model; + + const int64_t t_start_us = ggml_time_us(); + + const int N = n_tokens; + + auto & buf_compute = ctx->buf_compute; + + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute.size, + /*.mem_buffer =*/ buf_compute.addr, + /*.no_alloc =*/ false, + }; + + struct ggml_context * ctx0 = ggml_init(params); + + + struct ggml_tensor *input_f = + ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, N * model.hparams.n_embd); + memcpy(input_f->data, input, + N * model.hparams.n_embd * ggml_element_size(input_f)); + struct ggml_tensor *inpL = + quantize_float_tensor(ctx0, input_f, model.hparams.ftype, n_threads); + + ; + if (!llama_eval_internal_tensor(*ctx, ctx0, inpL, N, n_past, n_threads, t_start_us)) { + fprintf(stderr, "%s: failed to eval\n", __func__); + return 1; + } + + // get a more accurate load time, upon first eval + // TODO: fix this + if (!ctx->has_evaluated_once) { + ctx->t_load_us = ggml_time_us() - ctx->t_start_us; + ctx->has_evaluated_once = true; + } + + return 0; +} + int llama_tokenize( struct llama_context * ctx, const char * text, diff --git a/llama.h b/llama.h index c6b0a2889f8de..3b984845c8d67 100644 --- a/llama.h +++ b/llama.h @@ -173,6 +173,13 @@ extern "C" { int n_past, int n_threads); + LLAMA_API int llama_eval_float( + struct llama_context * ctx, + const float * embds, + int n_tokens, + int n_past, + int n_threads); + // Convert the provided text into tokens. // The tokens pointer must be large enough to hold the resulting tokens. 
// Returns the number of tokens on success, no more than n_max_tokens From 5673a8de37b453ce1646a12bf3f0442956b68c02 Mon Sep 17 00:00:00 2001 From: ningshanwutuobang Date: Mon, 5 Jun 2023 21:39:35 +0800 Subject: [PATCH 02/17] fixed inpL shape and type --- llama.cpp | 101 ++---------------------------------------------------- 1 file changed, 3 insertions(+), 98 deletions(-) diff --git a/llama.cpp b/llama.cpp index fff90a1432bd3..c76b198123dae 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2234,95 +2234,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } -ggml_tensor *quantize_float_tensor(ggml_context *ctx0, ggml_tensor* tensor, - llama_ftype ftype, int nthread) { - - ggml_type quantized_type; - switch (ftype) { - case LLAMA_FTYPE_MOSTLY_Q4_0: - quantized_type = GGML_TYPE_Q4_0; - break; - case LLAMA_FTYPE_MOSTLY_Q4_1: - quantized_type = GGML_TYPE_Q4_1; - break; - case LLAMA_FTYPE_MOSTLY_Q5_0: - quantized_type = GGML_TYPE_Q5_0; - break; - case LLAMA_FTYPE_MOSTLY_Q5_1: - quantized_type = GGML_TYPE_Q5_1; - break; - case LLAMA_FTYPE_MOSTLY_Q8_0: - quantized_type = GGML_TYPE_Q8_0; - break; - default: - throw format("invalid output file type %d\n", ftype); - }; - void *new_data; - size_t new_size; - llama_buffer work; - float *f32_data; - size_t nelements = tensor->ne[0] * tensor->ne[1]; - llama_buffer f32_conv_buf; - f32_data = (float *)tensor->data; - work.resize(nelements * 4); - new_data = work.addr; - std::vector hist_cur(1 << 4, 0); - std::vector workers; - std::mutex mutex; - enum ggml_type new_type = quantized_type; - - int chunk_size = 32 * 512; - const int nchunk = (nelements + chunk_size - 1) / chunk_size; - const int nthread_use = - nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1; - if (nthread_use < 2) { - new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, - hist_cur.data()); - } else { - size_t counter = 0; - new_size = 0; - auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, - new_data, nelements, chunk_size]() { - std::vector local_hist; - size_t local_size = 0; - while (true) { - std::unique_lock lock(mutex); - size_t first = counter; - counter += chunk_size; - if (first >= nelements) { - if (!local_hist.empty()) { - for (int j = 0; j < int(local_hist.size()); ++j) { - hist_cur[j] += local_hist[j]; - } - new_size += local_size; - } - break; - } - lock.unlock(); - size_t last = std::min(nelements, first + chunk_size); - if (local_hist.empty()) { - local_hist.resize(hist_cur.size(), 0); - } - local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, - last - first, local_hist.data()); - } - }; - if ((int)workers.size() < nthread_use - 1) { - workers.resize(nthread_use - 1); - } - for (int it = 0; it < nthread_use - 1; ++it) { - workers[it] = std::thread(compute); - } - compute(); - for (int it = 0; it < nthread_use - 1; ++it) { - workers[it].join(); - } - } - ggml_tensor *ret = - ggml_new_tensor_2d(ctx0, new_type, tensor->ne[0], tensor->ne[1]); - memcpy(ret->data, new_data, new_size); - return ret; -} // // interface implementation @@ -3053,15 +2964,9 @@ int llama_eval_float( struct ggml_context * ctx0 = ggml_init(params); - - struct ggml_tensor *input_f = - ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, N * model.hparams.n_embd); - memcpy(input_f->data, input, - N * model.hparams.n_embd * ggml_element_size(input_f)); - struct ggml_tensor *inpL = - quantize_float_tensor(ctx0, input_f, model.hparams.ftype, n_threads); - - ; + struct ggml_tensor *inpL = + ggml_new_tensor_2d(ctx0, 
GGML_TYPE_F32, model.hparams.n_embd, N); + memcpy(inpL->data, input, N * model.hparams.n_embd * ggml_element_size(inpL)); if (!llama_eval_internal_tensor(*ctx, ctx0, inpL, N, n_past, n_threads, t_start_us)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; From 20d5eef8160d8e0d020bd4167f01b9be99bb19b2 Mon Sep 17 00:00:00 2001 From: ningshanwutuobang Date: Mon, 5 Jun 2023 22:32:36 +0800 Subject: [PATCH 03/17] add examples of input floats --- examples/embd_input/embd_input.h | 28 +++ examples/embd_input/embd_input_lib.cpp | 267 ++++++++++++++++++++++++ examples/embd_input/embd_input_test.cpp | 29 +++ 3 files changed, 324 insertions(+) create mode 100644 examples/embd_input/embd_input.h create mode 100644 examples/embd_input/embd_input_lib.cpp create mode 100644 examples/embd_input/embd_input_test.cpp diff --git a/examples/embd_input/embd_input.h b/examples/embd_input/embd_input.h new file mode 100644 index 0000000000000..f5deb52775044 --- /dev/null +++ b/examples/embd_input/embd_input.h @@ -0,0 +1,28 @@ +#ifndef _EMBD_INPUT_H_ +#define _EMBD_INPUT_H_ 1 + +#include "common.h" +#include "llama.h" +#include "build-info.h" + + +extern "C" { + +typedef struct MyModel { + llama_context* ctx; + gpt_params params; +} MyModel; + + +struct MyModel* create_mymodel(int argc, char ** argv); + +bool eval_float(void* model, float* input, int N); +bool eval_tokens(void* model, std::vector tokens); +bool eval_id(struct MyModel* mymodel, int id); +bool eval_string(struct MyModel* mymodel, const char* str); +const char* sampling(struct MyModel* mymodel); +llama_token sampling_id(struct MyModel* mymodel); + +} + +#endif diff --git a/examples/embd_input/embd_input_lib.cpp b/examples/embd_input/embd_input_lib.cpp new file mode 100644 index 0000000000000..a9edc120e4a28 --- /dev/null +++ b/examples/embd_input/embd_input_lib.cpp @@ -0,0 +1,267 @@ +// Defines sigaction on msys: +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include "embd_input.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) +#include +#include +#elif defined (_WIN32) +#define WIN32_LEAN_AND_MEAN +#define NOMINMAX +#include +#include +#endif + +static console_state con_st; +static llama_context ** g_ctx; + +static bool is_interacting = false; + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) +void sigint_handler(int signo) { + if (signo == SIGINT) { + if (!is_interacting) { + is_interacting=true; + } else { + console_cleanup(con_st); + printf("\n"); + llama_print_timings(*g_ctx); + _exit(130); + } + } +} +#endif + + +extern "C" { + +struct MyModel* create_mymodel(int argc, char ** argv) { + gpt_params params; + + if (gpt_params_parse(argc, argv, params) == false) { + return nullptr; + } + + + if (params.n_ctx > 2048) { + fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);" + "expect poor results\n", __func__, params.n_ctx); + } + + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + + if (params.seed < 0) { + params.seed = time(NULL); + } + + fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.random_prompt) { + params.prompt = gpt_random_prompt(rng); + } + + llama_init_backend(); + + llama_context * ctx; + g_ctx = &ctx; + + // load the model and apply lora adapter, if any + ctx = llama_init_from_gpt_params(params); + if 
(ctx == NULL) { + fprintf(stderr, "%s: error: unable to load model\n", __func__); + return nullptr; + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", + params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); + } + struct MyModel* ret= new MyModel(); + ret->ctx = ctx; + ret->params = params; + // printf("ctx: %d\n", ret->ctx); + return ret; +} + + +bool eval_float(void* model, float* input, int N){ + MyModel* mymodel = (MyModel* )model; + llama_context* ctx = mymodel->ctx; + gpt_params params = mymodel->params; + int n_emb = llama_n_embd(ctx); + int n_past = 0; + for (int i = 0; i < (int) N; i += params.n_batch) { + int n_eval = (int) N - i; + if (n_eval > params.n_batch) { + n_eval = params.n_batch; + } + if (llama_eval_float(ctx, (input+i*n_emb), n_eval, n_past, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return false; + } + n_past += n_eval; + } + return true; +} + + + + + +bool eval_tokens(void* model, std::vector tokens) { + MyModel* mymodel = (MyModel* )model; + // printf("model: %d\n", mymodel); + llama_context* ctx;// = mymodel->ctx; + // printf("ctx2: %d\n", ctx); + // printf("ctx2: %d\n", mymodel->ctx); + ctx = mymodel->ctx; + // printf("ctx2: %d\n", ctx); + gpt_params params = mymodel->params; + // printf("\n%d\n", params); + int n_past = 1; + for (int i = 0; i < (int) tokens.size(); i += params.n_batch) { + int n_eval = (int) tokens.size() - i; + if (n_eval > params.n_batch) { + n_eval = params.n_batch; + } + // printf("%d, %d, %d\n", i, n_eval, n_past); + if (llama_eval(ctx, &tokens[i], n_eval, n_past, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return false; + } + n_past += n_eval; + } + return true; +} + +bool eval_id(struct MyModel* mymodel, int id) { + // printf("%d\n", id); + std::vector tokens; + tokens.push_back(id); + // printf("%d\n", tokens.size()); + // printf("%d\n", tokens[0]); + return eval_tokens(mymodel, tokens); +} + + +bool eval_string(struct MyModel* mymodel,const char* str){ + // std::cout << "eval " << std::endl; + // printf("%s", str); + llama_context* ctx = mymodel->ctx; + std::string str2 = str; + // printf("%s", str2.c_str()); + std::cout << str2 << std::endl; + std::vector embd_inp = ::llama_tokenize(ctx, str2, true); + eval_tokens(mymodel, embd_inp); + return true; +} + + + + +llama_token sampling_id(struct MyModel* mymodel) { + llama_context* ctx = mymodel->ctx; + gpt_params params = mymodel->params; + // int n_ctx = llama_n_ctx(ctx); + + + // out of user input, sample next token + const float temp = params.temp; + const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; + const float top_p = params.top_p; + const float tfs_z = params.tfs_z; + const float typical_p = params.typical_p; + // const int32_t repeat_last_n = params.repeat_last_n < 0 ? 
n_ctx : params.repeat_last_n; + // const float repeat_penalty = params.repeat_penalty; + // const float alpha_presence = params.presence_penalty; + // const float alpha_frequency = params.frequency_penalty; + const int mirostat = params.mirostat; + const float mirostat_tau = params.mirostat_tau; + const float mirostat_eta = params.mirostat_eta; + // const bool penalize_nl = params.penalize_nl; + + llama_token id = 0; + + { + auto logits = llama_get_logits(ctx); + auto n_vocab = llama_n_vocab(ctx); + + // Apply params.logit_bias map + for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { + logits[it->first] += it->second; + } + + std::vector candidates; + candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + + // Apply penalties +// float nl_logit = logits[llama_token_nl()]; +// auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); +// llama_sample_repetition_penalty(ctx, &candidates_p, +// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, +// last_n_repeat, repeat_penalty); +// llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, +// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, +// last_n_repeat, alpha_frequency, alpha_presence); +// if (!penalize_nl) { +// logits[llama_token_nl()] = nl_logit; +// } + + if (temp <= 0) { + // Greedy sampling + id = llama_sample_token_greedy(ctx, &candidates_p); + } else { + if (mirostat == 1) { + static float mirostat_mu = 2.0f * mirostat_tau; + const int mirostat_m = 100; + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); + } else if (mirostat == 2) { + static float mirostat_mu = 2.0f * mirostat_tau; + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); + } else { + // Temperature sampling + llama_sample_top_k(ctx, &candidates_p, top_k, 1); + llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); + llama_sample_typical(ctx, &candidates_p, typical_p, 1); + llama_sample_top_p(ctx, &candidates_p, top_p, 1); + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token(ctx, &candidates_p); + } + } + + } + return id; +} + +const char* sampling(struct MyModel* mymodel) { + llama_context* ctx = mymodel->ctx; + int id = sampling_id(mymodel); + std::string ret = llama_token_to_str(ctx, id); + return ret.c_str(); +} + +} diff --git a/examples/embd_input/embd_input_test.cpp b/examples/embd_input/embd_input_test.cpp new file mode 100644 index 0000000000000..96ce130fde6b2 --- /dev/null +++ b/examples/embd_input/embd_input_test.cpp @@ -0,0 +1,29 @@ +#include "embd_input.h" +#include +#include + +int main(int argc, char** argv) { + + auto mymodel = create_mymodel(argc, argv); + int N = 10; + int n_embd = llama_n_embd(mymodel->ctx); + float* data = new float[N*n_embd]; + std::default_random_engine e; + std::uniform_real_distribution u(0,1); + for (int i=0;iparams.prompt.c_str()); + for (int i=0;i < 500; i++) { + int id = sampling_id(mymodel); + printf("%s", llama_token_to_str(mymodel->ctx, id)); + eval_id(mymodel, id); + } + printf("\n"); + return 0; +} From a91487093bfb4c13bf2aaafce8ab600f06c7cf4d Mon Sep 17 00:00:00 2001 From: 
ningshanwutuobang Date: Tue, 6 Jun 2023 22:06:51 +0800 Subject: [PATCH 04/17] add test example for embd input --- .gitignore | 1 + Makefile | 11 +++++- examples/embd_input/embd_input.h | 1 + examples/embd_input/embd_input.py | 47 +++++++++++++++++++++++++ examples/embd_input/embd_input_lib.cpp | 15 +++++--- examples/embd_input/embd_input_test.cpp | 10 +++--- 6 files changed, 75 insertions(+), 10 deletions(-) create mode 100644 examples/embd_input/embd_input.py diff --git a/.gitignore b/.gitignore index d231f3ff8ed36..88bf142b954c3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ *.o *.a +*.so .DS_Store .build/ .cache/ diff --git a/Makefile b/Makefile index 8e8d426c5d6bf..7685003c26ae9 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Define the default target now so that it is always the first target -BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot +BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot libembd_input.so embd_input_test ifdef LLAMA_BUILD_SERVER BUILD_TARGETS += server @@ -250,6 +250,15 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml. server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) +libembd_input.so: examples/embd_input/embd_input.h examples/embd_input/embd_input_lib.cpp examples/embd_input/embd_input_test.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) --shared $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) + + +embd_input_test: libembd_input.so examples/embd_input/embd_input_test.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.so,$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -Wl,-rpath=./ -lembd_input + + + build-info.h: $(wildcard .git/index) scripts/build-info.sh @sh scripts/build-info.sh > $@.tmp @if ! 
cmp -s $@.tmp $@; then \ diff --git a/examples/embd_input/embd_input.h b/examples/embd_input/embd_input.h index f5deb52775044..f45cee32dac4d 100644 --- a/examples/embd_input/embd_input.h +++ b/examples/embd_input/embd_input.h @@ -11,6 +11,7 @@ extern "C" { typedef struct MyModel { llama_context* ctx; gpt_params params; + int n_past = 0; } MyModel; diff --git a/examples/embd_input/embd_input.py b/examples/embd_input/embd_input.py new file mode 100644 index 0000000000000..6d1abf736e143 --- /dev/null +++ b/examples/embd_input/embd_input.py @@ -0,0 +1,47 @@ +import ctypes +from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int +import numpy as np + +libc = cdll.LoadLibrary("./libembd_input.so") +libc.sampling.restype=c_char_p +libc.create_mymodel.restype=c_void_p +libc.eval_string.argtypes=[c_void_p, c_char_p] +libc.sampling.argtypes=[c_void_p] +libc.eval_float.argtypes=[c_void_p, POINTER(c_float), c_int] + + +class MyModel: + def __init__(self, args): + argc = len(args) + c_str = [c_char_p(i.encode()) for i in args] + args_c = (c_char_p * argc)(*c_str) + self.model = c_void_p(libc.create_mymodel(argc, args_c)) + print("self.model", self.model) + + def eval_float(self, x): + libc.eval_float(self.model, x.astype(np.float32).ctypes.data_as(POINTER(c_float)), x.shape[0]) + + def eval_string(self, x): + libc.eval_string(self.model, x.encode()) # c_char_p(x.encode())) + + def eval_token(self, x): + libc.eval_id(self.model, x) + + def sampling(self): + s = libc.sampling(self.model) + return s + + +model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin"]) +print(model) +model.eval_string("""There is a better way to deal with the formula, """) +# model.eval_token(100) +x = np.random.random((10,5120))# , dtype=np.float32) +# print(x[0,0], x[0,1],x[1,0]) +model.eval_float(x) +print(libc) + +for i in range(100): + print(model.sampling().decode(), end="") + + diff --git a/examples/embd_input/embd_input_lib.cpp b/examples/embd_input/embd_input_lib.cpp index a9edc120e4a28..cb7e5d189e162 100644 --- a/examples/embd_input/embd_input_lib.cpp +++ b/examples/embd_input/embd_input_lib.cpp @@ -96,6 +96,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) { struct MyModel* ret= new MyModel(); ret->ctx = ctx; ret->params = params; + ret->n_past = 0; // printf("ctx: %d\n", ret->ctx); return ret; } @@ -106,11 +107,13 @@ bool eval_float(void* model, float* input, int N){ llama_context* ctx = mymodel->ctx; gpt_params params = mymodel->params; int n_emb = llama_n_embd(ctx); - int n_past = 0; - for (int i = 0; i < (int) N; i += params.n_batch) { + int n_past = mymodel->n_past; + // printf("%f,%f\n", *input, *(input+1)); + int n_batch = N; // params.n_batch; + for (int i = 0; i < (int) N; i += n_batch) { int n_eval = (int) N - i; - if (n_eval > params.n_batch) { - n_eval = params.n_batch; + if (n_eval > n_batch) { + n_eval = n_batch; } if (llama_eval_float(ctx, (input+i*n_emb), n_eval, n_past, params.n_threads)) { fprintf(stderr, "%s : failed to eval\n", __func__); @@ -118,6 +121,7 @@ bool eval_float(void* model, float* input, int N){ } n_past += n_eval; } + mymodel->n_past = n_past; return true; } @@ -135,7 +139,7 @@ bool eval_tokens(void* model, std::vector tokens) { // printf("ctx2: %d\n", ctx); gpt_params params = mymodel->params; // printf("\n%d\n", params); - int n_past = 1; + int n_past = mymodel->n_past; for (int i = 0; i < (int) tokens.size(); i += params.n_batch) { int n_eval = (int) tokens.size() - i; if (n_eval > params.n_batch) { @@ -148,6 +152,7 @@ bool 
eval_tokens(void* model, std::vector tokens) { } n_past += n_eval; } + mymodel->n_past = n_past; return true; } diff --git a/examples/embd_input/embd_input_test.cpp b/examples/embd_input/embd_input_test.cpp index 96ce130fde6b2..7cd094e352f66 100644 --- a/examples/embd_input/embd_input_test.cpp +++ b/examples/embd_input/embd_input_test.cpp @@ -14,14 +14,16 @@ int main(int argc, char** argv) { data[i] = u(e); } - eval_string(mymodel, "111"); - printf("eval float"); + eval_string(mymodel, "user: what is the color of the flag of UN?"); + // printf("eval float"); eval_float(mymodel, data, N); - printf("eval float end\n"); + eval_string(mymodel, "assistant:"); + // printf("eval float end\n"); eval_string(mymodel, mymodel->params.prompt.c_str()); - for (int i=0;i < 500; i++) { + for (int i=0;i < 50; i++) { int id = sampling_id(mymodel); printf("%s", llama_token_to_str(mymodel->ctx, id)); + fflush(stdout); eval_id(mymodel, id); } printf("\n"); From 9c6117cd8df3efcf1631a09b311cb1696c3b87a5 Mon Sep 17 00:00:00 2001 From: ningshanwutuobang Date: Tue, 6 Jun 2023 22:29:34 +0800 Subject: [PATCH 05/17] fixed sampling --- .gitignore | 1 + examples/embd_input/embd_input.py | 23 +++++++++++------------ examples/embd_input/embd_input_lib.cpp | 1 + examples/embd_input/embd_input_test.cpp | 6 +++--- 4 files changed, 16 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index 88bf142b954c3..35c77554ea8c0 100644 --- a/.gitignore +++ b/.gitignore @@ -34,6 +34,7 @@ models/* /benchmark-matmult /vdot /Pipfile +/embd_input_test build-info.h arm_neon.h diff --git a/examples/embd_input/embd_input.py b/examples/embd_input/embd_input.py index 6d1abf736e143..742bd60884094 100644 --- a/examples/embd_input/embd_input.py +++ b/examples/embd_input/embd_input.py @@ -16,7 +16,7 @@ def __init__(self, args): c_str = [c_char_p(i.encode()) for i in args] args_c = (c_char_p * argc)(*c_str) self.model = c_void_p(libc.create_mymodel(argc, args_c)) - print("self.model", self.model) +# print("self.model", self.model) def eval_float(self, x): libc.eval_float(self.model, x.astype(np.float32).ctypes.data_as(POINTER(c_float)), x.shape[0]) @@ -31,17 +31,16 @@ def sampling(self): s = libc.sampling(self.model) return s - -model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin"]) -print(model) -model.eval_string("""There is a better way to deal with the formula, """) +model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"]) +# print(model) +model.eval_string("""user: what is the color of the flag of UN?""") # model.eval_token(100) -x = np.random.random((10,5120))# , dtype=np.float32) -# print(x[0,0], x[0,1],x[1,0]) +x = np.random.random((10, 5120))# , dtype=np.float32) model.eval_float(x) -print(libc) - -for i in range(100): - print(model.sampling().decode(), end="") - +model.eval_string("""assistant:""") +# print(x[0,0], x[0,1],x[1,0]) +# model.eval_float(x) +# print(libc) +for i in range(50): + print(model.sampling().decode(), end="", flush=True) diff --git a/examples/embd_input/embd_input_lib.cpp b/examples/embd_input/embd_input_lib.cpp index cb7e5d189e162..5cbc81709049f 100644 --- a/examples/embd_input/embd_input_lib.cpp +++ b/examples/embd_input/embd_input_lib.cpp @@ -266,6 +266,7 @@ const char* sampling(struct MyModel* mymodel) { llama_context* ctx = mymodel->ctx; int id = sampling_id(mymodel); std::string ret = llama_token_to_str(ctx, id); + eval_id(mymodel, id); return ret.c_str(); } diff --git a/examples/embd_input/embd_input_test.cpp 
b/examples/embd_input/embd_input_test.cpp index 7cd094e352f66..3d86f03d7900d 100644 --- a/examples/embd_input/embd_input_test.cpp +++ b/examples/embd_input/embd_input_test.cpp @@ -21,10 +21,10 @@ int main(int argc, char** argv) { // printf("eval float end\n"); eval_string(mymodel, mymodel->params.prompt.c_str()); for (int i=0;i < 50; i++) { - int id = sampling_id(mymodel); - printf("%s", llama_token_to_str(mymodel->ctx, id)); + // int id = sampling_id(mymodel); + printf("%s", sampling(mymodel)); // llama_token_to_str(mymodel->ctx, id)); fflush(stdout); - eval_id(mymodel, id); + // eval_id(mymodel, id); } printf("\n"); return 0; From ba1f617d7d6d81cf97760489e1df8487ca5bc277 Mon Sep 17 00:00:00 2001 From: ningshanwutuobang Date: Tue, 6 Jun 2023 22:57:04 +0800 Subject: [PATCH 06/17] add free for context --- examples/embd_input/embd_input.h | 1 + examples/embd_input/embd_input.py | 2 ++ examples/embd_input/embd_input_lib.cpp | 7 +++++++ examples/embd_input/embd_input_test.cpp | 1 + 4 files changed, 11 insertions(+) diff --git a/examples/embd_input/embd_input.h b/examples/embd_input/embd_input.h index f45cee32dac4d..4fefabd425c76 100644 --- a/examples/embd_input/embd_input.h +++ b/examples/embd_input/embd_input.h @@ -23,6 +23,7 @@ bool eval_id(struct MyModel* mymodel, int id); bool eval_string(struct MyModel* mymodel, const char* str); const char* sampling(struct MyModel* mymodel); llama_token sampling_id(struct MyModel* mymodel); +void free_mymodel(struct MyModel* mymodel); } diff --git a/examples/embd_input/embd_input.py b/examples/embd_input/embd_input.py index 742bd60884094..d4831d46abe49 100644 --- a/examples/embd_input/embd_input.py +++ b/examples/embd_input/embd_input.py @@ -17,6 +17,8 @@ def __init__(self, args): args_c = (c_char_p * argc)(*c_str) self.model = c_void_p(libc.create_mymodel(argc, args_c)) # print("self.model", self.model) + def __del__(self): + libc.free_mymodel(self.model) def eval_float(self, x): libc.eval_float(self.model, x.astype(np.float32).ctypes.data_as(POINTER(c_float)), x.shape[0]) diff --git a/examples/embd_input/embd_input_lib.cpp b/examples/embd_input/embd_input_lib.cpp index 5cbc81709049f..7ad98dcb9e3d0 100644 --- a/examples/embd_input/embd_input_lib.cpp +++ b/examples/embd_input/embd_input_lib.cpp @@ -101,6 +101,13 @@ struct MyModel* create_mymodel(int argc, char ** argv) { return ret; } +void free_mymodel(struct MyModel* mymodel) { + llama_context* ctx = mymodel->ctx; + llama_print_timings(ctx); + llama_free(ctx); + delete mymodel; +} + bool eval_float(void* model, float* input, int N){ MyModel* mymodel = (MyModel* )model; diff --git a/examples/embd_input/embd_input_test.cpp b/examples/embd_input/embd_input_test.cpp index 3d86f03d7900d..94287e37fdd25 100644 --- a/examples/embd_input/embd_input_test.cpp +++ b/examples/embd_input/embd_input_test.cpp @@ -27,5 +27,6 @@ int main(int argc, char** argv) { // eval_id(mymodel, id); } printf("\n"); + free_mymodel(mymodel); return 0; } From 6ed4893391ec3135e59b8e7c726fd4847fbd1a18 Mon Sep 17 00:00:00 2001 From: ningshanwutuobang Date: Wed, 7 Jun 2023 23:44:54 +0800 Subject: [PATCH 07/17] fixed add end condition for generating --- examples/embd_input/embd_input.py | 30 ++++++++++++++----------- examples/embd_input/embd_input_test.cpp | 8 +++++-- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/examples/embd_input/embd_input.py b/examples/embd_input/embd_input.py index d4831d46abe49..ebce1bb4562d7 100644 --- a/examples/embd_input/embd_input.py +++ b/examples/embd_input/embd_input.py @@ -33,16 +33,20 @@ 
def sampling(self): s = libc.sampling(self.model) return s -model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"]) -# print(model) -model.eval_string("""user: what is the color of the flag of UN?""") -# model.eval_token(100) -x = np.random.random((10, 5120))# , dtype=np.float32) -model.eval_float(x) -model.eval_string("""assistant:""") -# print(x[0,0], x[0,1],x[1,0]) -# model.eval_float(x) -# print(libc) - -for i in range(50): - print(model.sampling().decode(), end="", flush=True) +if __name__ == "__main__": + model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"]) + # print(model) + model.eval_string("""user: what is the color of the flag of UN?""") + # model.eval_token(100) + x = np.random.random((10, 5120))# , dtype=np.float32) + model.eval_float(x) + model.eval_string("""assistant:""") + # print(x[0,0], x[0,1],x[1,0]) + # model.eval_float(x) + # print(libc) + + for i in range(500): + tmp = model.sampling().decode() + if tmp == "": + break + print(tmp, end="", flush=True) diff --git a/examples/embd_input/embd_input_test.cpp b/examples/embd_input/embd_input_test.cpp index 94287e37fdd25..d83febeb23371 100644 --- a/examples/embd_input/embd_input_test.cpp +++ b/examples/embd_input/embd_input_test.cpp @@ -1,6 +1,7 @@ #include "embd_input.h" #include #include +#include int main(int argc, char** argv) { @@ -20,9 +21,12 @@ int main(int argc, char** argv) { eval_string(mymodel, "assistant:"); // printf("eval float end\n"); eval_string(mymodel, mymodel->params.prompt.c_str()); - for (int i=0;i < 50; i++) { + const char* tmp; + for (int i=0;i < 500; i++) { // int id = sampling_id(mymodel); - printf("%s", sampling(mymodel)); // llama_token_to_str(mymodel->ctx, id)); + tmp = sampling(mymodel); + if (strlen(tmp) == 0) break; + printf("%s", tmp); // llama_token_to_str(mymodel->ctx, id)); fflush(stdout); // eval_id(mymodel, id); } From 8cea3ab9e56c3ddda4e502060daaaf35060cacee Mon Sep 17 00:00:00 2001 From: ningshanwutuobang Date: Thu, 8 Jun 2023 04:50:31 +0800 Subject: [PATCH 08/17] add examples for llava.py --- examples/embd_input/embd_input.py | 4 +-- examples/embd_input/llava.py | 46 +++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 2 deletions(-) create mode 100644 examples/embd_input/llava.py diff --git a/examples/embd_input/embd_input.py b/examples/embd_input/embd_input.py index ebce1bb4562d7..db5cd0fdb528e 100644 --- a/examples/embd_input/embd_input.py +++ b/examples/embd_input/embd_input.py @@ -21,7 +21,7 @@ def __del__(self): libc.free_mymodel(self.model) def eval_float(self, x): - libc.eval_float(self.model, x.astype(np.float32).ctypes.data_as(POINTER(c_float)), x.shape[0]) + libc.eval_float(self.model, x.astype(np.float32).ctypes.data_as(POINTER(c_float)), x.shape[1]) def eval_string(self, x): libc.eval_string(self.model, x.encode()) # c_char_p(x.encode())) @@ -38,7 +38,7 @@ def sampling(self): # print(model) model.eval_string("""user: what is the color of the flag of UN?""") # model.eval_token(100) - x = np.random.random((10, 5120))# , dtype=np.float32) + x = np.random.random((5120,10))# , dtype=np.float32) model.eval_float(x) model.eval_string("""assistant:""") # print(x[0,0], x[0,1],x[1,0]) diff --git a/examples/embd_input/llava.py b/examples/embd_input/llava.py new file mode 100644 index 0000000000000..914c7ef00b762 --- /dev/null +++ b/examples/embd_input/llava.py @@ -0,0 +1,46 @@ +import sys +import os +sys.path.insert(0, os.path.dirname(__file__)) +from embd_input import MyModel +import 
numpy as np +from torch import nn +import torch +from transformers import CLIPVisionModel, CLIPImageProcessor +from PIL import Image +vision_tower = "openai/clip-vit-large-patch14" + +class Llava: + def __init__(self): + self.image_processor = CLIPImageProcessor.from_pretrained(vision_tower) + self.vision_tower = CLIPVisionModel.from_pretrained(vision_tower) + self.mm_projector = nn.Linear(1024, 5120) + self.model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"]) + + def chat_with_image(self, image, question): + with torch.no_grad(): + embd_image = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] + image_forward_out = self.vision_tower(embd_image.unsqueeze(0), output_hidden_states=True) + select_hidden_state_layer = -2 + select_hidden_state = image_forward_out.hidden_states[select_hidden_state_layer] + image_feature = select_hidden_state[:, 1:] + embd_image = self.mm_projector(image_feature) + embd_image = embd_image.cpu().numpy() + self.model.eval_string("user: ") + # print(embd_image.shape) + self.model.eval_float(embd_image.T) + self.model.eval_string(question) + self.model.eval_string("\nassistant: ") + ret = "" + for _ in range(500): + tmp = self.model.sampling().decode() + if tmp == "": + break + ret += tmp + return ret + +a = Llava() +state = torch.load(os.path.dirname(__file__) + "/a.pth") +a.mm_projector.load_state_dict({"weight": state["model.mm_projector.weight"], "bias": state["model.mm_projector.bias"]}) +print(a.chat_with_image(Image.open("./media/llama1-logo.png").convert('RGB'), "what is the text in the picture?")) + + From 4f1aa3cc763f99f44421157716bf00a0d0de1ade Mon Sep 17 00:00:00 2001 From: ningshanwutuobang Date: Sat, 17 Jun 2023 16:41:37 +0800 Subject: [PATCH 09/17] add READMD for llava.py --- examples/embd_input/README.md | 20 +++++++++ examples/embd_input/embd_input_lib.cpp | 5 ++- examples/embd_input/embd_input_test.cpp | 2 +- examples/embd_input/llava.py | 58 +++++++++++++++++++------ 4 files changed, 70 insertions(+), 15 deletions(-) create mode 100644 examples/embd_input/README.md diff --git a/examples/embd_input/README.md b/examples/embd_input/README.md new file mode 100644 index 0000000000000..1d23d086d00e3 --- /dev/null +++ b/examples/embd_input/README.md @@ -0,0 +1,20 @@ +### Examples for input embedding directly + +## LLAVA example (llava.py) + +1. obtian llava model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/) +2. convert it to ggml format +3. 
llava_projection.pth is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin) + +``` +import torch + +bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin" +pth_path = "./examples/embd_input/llava_projection.pth" + +dic = torch.load(bin_path) +used_key = ["model.mm_projector.weight","model.mm_projector.bias"] +torch.save({k: dic[k] for k in used_key}, pth_path) +``` + + diff --git a/examples/embd_input/embd_input_lib.cpp b/examples/embd_input/embd_input_lib.cpp index 7ad98dcb9e3d0..bbdf6d645b17f 100644 --- a/examples/embd_input/embd_input_lib.cpp +++ b/examples/embd_input/embd_input_lib.cpp @@ -272,7 +272,10 @@ llama_token sampling_id(struct MyModel* mymodel) { const char* sampling(struct MyModel* mymodel) { llama_context* ctx = mymodel->ctx; int id = sampling_id(mymodel); - std::string ret = llama_token_to_str(ctx, id); + + std::string ret; + if (id == llama_token_eos()) ret = ""; + else ret = llama_token_to_str(ctx, id); eval_id(mymodel, id); return ret.c_str(); } diff --git a/examples/embd_input/embd_input_test.cpp b/examples/embd_input/embd_input_test.cpp index d83febeb23371..e14141497c35d 100644 --- a/examples/embd_input/embd_input_test.cpp +++ b/examples/embd_input/embd_input_test.cpp @@ -25,7 +25,7 @@ int main(int argc, char** argv) { for (int i=0;i < 500; i++) { // int id = sampling_id(mymodel); tmp = sampling(mymodel); - if (strlen(tmp) == 0) break; + if (strcmp(tmp, "")==0) break; printf("%s", tmp); // llama_token_to_str(mymodel->ctx, id)); fflush(stdout); // eval_id(mymodel, id); diff --git a/examples/embd_input/llava.py b/examples/embd_input/llava.py index 914c7ef00b762..8489f792795a8 100644 --- a/examples/embd_input/llava.py +++ b/examples/embd_input/llava.py @@ -7,40 +7,72 @@ import torch from transformers import CLIPVisionModel, CLIPImageProcessor from PIL import Image + +# model parameters from 'liuhaotian/LLaVA-13b-delta-v1-1' vision_tower = "openai/clip-vit-large-patch14" +select_hidden_state_layer = -2 +# (vision_config.image_size // vision_config.patch_size) ** 2 +image_token_len = (224//14)**2 class Llava: - def __init__(self): + def __init__(self, args): self.image_processor = CLIPImageProcessor.from_pretrained(vision_tower) self.vision_tower = CLIPVisionModel.from_pretrained(vision_tower) self.mm_projector = nn.Linear(1024, 5120) - self.model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"]) + self.model = MyModel(["main", *args]) + + def load_projection(self, path): + state = torch.load(path) + self.mm_projector.load_state_dict({ + "weight": state["model.mm_projector.weight"], + "bias": state["model.mm_projector.bias"]}) + + def chat(self, question): + self.model.eval_string("user: ") + self.model.eval_string(question) + self.model.eval_string("\nassistant: ") + return self.sampling() def chat_with_image(self, image, question): with torch.no_grad(): embd_image = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] image_forward_out = self.vision_tower(embd_image.unsqueeze(0), output_hidden_states=True) - select_hidden_state_layer = -2 select_hidden_state = image_forward_out.hidden_states[select_hidden_state_layer] image_feature = select_hidden_state[:, 1:] embd_image = self.mm_projector(image_feature) - embd_image = embd_image.cpu().numpy() + embd_image = embd_image.cpu().numpy()[0] self.model.eval_string("user: ") - # print(embd_image.shape) + self.model.eval_token(32003-2) # im_start 
self.model.eval_float(embd_image.T) + for i in range(image_token_len-embd_image.shape[0]): + self.model.eval_token(32003-3) # im_patch + self.model.eval_token(32003-1) # im_end self.model.eval_string(question) self.model.eval_string("\nassistant: ") - ret = "" + return self.sampling() + + def sampling(self): + ret = b"" for _ in range(500): - tmp = self.model.sampling().decode() - if tmp == "": + tmp = self.model.sampling() # .decode() + if tmp == b"": break ret += tmp - return ret + return ret.decode() + +if __name__=="__main__": + # model form liuhaotian/LLaVA-13b-delta-v1-1 + a = Llava(["--model", "./models/ggml-llava-13b-v1.1.bin", "-c", "2048"]) + # Extract from https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin. + # Also here can use pytorch_model-00003-of-00003.bin directly. + a.load_projection(os.path.join( + os.path.dirname(__file__) , + "llava_projetion.pth")) + respose = a.chat_with_image( + Image.open("./media/llama1-logo.png").convert('RGB'), + "what is the text in the picture?") + print(respose) + print(a.chat("what is the color of it?")) -a = Llava() -state = torch.load(os.path.dirname(__file__) + "/a.pth") -a.mm_projector.load_state_dict({"weight": state["model.mm_projector.weight"], "bias": state["model.mm_projector.bias"]}) -print(a.chat_with_image(Image.open("./media/llama1-logo.png").convert('RGB'), "what is the text in the picture?")) From 93c57a057175951182b2646ee2b842294fe8e6c5 Mon Sep 17 00:00:00 2001 From: ningshanwutuobang Date: Sat, 17 Jun 2023 16:43:36 +0800 Subject: [PATCH 10/17] add READMD for llava.py --- examples/embd_input/README.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/embd_input/README.md b/examples/embd_input/README.md index 1d23d086d00e3..56db072cf695d 100644 --- a/examples/embd_input/README.md +++ b/examples/embd_input/README.md @@ -3,8 +3,12 @@ ## LLAVA example (llava.py) 1. obtian llava model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/) -2. convert it to ggml format -3. llava_projection.pth is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin) +2. build `libembd_input.so` +``` +make +``` +3. convert it to ggml format +4. 
llava_projection.pth is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin) ``` import torch @@ -17,4 +21,3 @@ used_key = ["model.mm_projector.weight","model.mm_projector.bias"] torch.save({k: dic[k] for k in used_key}, pth_path) ``` - From 53dfbbf553e16264a41f71d23d166bc2f79e323b Mon Sep 17 00:00:00 2001 From: ningshanwutuobang Date: Tue, 20 Jun 2023 22:57:21 +0800 Subject: [PATCH 11/17] add example of PandaGPT --- convert-lora-to-ggml.py | 6 +- examples/embd_input/README.md | 36 +++++++++-- examples/embd_input/embd_input.py | 29 +++++++++ examples/embd_input/llava.py | 12 +--- examples/embd_input/panda_gpt.py | 100 ++++++++++++++++++++++++++++++ 5 files changed, 166 insertions(+), 17 deletions(-) create mode 100644 examples/embd_input/panda_gpt.py diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py index 9090e8d6dd55a..f43c836f577a6 100644 --- a/convert-lora-to-ggml.py +++ b/convert-lora-to-ggml.py @@ -113,6 +113,10 @@ def write_tensor_header( write_file_header(fout, params) for k, v in model.items(): + if k.endswith(".default.weight"): + k = k.replace(".default.weight", ".weight") + if k in ["llama_proj.weight", "llama_proj.bias"]: + continue if k.endswith("lora_A.weight"): if v.dtype != torch.float16 and v.dtype != torch.float32: v = v.float() @@ -120,7 +124,7 @@ def write_tensor_header( else: v = v.float() - t = v.numpy() + t = v.detach().numpy() tname = translate_tensor_name(k) print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB") write_tensor_header(fout, tname, t.shape, t.dtype) diff --git a/examples/embd_input/README.md b/examples/embd_input/README.md index 56db072cf695d..c180d541aa6c4 100644 --- a/examples/embd_input/README.md +++ b/examples/embd_input/README.md @@ -1,14 +1,17 @@ ### Examples for input embedding directly -## LLAVA example (llava.py) - -1. obtian llava model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/) -2. build `libembd_input.so` +## Requirement +build `libembd_input.so` +run the following comman in main dir (../../). ``` make ``` -3. convert it to ggml format -4. llava_projection.pth is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin) + +## LLAVA example (llava.py) + +1. obtian llava model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/) +2. convert it to ggml format +3. llava_projection.pth is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin) ``` import torch @@ -21,3 +24,24 @@ used_key = ["model.mm_projector.weight","model.mm_projector.bias"] torch.save({k: dic[k] for k in used_key}, pth_path) ``` +## PandaGPT example (panda_gpt.py) + +1. Obtian PandaGPT lora model. Rename the file to `adapter_model.bin`. Use [convert-lora-to-ggml.py](../../convert-lora-to-ggml.py) to convert it to ggml format. +The `adapter_config.json` is +``` +{ + "peft_type": "LORA", + "fan_in_fan_out": false, + "bias": null, + "modules_to_save": null, + "r": 32, + "lora_alpha": 32, + "lora_dropout": 0.1, + "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"] +} +``` +2. papare the `vicuna` v0 model. +3. obtain the [ImageBind](https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth) model. +4. Clone the PandaGPT source. +5. 
check the path of PandaGPT source, ImageBind model, lora model and vicuna model in panda_gpt.py. + diff --git a/examples/embd_input/embd_input.py b/examples/embd_input/embd_input.py index db5cd0fdb528e..ce057a89d504d 100644 --- a/examples/embd_input/embd_input.py +++ b/examples/embd_input/embd_input.py @@ -33,6 +33,35 @@ def sampling(self): s = libc.sampling(self.model) return s + def generate(self, end=""): + ret = b"" + end = end.encode() + for _ in range(500): + tmp = self.sampling() # .decode() + if (ret+tmp).endswith(end): + break + ret += tmp + return ret.decode() + + def stream_generate(self, end=""): + ret = b"" + end = end.encode() + head = b"" + for _ in range(500): + tmp = self.sampling() # .decode() + ret += tmp + try: + text = (head + tmp).decode() + print(text, end="") + head = b"" + except: + head += text + if ret.endswith(end): + break + print("") + return ret.decode() + + if __name__ == "__main__": model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"]) # print(model) diff --git a/examples/embd_input/llava.py b/examples/embd_input/llava.py index 8489f792795a8..a1efaddf654c4 100644 --- a/examples/embd_input/llava.py +++ b/examples/embd_input/llava.py @@ -31,7 +31,7 @@ def chat(self, question): self.model.eval_string("user: ") self.model.eval_string(question) self.model.eval_string("\nassistant: ") - return self.sampling() + return self.model.generate() def chat_with_image(self, image, question): with torch.no_grad(): @@ -49,16 +49,8 @@ def chat_with_image(self, image, question): self.model.eval_token(32003-1) # im_end self.model.eval_string(question) self.model.eval_string("\nassistant: ") - return self.sampling() + return self.model.generate() - def sampling(self): - ret = b"" - for _ in range(500): - tmp = self.model.sampling() # .decode() - if tmp == b"": - break - ret += tmp - return ret.decode() if __name__=="__main__": # model form liuhaotian/LLaVA-13b-delta-v1-1 diff --git a/examples/embd_input/panda_gpt.py b/examples/embd_input/panda_gpt.py new file mode 100644 index 0000000000000..b1199b95df7e5 --- /dev/null +++ b/examples/embd_input/panda_gpt.py @@ -0,0 +1,100 @@ +import sys +import os +sys.path.insert(0, os.path.dirname(__file__)) +from embd_input import MyModel +import numpy as np +from torch import nn +import torch + +# use PandaGPT path +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "PandaGPT","code","model")) +from ImageBind.models import imagebind_model +from ImageBind import data + +imagebind_ckpt_path = "./models/panda_gpt/" +ModalityType = imagebind_model.ModalityType +max_tgt_len = 400 + +class PandaGPT: + def __init__(self, args): + self.visual_encoder,_ = imagebind_model.imagebind_huge(pretrained=True, store_path=imagebind_ckpt_path) + self.visual_encoder.eval() + self.llama_proj = nn.Linear(1024, 5120) # self.visual_hidden_size, 5120) + self.max_tgt_len = max_tgt_len + self.model = MyModel(["main", *args]) + self.generated_text = "" + self.device = "cpu" + + def load_projection(self, path): + state = torch.load(path, map_location="cpu") + self.llama_proj.load_state_dict({ + "weight": state["llama_proj.weight"], + "bias": state["llama_proj.bias"]}) + + def chat(self, question): + if self.generated_text == "": + self.model.eval_string("###") + self.model.eval_string(" Human: ") + self.model.eval_string(question) + self.model.eval_string("\n### Assistant:") + ret = self.model.stream_generate(end="###") + self.generated_text += ret + return ret + + def chat_with_image(self, inputs, question): + if 
self.generated_text == "": + self.model.eval_string("###") + self.model.eval_string(" Human: ") + embds = self.extract_multimoal_feature(inputs) + for i in embds: + self.model.eval_float(i.T) + self.model.eval_string(" " + question + "\n### Assistant:") + ret = self.model.stream_generate(end="###") + self.generated_text += ret + return ret + + def extract_multimoal_feature(self, inputs): + features = [] + for key in ["image", "audio", "video", "thermal"]: + if key + "_paths" in inputs: + embeds = self.encode_data(key, inputs[key+"_paths"]) + features.append(embeds) + return features + + def encode_data(self, data_type, data_paths): + + type_map = { + "image": ModalityType.VISION, + "audio": ModalityType.AUDIO, + "video": ModalityType.VISION, + "thermal": ModalityType.THERMAL, + } + load_map = { + "image": data.load_and_transform_vision_data, + "audio": data.load_and_transform_audio_data, + "video": data.load_and_transform_video_data, + "thermal": data.load_and_transform_thermal_data + } + + load_function = load_map[data_type] + key = type_map[data_type] + + inputs = {key: load_function(data_paths, self.device)} + with torch.no_grad(): + embeddings = self.visual_encoder(inputs) + embeds = embeddings[key] + embeds = self.llama_proj(embeds).cpu().numpy() + return embeds + + +if __name__=="__main__": + # model form liuhaotian/LLaVA-13b-delta-v1-1 + a = PandaGPT(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048", "--lora", "./models/panda_gpt/ggml-adapter-model.bin","--temp", "0"]) + # Extract from https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin. + # Also here can use pytorch_model-00003-of-00003.bin directly. + a.load_projection("./models/panda_gpt/adapter_model.bin") + a.chat_with_image( + {"image_paths": ["./media/llama1-logo.png"]}, + "what is the text in the picture? 
'llama' or 'lambda'?") + a.chat("what is the color of it?") + From 9d866118c4169fecf052d9b541f963176cfacc91 Mon Sep 17 00:00:00 2001 From: ningshanwutuobang Date: Sun, 25 Jun 2023 01:21:54 +0800 Subject: [PATCH 12/17] refactor the interface and fixed the styles --- Makefile | 10 +- examples/{embd_input => embd-input}/README.md | 2 +- examples/embd-input/embd-input-lib.cpp | 218 ++++++++++++++ .../embd-input-test.cpp} | 15 +- .../embd_input.h => embd-input/embd-input.h} | 0 .../{embd_input => embd-input}/embd_input.py | 56 ++-- examples/{embd_input => embd-input}/llava.py | 8 +- .../{embd_input => embd-input}/panda_gpt.py | 36 ++- examples/embd_input/embd_input_lib.cpp | 283 ------------------ llama.cpp | 141 ++++----- llama.h | 4 +- 11 files changed, 333 insertions(+), 440 deletions(-) rename examples/{embd_input => embd-input}/README.md (98%) create mode 100644 examples/embd-input/embd-input-lib.cpp rename examples/{embd_input/embd_input_test.cpp => embd-input/embd-input-test.cpp} (71%) rename examples/{embd_input/embd_input.h => embd-input/embd-input.h} (100%) rename examples/{embd_input => embd-input}/embd_input.py (64%) rename examples/{embd_input => embd-input}/llava.py (94%) rename examples/{embd_input => embd-input}/panda_gpt.py (81%) delete mode 100644 examples/embd_input/embd_input_lib.cpp diff --git a/Makefile b/Makefile index 41fb6d7c76ca1..2fc4424471600 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Define the default target now so that it is always the first target -BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple libembd_input.so embd_input_test +BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple libembdinput.so embd-input-test ifdef LLAMA_BUILD_SERVER BUILD_TARGETS += server @@ -302,12 +302,12 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml. server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) -libembd_input.so: examples/embd_input/embd_input.h examples/embd_input/embd_input_lib.cpp examples/embd_input/embd_input_test.cpp build-info.h ggml.o llama.o common.o $(OBJS) - $(CXX) --shared $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) +libembdinput.so: examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) -embd_input_test: libembd_input.so examples/embd_input/embd_input_test.cpp build-info.h ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.so,$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -Wl,-rpath=./ -lembd_input +embd-input-test: libembdinput.so examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.so,$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. 
-lembdinput train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) diff --git a/examples/embd_input/README.md b/examples/embd-input/README.md similarity index 98% rename from examples/embd_input/README.md rename to examples/embd-input/README.md index c180d541aa6c4..eb1095f24fe96 100644 --- a/examples/embd_input/README.md +++ b/examples/embd-input/README.md @@ -1,6 +1,6 @@ ### Examples for input embedding directly -## Requirement +## Requirement build `libembd_input.so` run the following comman in main dir (../../). ``` diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp new file mode 100644 index 0000000000000..37a5b52086ccf --- /dev/null +++ b/examples/embd-input/embd-input-lib.cpp @@ -0,0 +1,218 @@ +// Defines sigaction on msys: +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include "embd-input.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static llama_context ** g_ctx; + +extern "C" { + +struct MyModel* create_mymodel(int argc, char ** argv) { + gpt_params params; + + if (gpt_params_parse(argc, argv, params) == false) { + return nullptr; + } + + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + + if (params.seed < 0) { + params.seed = time(NULL); + } + fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); + + llama_init_backend(); + + llama_context * ctx; + g_ctx = &ctx; + + // load the model and apply lora adapter, if any + ctx = llama_init_from_gpt_params(params); + if (ctx == NULL) { + fprintf(stderr, "%s: error: unable to load model\n", __func__); + return nullptr; + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", + params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); + } + struct MyModel * ret = new MyModel(); + ret->ctx = ctx; + ret->params = params; + ret->n_past = 0; + // printf("ctx: %d\n", ret->ctx); + return ret; +} + +void free_mymodel(struct MyModel * mymodel) { + llama_context * ctx = mymodel->ctx; + llama_print_timings(ctx); + llama_free(ctx); + delete mymodel; +} + + +bool eval_float(void * model, float * input, int N){ + MyModel * mymodel = (MyModel*)model; + llama_context * ctx = mymodel->ctx; + gpt_params params = mymodel->params; + int n_emb = llama_n_embd(ctx); + int n_past = mymodel->n_past; + int n_batch = N; // params.n_batch; + + for (int i = 0; i < (int) N; i += n_batch) { + int n_eval = (int) N - i; + if (n_eval > n_batch) { + n_eval = n_batch; + } + if (llama_eval_embd(ctx, (input+i*n_emb), n_eval, n_past, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return false; + } + n_past += n_eval; + } + mymodel->n_past = n_past; + return true; +} + +bool eval_tokens(void * model, std::vector tokens) { + MyModel * mymodel = (MyModel* )model; + llama_context * ctx; + ctx = mymodel->ctx; + gpt_params params = mymodel->params; + int n_past = mymodel->n_past; + for (int i = 0; i < (int) tokens.size(); i += params.n_batch) { + int n_eval = (int) tokens.size() - i; + if (n_eval > params.n_batch) { + n_eval = params.n_batch; + } + if (llama_eval(ctx, &tokens[i], n_eval, n_past, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return false; + } + n_past += n_eval; + } + mymodel->n_past = n_past; + return true; +} + +bool eval_id(struct MyModel* 
mymodel, int id) { + std::vector tokens; + tokens.push_back(id); + return eval_tokens(mymodel, tokens); +} + +bool eval_string(struct MyModel * mymodel,const char* str){ + llama_context * ctx = mymodel->ctx; + std::string str2 = str; + std::vector embd_inp = ::llama_tokenize(ctx, str2, true); + eval_tokens(mymodel, embd_inp); + return true; +} + +llama_token sampling_id(struct MyModel* mymodel) { + llama_context* ctx = mymodel->ctx; + gpt_params params = mymodel->params; + // int n_ctx = llama_n_ctx(ctx); + + // out of user input, sample next token + const float temp = params.temp; + const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; + const float top_p = params.top_p; + const float tfs_z = params.tfs_z; + const float typical_p = params.typical_p; + // const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n; + // const float repeat_penalty = params.repeat_penalty; + // const float alpha_presence = params.presence_penalty; + // const float alpha_frequency = params.frequency_penalty; + const int mirostat = params.mirostat; + const float mirostat_tau = params.mirostat_tau; + const float mirostat_eta = params.mirostat_eta; + // const bool penalize_nl = params.penalize_nl; + + llama_token id = 0; + { + auto logits = llama_get_logits(ctx); + auto n_vocab = llama_n_vocab(ctx); + + // Apply params.logit_bias map + for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { + logits[it->first] += it->second; + } + + std::vector candidates; + candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + + // TODO: Apply penalties + // float nl_logit = logits[llama_token_nl()]; + // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); + // llama_sample_repetition_penalty(ctx, &candidates_p, + // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + // last_n_repeat, repeat_penalty); + // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, + // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + // last_n_repeat, alpha_frequency, alpha_presence); + // if (!penalize_nl) { + // logits[llama_token_nl()] = nl_logit; + // } + + if (temp <= 0) { + // Greedy sampling + id = llama_sample_token_greedy(ctx, &candidates_p); + } else { + if (mirostat == 1) { + static float mirostat_mu = 2.0f * mirostat_tau; + const int mirostat_m = 100; + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); + } else if (mirostat == 2) { + static float mirostat_mu = 2.0f * mirostat_tau; + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); + } else { + // Temperature sampling + llama_sample_top_k(ctx, &candidates_p, top_k, 1); + llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); + llama_sample_typical(ctx, &candidates_p, typical_p, 1); + llama_sample_top_p(ctx, &candidates_p, top_p, 1); + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token(ctx, &candidates_p); + } + } + } + + return id; +} + +const char * sampling(struct MyModel * mymodel) { + llama_context * ctx = mymodel->ctx; + int id = sampling_id(mymodel); + std::string ret; + if (id == 
llama_token_eos()) ret = ""; + else ret = llama_token_to_str(ctx, id); + eval_id(mymodel, id); + return ret.c_str(); +} + +} diff --git a/examples/embd_input/embd_input_test.cpp b/examples/embd-input/embd-input-test.cpp similarity index 71% rename from examples/embd_input/embd_input_test.cpp rename to examples/embd-input/embd-input-test.cpp index e14141497c35d..e5e040f62a60a 100644 --- a/examples/embd_input/embd_input_test.cpp +++ b/examples/embd-input/embd-input-test.cpp @@ -1,4 +1,4 @@ -#include "embd_input.h" +#include "embd-input.h" #include #include #include @@ -7,8 +7,11 @@ int main(int argc, char** argv) { auto mymodel = create_mymodel(argc, argv); int N = 10; + int max_tgt_len = 500; int n_embd = llama_n_embd(mymodel->ctx); - float* data = new float[N*n_embd]; + + // add random float embd to test evaluation + float * data = new float[N*n_embd]; std::default_random_engine e; std::uniform_real_distribution u(0,1); for (int i=0;iparams.prompt.c_str()); const char* tmp; - for (int i=0;i < 500; i++) { - // int id = sampling_id(mymodel); + for (int i=0; i")==0) break; - printf("%s", tmp); // llama_token_to_str(mymodel->ctx, id)); + printf("%s", tmp); fflush(stdout); - // eval_id(mymodel, id); } printf("\n"); free_mymodel(mymodel); diff --git a/examples/embd_input/embd_input.h b/examples/embd-input/embd-input.h similarity index 100% rename from examples/embd_input/embd_input.h rename to examples/embd-input/embd-input.h diff --git a/examples/embd_input/embd_input.py b/examples/embd-input/embd_input.py similarity index 64% rename from examples/embd_input/embd_input.py rename to examples/embd-input/embd_input.py index ce057a89d504d..be2896614e9b3 100644 --- a/examples/embd_input/embd_input.py +++ b/examples/embd-input/embd_input.py @@ -1,8 +1,9 @@ import ctypes from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int import numpy as np +import os -libc = cdll.LoadLibrary("./libembd_input.so") +libc = cdll.LoadLibrary("./libembdinput.so") libc.sampling.restype=c_char_p libc.create_mymodel.restype=c_void_p libc.eval_string.argtypes=[c_void_p, c_char_p] @@ -16,7 +17,9 @@ def __init__(self, args): c_str = [c_char_p(i.encode()) for i in args] args_c = (c_char_p * argc)(*c_str) self.model = c_void_p(libc.create_mymodel(argc, args_c)) -# print("self.model", self.model) + self.max_tgt_len = 512 + self.print_string_eval = True + def __del__(self): libc.free_mymodel(self.model) @@ -25,6 +28,8 @@ def eval_float(self, x): def eval_string(self, x): libc.eval_string(self.model, x.encode()) # c_char_p(x.encode())) + if self.print_string_eval: + print(x) def eval_token(self, x): libc.eval_id(self.model, x) @@ -33,49 +38,34 @@ def sampling(self): s = libc.sampling(self.model) return s - def generate(self, end=""): - ret = b"" - end = end.encode() - for _ in range(500): - tmp = self.sampling() # .decode() - if (ret+tmp).endswith(end): - break - ret += tmp - return ret.decode() - def stream_generate(self, end=""): ret = b"" end = end.encode() - head = b"" - for _ in range(500): - tmp = self.sampling() # .decode() + for _ in range(self.max_tgt_len): + tmp = self.sampling() ret += tmp - try: - text = (head + tmp).decode() - print(text, end="") - head = b"" - except: - head += text + yield tmp if ret.endswith(end): break + + def generate_with_print(self, end=""): + ret = b"" + for i in self.stream_generate(end=end): + ret += i + print(i.decode(errors="replace"), end="", flush=True) print("") - return ret.decode() + return ret.decode(errors="replace") + def generate(self, end=""): + text = 
b"".join(self.stream_generate(end=end)) + return text.decode(errors="replace") + if __name__ == "__main__": model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"]) - # print(model) model.eval_string("""user: what is the color of the flag of UN?""") - # model.eval_token(100) x = np.random.random((5120,10))# , dtype=np.float32) model.eval_float(x) model.eval_string("""assistant:""") - # print(x[0,0], x[0,1],x[1,0]) - # model.eval_float(x) - # print(libc) - - for i in range(500): - tmp = model.sampling().decode() - if tmp == "": - break - print(tmp, end="", flush=True) + for i in model.generate(): + print(i.decode(errors="replace"), end="", flush=True) diff --git a/examples/embd_input/llava.py b/examples/embd-input/llava.py similarity index 94% rename from examples/embd_input/llava.py rename to examples/embd-input/llava.py index a1efaddf654c4..2f20cb7225b20 100644 --- a/examples/embd_input/llava.py +++ b/examples/embd-input/llava.py @@ -31,7 +31,7 @@ def chat(self, question): self.model.eval_string("user: ") self.model.eval_string(question) self.model.eval_string("\nassistant: ") - return self.model.generate() + return self.model.generate_with_print() def chat_with_image(self, image, question): with torch.no_grad(): @@ -49,7 +49,7 @@ def chat_with_image(self, image, question): self.model.eval_token(32003-1) # im_end self.model.eval_string(question) self.model.eval_string("\nassistant: ") - return self.model.generate() + return self.model.generate_with_print() if __name__=="__main__": @@ -63,8 +63,8 @@ def chat_with_image(self, image, question): respose = a.chat_with_image( Image.open("./media/llama1-logo.png").convert('RGB'), "what is the text in the picture?") - print(respose) - print(a.chat("what is the color of it?")) + respose + a.chat("what is the color of it?") diff --git a/examples/embd_input/panda_gpt.py b/examples/embd-input/panda_gpt.py similarity index 81% rename from examples/embd_input/panda_gpt.py rename to examples/embd-input/panda_gpt.py index b1199b95df7e5..0cfac5f32adf2 100644 --- a/examples/embd_input/panda_gpt.py +++ b/examples/embd-input/panda_gpt.py @@ -7,11 +7,13 @@ import torch # use PandaGPT path -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "PandaGPT","code","model")) +panda_gpt_path = os.path.join(os.path.dirname(__file__), "PandaGPT") +imagebind_ckpt_path = "./models/panda_gpt/" + +sys.path.insert(0, os.path.join(panda_gpt_path,"code","model")) from ImageBind.models import imagebind_model from ImageBind import data -imagebind_ckpt_path = "./models/panda_gpt/" ModalityType = imagebind_model.ModalityType max_tgt_len = 400 @@ -31,25 +33,25 @@ def load_projection(self, path): "weight": state["llama_proj.weight"], "bias": state["llama_proj.bias"]}) + def eval_inputs(self, inputs): + self.model.eval_string("") + embds = self.extract_multimoal_feature(inputs) + for i in embds: + self.model.eval_float(i.T) + self.model.eval_string(" ") + def chat(self, question): + return self.chat_with_image(None, question) + + def chat_with_image(self, inputs, question): if self.generated_text == "": self.model.eval_string("###") self.model.eval_string(" Human: ") + if inputs: + self.eval_inputs(inputs) self.model.eval_string(question) self.model.eval_string("\n### Assistant:") - ret = self.model.stream_generate(end="###") - self.generated_text += ret - return ret - - def chat_with_image(self, inputs, question): - if self.generated_text == "": - self.model.eval_string("###") - self.model.eval_string(" Human: ") - embds = 
self.extract_multimoal_feature(inputs) - for i in embds: - self.model.eval_float(i.T) - self.model.eval_string(" " + question + "\n### Assistant:") - ret = self.model.stream_generate(end="###") + ret = self.model.generate_with_print(end="###") self.generated_text += ret return ret @@ -88,13 +90,9 @@ def encode_data(self, data_type, data_paths): if __name__=="__main__": - # model form liuhaotian/LLaVA-13b-delta-v1-1 a = PandaGPT(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048", "--lora", "./models/panda_gpt/ggml-adapter-model.bin","--temp", "0"]) - # Extract from https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin. - # Also here can use pytorch_model-00003-of-00003.bin directly. a.load_projection("./models/panda_gpt/adapter_model.bin") a.chat_with_image( {"image_paths": ["./media/llama1-logo.png"]}, "what is the text in the picture? 'llama' or 'lambda'?") a.chat("what is the color of it?") - diff --git a/examples/embd_input/embd_input_lib.cpp b/examples/embd_input/embd_input_lib.cpp deleted file mode 100644 index bbdf6d645b17f..0000000000000 --- a/examples/embd_input/embd_input_lib.cpp +++ /dev/null @@ -1,283 +0,0 @@ -// Defines sigaction on msys: -#ifndef _GNU_SOURCE -#define _GNU_SOURCE -#endif - -#include "embd_input.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) -#include -#include -#elif defined (_WIN32) -#define WIN32_LEAN_AND_MEAN -#define NOMINMAX -#include -#include -#endif - -static console_state con_st; -static llama_context ** g_ctx; - -static bool is_interacting = false; - -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) -void sigint_handler(int signo) { - if (signo == SIGINT) { - if (!is_interacting) { - is_interacting=true; - } else { - console_cleanup(con_st); - printf("\n"); - llama_print_timings(*g_ctx); - _exit(130); - } - } -} -#endif - - -extern "C" { - -struct MyModel* create_mymodel(int argc, char ** argv) { - gpt_params params; - - if (gpt_params_parse(argc, argv, params) == false) { - return nullptr; - } - - - if (params.n_ctx > 2048) { - fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);" - "expect poor results\n", __func__, params.n_ctx); - } - - fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); - - if (params.seed < 0) { - params.seed = time(NULL); - } - - fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); - - std::mt19937 rng(params.seed); - if (params.random_prompt) { - params.prompt = gpt_random_prompt(rng); - } - - llama_init_backend(); - - llama_context * ctx; - g_ctx = &ctx; - - // load the model and apply lora adapter, if any - ctx = llama_init_from_gpt_params(params); - if (ctx == NULL) { - fprintf(stderr, "%s: error: unable to load model\n", __func__); - return nullptr; - } - - // print system information - { - fprintf(stderr, "\n"); - fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", - params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); - } - struct MyModel* ret= new MyModel(); - ret->ctx = ctx; - ret->params = params; - ret->n_past = 0; - // printf("ctx: %d\n", ret->ctx); - return ret; -} - -void free_mymodel(struct MyModel* mymodel) { - llama_context* ctx = mymodel->ctx; - llama_print_timings(ctx); - llama_free(ctx); - delete mymodel; -} - - -bool eval_float(void* model, float* input, int 
N){ - MyModel* mymodel = (MyModel* )model; - llama_context* ctx = mymodel->ctx; - gpt_params params = mymodel->params; - int n_emb = llama_n_embd(ctx); - int n_past = mymodel->n_past; - // printf("%f,%f\n", *input, *(input+1)); - int n_batch = N; // params.n_batch; - for (int i = 0; i < (int) N; i += n_batch) { - int n_eval = (int) N - i; - if (n_eval > n_batch) { - n_eval = n_batch; - } - if (llama_eval_float(ctx, (input+i*n_emb), n_eval, n_past, params.n_threads)) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return false; - } - n_past += n_eval; - } - mymodel->n_past = n_past; - return true; -} - - - - - -bool eval_tokens(void* model, std::vector tokens) { - MyModel* mymodel = (MyModel* )model; - // printf("model: %d\n", mymodel); - llama_context* ctx;// = mymodel->ctx; - // printf("ctx2: %d\n", ctx); - // printf("ctx2: %d\n", mymodel->ctx); - ctx = mymodel->ctx; - // printf("ctx2: %d\n", ctx); - gpt_params params = mymodel->params; - // printf("\n%d\n", params); - int n_past = mymodel->n_past; - for (int i = 0; i < (int) tokens.size(); i += params.n_batch) { - int n_eval = (int) tokens.size() - i; - if (n_eval > params.n_batch) { - n_eval = params.n_batch; - } - // printf("%d, %d, %d\n", i, n_eval, n_past); - if (llama_eval(ctx, &tokens[i], n_eval, n_past, params.n_threads)) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return false; - } - n_past += n_eval; - } - mymodel->n_past = n_past; - return true; -} - -bool eval_id(struct MyModel* mymodel, int id) { - // printf("%d\n", id); - std::vector tokens; - tokens.push_back(id); - // printf("%d\n", tokens.size()); - // printf("%d\n", tokens[0]); - return eval_tokens(mymodel, tokens); -} - - -bool eval_string(struct MyModel* mymodel,const char* str){ - // std::cout << "eval " << std::endl; - // printf("%s", str); - llama_context* ctx = mymodel->ctx; - std::string str2 = str; - // printf("%s", str2.c_str()); - std::cout << str2 << std::endl; - std::vector embd_inp = ::llama_tokenize(ctx, str2, true); - eval_tokens(mymodel, embd_inp); - return true; -} - - - - -llama_token sampling_id(struct MyModel* mymodel) { - llama_context* ctx = mymodel->ctx; - gpt_params params = mymodel->params; - // int n_ctx = llama_n_ctx(ctx); - - - // out of user input, sample next token - const float temp = params.temp; - const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; - const float top_p = params.top_p; - const float tfs_z = params.tfs_z; - const float typical_p = params.typical_p; - // const int32_t repeat_last_n = params.repeat_last_n < 0 ? 
n_ctx : params.repeat_last_n; - // const float repeat_penalty = params.repeat_penalty; - // const float alpha_presence = params.presence_penalty; - // const float alpha_frequency = params.frequency_penalty; - const int mirostat = params.mirostat; - const float mirostat_tau = params.mirostat_tau; - const float mirostat_eta = params.mirostat_eta; - // const bool penalize_nl = params.penalize_nl; - - llama_token id = 0; - - { - auto logits = llama_get_logits(ctx); - auto n_vocab = llama_n_vocab(ctx); - - // Apply params.logit_bias map - for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { - logits[it->first] += it->second; - } - - std::vector candidates; - candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); - } - - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - - // Apply penalties -// float nl_logit = logits[llama_token_nl()]; -// auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); -// llama_sample_repetition_penalty(ctx, &candidates_p, -// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, -// last_n_repeat, repeat_penalty); -// llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, -// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, -// last_n_repeat, alpha_frequency, alpha_presence); -// if (!penalize_nl) { -// logits[llama_token_nl()] = nl_logit; -// } - - if (temp <= 0) { - // Greedy sampling - id = llama_sample_token_greedy(ctx, &candidates_p); - } else { - if (mirostat == 1) { - static float mirostat_mu = 2.0f * mirostat_tau; - const int mirostat_m = 100; - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); - } else if (mirostat == 2) { - static float mirostat_mu = 2.0f * mirostat_tau; - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); - } else { - // Temperature sampling - llama_sample_top_k(ctx, &candidates_p, top_k, 1); - llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); - llama_sample_typical(ctx, &candidates_p, typical_p, 1); - llama_sample_top_p(ctx, &candidates_p, top_p, 1); - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token(ctx, &candidates_p); - } - } - - } - return id; -} - -const char* sampling(struct MyModel* mymodel) { - llama_context* ctx = mymodel->ctx; - int id = sampling_id(mymodel); - - std::string ret; - if (id == llama_token_eos()) ret = ""; - else ret = llama_token_to_str(ctx, id); - eval_id(mymodel, id); - return ret.c_str(); -} - -} diff --git a/llama.cpp b/llama.cpp index 65890663b5967..9c983da3a6916 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1342,15 +1342,33 @@ static bool llama_model_load( } } -static bool llama_eval_internal_tensor( - llama_context& lctx, - ggml_context* ctx0, - ggml_tensor* inpL, - const int n_tokens, - const int n_past, - const int n_threads, - const char * cgraph_fname, - const int64_t t_start_us) { +// evaluate the transformer +// +// - lctx: llama context +// - tokens: new batch of tokens to process +// - n_tokens number of tokens +// - embd embeddings input +// - n_past: the context size so far +// - n_threads: number of threads to use +// +static bool llama_eval_internal( + llama_context & lctx, + const llama_token * tokens, + 
const int n_tokens, + const float * embd, + const int n_past, + const int n_threads, + const char * cgraph_fname) { + + LLAMA_ASSERT((!tokens && embd) || (tokens && !embd)); + + // enforce that the first token is BOS + if (tokens && n_past == 0 && tokens[0] != llama_token_bos()) { + fprintf(stderr, "%s: first token must be BOS\n", __func__); + return false; + } + + const int64_t t_start_us = ggml_time_us(); const int N = n_tokens; @@ -1359,7 +1377,6 @@ static bool llama_eval_internal_tensor( const auto & kv_self = model.kv_self; - LLAMA_ASSERT(!!kv_self.ctx); const int n_embd = hparams.n_embd; @@ -1371,6 +1388,15 @@ static bool llama_eval_internal_tensor( const int n_gpu_layers = model.n_gpu_layers; auto & mem_per_token = lctx.mem_per_token; + auto & buf_compute = lctx.buf_compute; + + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute.size, + /*.mem_buffer =*/ buf_compute.addr, + /*.no_alloc =*/ false, + }; + + struct ggml_context * ctx0 = ggml_init(params); // for big prompts, if BLAS is enabled, it is better to use only one thread // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance @@ -1378,6 +1404,17 @@ static bool llama_eval_internal_tensor( gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + if (tokens) { + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_set_name(embd, "embd"); + memcpy(embd->data, tokens, N*ggml_element_size(embd)); + inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd); + } else { + inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); + memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); + } const int i_gpu_start = n_layer - n_gpu_layers; (void) i_gpu_start; @@ -1746,53 +1783,6 @@ static bool llama_eval_internal_tensor( return true; } - -// evaluate the transformer -// -// - lctx: llama context -// - tokens: new batch of tokens to process -// - n_past: the context size so far -// - n_threads: number of threads to use -// -static bool llama_eval_internal( - llama_context & lctx, - const llama_token * tokens, - const int n_tokens, - const int n_past, - const int n_threads, - const char * cgraph_fname) { - - // enforce that the first token is BOS - if (n_past == 0 && tokens[0] != llama_token_bos()) { - fprintf(stderr, "%s: first token must be BOS\n", __func__); - return false; - } - - const auto & model = lctx.model; - - const int64_t t_start_us = ggml_time_us(); - - const int N = n_tokens; - - auto & buf_compute = lctx.buf_compute; - - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.addr, - /*.no_alloc =*/ false, - }; - - struct ggml_context * ctx0 = ggml_init(params); - - - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_set_name(embd, "embd"); - memcpy(embd->data, tokens, N*ggml_element_size(embd)); - - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd); - return llama_eval_internal_tensor(lctx, ctx0, inpL, N, n_past, n_threads, cgraph_fname, t_start_us); -} - // // tokenizer // @@ -3357,7 +3347,7 @@ int llama_eval( int n_tokens, int n_past, int n_threads) { - if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) { + if (!llama_eval_internal(*ctx, tokens, n_tokens, nullptr, n_past, n_threads, nullptr)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } @@ -3373,32 +3363,13 @@ int llama_eval( } -int 
llama_eval_float( - struct llama_context * ctx, - const float * input, - int n_tokens, - int n_past, - int n_threads) { - const auto & model = ctx->model; - - const int64_t t_start_us = ggml_time_us(); - - const int N = n_tokens; - - auto & buf_compute = ctx->buf_compute; - - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.addr, - /*.no_alloc =*/ false, - }; - - struct ggml_context * ctx0 = ggml_init(params); - - struct ggml_tensor *inpL = - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, model.hparams.n_embd, N); - memcpy(inpL->data, input, N * model.hparams.n_embd * ggml_element_size(inpL)); - if (!llama_eval_internal_tensor(*ctx, ctx0, inpL, N, n_past, n_threads, nullptr, t_start_us)) { +int llama_eval_embd( + struct llama_context * ctx, + const float * embd, + int n_tokens, + int n_past, + int n_threads) { + if (!llama_eval_internal(*ctx, nullptr, n_tokens, embd, n_past, n_threads, nullptr)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } @@ -3419,7 +3390,7 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) { const std::vector tmp(n_batch, llama_token_bos()); - if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) { + if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), nullptr, n_ctx, 1, fname)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } diff --git a/llama.h b/llama.h index ad975166bd28e..2183b12fa1ce1 100644 --- a/llama.h +++ b/llama.h @@ -200,9 +200,9 @@ extern "C" { int n_threads); // Same as llama_eval, but use float matrix input directly. - LLAMA_API int llama_eval_float( + LLAMA_API int llama_eval_embd( struct llama_context * ctx, - const float * embds, + const float * embd, int n_tokens, int n_past, int n_threads); From beca5a61859b3af64bd773408d660f2a954370d8 Mon Sep 17 00:00:00 2001 From: ningshanwutuobang Date: Sun, 25 Jun 2023 06:15:13 +0800 Subject: [PATCH 13/17] add cmake build for embd-input --- Makefile | 2 +- examples/CMakeLists.txt | 1 + examples/embd-input/embd-input-lib.cpp | 6 ++++-- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 058a234f88b42..b69301f478543 100644 --- a/Makefile +++ b/Makefile @@ -265,7 +265,7 @@ libllama.so: llama.o ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) clean: - rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot train-text-from-scratch build-info.h + rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot train-text-from-scratch embd-input-test build-info.h # # Examples diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index cf9c4a2231337..161960bb853cc 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -39,6 +39,7 @@ else() add_subdirectory(baby-llama) add_subdirectory(train-text-from-scratch) add_subdirectory(simple) + add_subdirectory(embd-input) if (LLAMA_METAL) add_subdirectory(metal) endif() diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp index 37a5b52086ccf..83fcd065ceb8e 100644 --- a/examples/embd-input/embd-input-lib.cpp +++ b/examples/embd-input/embd-input-lib.cpp @@ -36,12 +36,14 @@ struct MyModel* create_mymodel(int argc, char ** argv) { llama_init_backend(); + llama_model * model; llama_context * ctx; + g_ctx = &ctx; // load the model and apply lora adapter, if any - ctx = llama_init_from_gpt_params(params); - if (ctx == NULL) { + std::tie(model, 
ctx) = llama_init_from_gpt_params(params); + if (model == NULL) { fprintf(stderr, "%s: error: unable to load model\n", __func__); return nullptr; } From 6cb62ca2873c7f79c0e7e5189accb63866597c7a Mon Sep 17 00:00:00 2001 From: ningshanwutuobang Date: Sun, 25 Jun 2023 06:17:30 +0800 Subject: [PATCH 14/17] add cmake build for embd-input --- examples/embd-input/CMakeLists.txt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 examples/embd-input/CMakeLists.txt diff --git a/examples/embd-input/CMakeLists.txt b/examples/embd-input/CMakeLists.txt new file mode 100644 index 0000000000000..2b623953e8061 --- /dev/null +++ b/examples/embd-input/CMakeLists.txt @@ -0,0 +1,15 @@ +set(TARGET embdinput) +add_library(${TARGET} embd-input-lib.cpp embd-input.h) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() + +set(TARGET embd-input-test) +add_executable(${TARGET} embd-input-test.cpp) +target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() From 40340d82af4ca1fe9ca519b0e7cc42dff427406f Mon Sep 17 00:00:00 2001 From: ningshanwutuobang Date: Sun, 25 Jun 2023 23:42:26 +0800 Subject: [PATCH 15/17] Add MiniGPT-4 example --- .gitignore | 2 +- examples/embd-input/.gitignore | 4 + examples/embd-input/README.md | 36 ++++++--- examples/embd-input/minigpt4.py | 128 ++++++++++++++++++++++++++++++++ 4 files changed, 159 insertions(+), 11 deletions(-) create mode 100644 examples/embd-input/.gitignore create mode 100644 examples/embd-input/minigpt4.py diff --git a/.gitignore b/.gitignore index 9116729fd5f12..4fccec31b8114 100644 --- a/.gitignore +++ b/.gitignore @@ -40,7 +40,7 @@ models/* /vdot /server /Pipfile -/embd_input_test +/embd-input-test /libllama.so build-info.h arm_neon.h diff --git a/examples/embd-input/.gitignore b/examples/embd-input/.gitignore new file mode 100644 index 0000000000000..87ef68771de5e --- /dev/null +++ b/examples/embd-input/.gitignore @@ -0,0 +1,4 @@ +PandaGPT +MiniGPT-4 +*.pth + diff --git a/examples/embd-input/README.md b/examples/embd-input/README.md index eb1095f24fe96..02d028f261f17 100644 --- a/examples/embd-input/README.md +++ b/examples/embd-input/README.md @@ -1,17 +1,17 @@ ### Examples for input embedding directly ## Requirement -build `libembd_input.so` +build `libembdinput.so` run the following comman in main dir (../../). ``` make ``` -## LLAVA example (llava.py) +## [LLaVA](https://github.com/haotian-liu/LLaVA/) example (llava.py) -1. obtian llava model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/) -2. convert it to ggml format -3. llava_projection.pth is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin) +1. Obtian LLaVA model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/). +2. Convert it to ggml format. +3. `llava_projection.pth` is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin). 
 ```
 import torch
 
@@ -23,10 +23,12 @@ dic = torch.load(bin_path)
 used_key = ["model.mm_projector.weight","model.mm_projector.bias"]
 torch.save({k: dic[k] for k in used_key}, pth_path)
 ```
+4. Check the path of the LLaVA model and `llava_projection.pth` in `llava.py`.
 
-## PandaGPT example (panda_gpt.py)
-1. Obtian PandaGPT lora model. Rename the file to `adapter_model.bin`. Use [convert-lora-to-ggml.py](../../convert-lora-to-ggml.py) to convert it to ggml format.
+## [PandaGPT](https://github.com/yxuansu/PandaGPT) example (panda_gpt.py)
+
+1. Obtain the PandaGPT lora model from https://github.com/yxuansu/PandaGPT. Rename the file to `adapter_model.bin`. Use [convert-lora-to-ggml.py](../../convert-lora-to-ggml.py) to convert it to ggml format.
 The `adapter_config.json` is
 ```
 {
@@ -40,8 +42,22 @@ The `adapter_config.json` is
   "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"]
 }
 ```
-2. papare the `vicuna` v0 model.
-3. obtain the [ImageBind](https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth) model.
+2. Prepare the `vicuna` v0 model.
+3. Obtain the [ImageBind](https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth) model.
 4. Clone the PandaGPT source.
-5. check the path of PandaGPT source, ImageBind model, lora model and vicuna model in panda_gpt.py.
+```
+git clone https://github.com/yxuansu/PandaGPT
+```
+5. Install the requirements of PandaGPT.
+6. Check the paths of the PandaGPT source, ImageBind model, lora model and vicuna model in panda_gpt.py.
 
+## [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4/) example (minigpt4.py)
+
+1. Obtain the MiniGPT-4 model from https://github.com/Vision-CAIR/MiniGPT-4/ and put it in `embd-input`.
+2. Clone the MiniGPT-4 source.
+```
+git clone https://github.com/Vision-CAIR/MiniGPT-4/
+```
+3. Install the requirements of MiniGPT-4.
+4. Prepare the `vicuna` v0 model.
+5. Check the paths of the MiniGPT-4 source, MiniGPT-4 model and vicuna model in `minigpt4.py`.
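All three wrappers documented above go through the small C interface exported by `libembdinput.so` (`create_mymodel`, `eval_string`, `eval_float`, `sampling`, `free_mymodel`). The following is a minimal C++ sketch of that flow; it is illustrative only and not part of the patch series — the model path is a placeholder, the float buffer stands in for real projected features, and `embd-input.h` is assumed to pull in the llama/common declarations it needs.

```
// Minimal sketch only (not part of the patch series). Assumes embd-input.h also
// exposes llama.h/common.h declarations, and that a ggml model exists at the
// placeholder path below.
#include "embd-input.h"

#include <cstdio>
#include <vector>

int main() {
    const char * args[] = {"embd-input-test", "--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048"};
    MyModel * mymodel = create_mymodel(5, const_cast<char **>(args));
    if (mymodel == nullptr) {
        return 1;
    }

    const int n_embd   = llama_n_embd(mymodel->ctx);
    const int n_tokens = 10;
    std::vector<float> feats(n_tokens * n_embd, 0.1f); // stand-in for projected image/audio features

    eval_string(mymodel, "### Human: ");
    eval_float(mymodel, feats.data(), n_tokens);       // inject float embeddings directly
    eval_string(mymodel, " describe the input.\n### Assistant:");

    for (int i = 0; i < 64; i++) {                     // sample a short reply
        printf("%s", sampling(mymodel));
    }
    printf("\n");

    free_mymodel(mymodel);
    return 0;
}
```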
diff --git a/examples/embd-input/minigpt4.py b/examples/embd-input/minigpt4.py new file mode 100644 index 0000000000000..8e98f85179c4e --- /dev/null +++ b/examples/embd-input/minigpt4.py @@ -0,0 +1,128 @@ +import sys +import os +sys.path.insert(0, os.path.dirname(__file__)) +from embd_input import MyModel +import numpy as np +from torch import nn +import torch +from PIL import Image + +minigpt4_path = os.path.join(os.path.dirname(__file__), "MiniGPT-4") +sys.path.insert(0, minigpt4_path) +from minigpt4.models.blip2 import Blip2Base +from minigpt4.processors.blip_processors import Blip2ImageEvalProcessor + + +class MiniGPT4(Blip2Base): + """ + MiniGPT4 model from https://github.com/Vision-CAIR/MiniGPT-4 + """ + def __init__(self, + args, + vit_model="eva_clip_g", + q_former_model="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth", + img_size=224, + drop_path_rate=0, + use_grad_checkpoint=False, + vit_precision="fp32", + freeze_vit=True, + freeze_qformer=True, + num_query_token=32, + llama_model="", + prompt_path="", + prompt_template="", + max_txt_len=32, + end_sym='\n', + low_resource=False, # use 8 bit and put vit in cpu + device_8bit=0 + ): + super().__init__() + self.img_size = img_size + self.low_resource = low_resource + self.preprocessor = Blip2ImageEvalProcessor(img_size) + + print('Loading VIT') + self.visual_encoder, self.ln_vision = self.init_vision_encoder( + vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision + ) + print('Loading VIT Done') + print('Loading Q-Former') + self.Qformer, self.query_tokens = self.init_Qformer( + num_query_token, self.visual_encoder.num_features + ) + self.Qformer.cls = None + self.Qformer.bert.embeddings.word_embeddings = None + self.Qformer.bert.embeddings.position_embeddings = None + for layer in self.Qformer.bert.encoder.layer: + layer.output = None + layer.intermediate = None + self.load_from_pretrained(url_or_filename=q_former_model) + print('Loading Q-Former Done') + self.llama_proj = nn.Linear( + self.Qformer.config.hidden_size, 5120 # self.llama_model.config.hidden_size + ) + self.max_txt_len = max_txt_len + self.end_sym = end_sym + self.model = MyModel(["main", *args]) + # system promt + self.model.eval_string("Give the following image: ImageContent. " + "You will be able to see the image once I provide it to you. Please answer my questions." 
+ "###") + + def encode_img(self, image): + image = self.preprocessor(image) + image = image.unsqueeze(0) + device = image.device + if self.low_resource: + self.vit_to_cpu() + image = image.to("cpu") + + with self.maybe_autocast(): + image_embeds = self.ln_vision(self.visual_encoder(image)).to(device) + image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(device) + + query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) + query_output = self.Qformer.bert( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=True, + ) + + inputs_llama = self.llama_proj(query_output.last_hidden_state) + # atts_llama = torch.ones(inputs_llama.size()[:-1], dtype=torch.long).to(image.device) + return inputs_llama + + def load_projection(self, path): + state = torch.load(path)["model"] + self.llama_proj.load_state_dict({ + "weight": state["llama_proj.weight"], + "bias": state["llama_proj.bias"]}) + + def chat(self, question): + self.model.eval_string("Human: ") + self.model.eval_string(question) + self.model.eval_string("\n### Assistant:") + return self.model.generate_with_print(end="###") + + def chat_with_image(self, image, question): + with torch.no_grad(): + embd_image = self.encode_img(image) + embd_image = embd_image.cpu().numpy()[0] + self.model.eval_string("Human: ") + self.model.eval_float(embd_image.T) + self.model.eval_string(" ") + self.model.eval_string(question) + self.model.eval_string("\n### Assistant:") + return self.model.generate_with_print(end="###") + + +if __name__=="__main__": + a = MiniGPT4(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048"]) + a.load_projection(os.path.join( + os.path.dirname(__file__) , + "pretrained_minigpt4.pth")) + respose = a.chat_with_image( + Image.open("./media/llama1-logo.png").convert('RGB'), + "what is the text in the picture?") + a.chat("what is the color of it?") From 39011ad7c479c5dfa325940b9efac53a1011df3e Mon Sep 17 00:00:00 2001 From: ningshanwutuobang Date: Tue, 27 Jun 2023 04:06:20 +0800 Subject: [PATCH 16/17] change the order of the args of llama_eval_internal --- llama.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llama.cpp b/llama.cpp index 90758aedc9ef5..672270a7f2b41 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1363,16 +1363,16 @@ static bool llama_model_load( // // - lctx: llama context // - tokens: new batch of tokens to process -// - n_tokens number of tokens // - embd embeddings input +// - n_tokens number of tokens // - n_past: the context size so far // - n_threads: number of threads to use // static bool llama_eval_internal( llama_context & lctx, const llama_token * tokens, - const int n_tokens, const float * embd, + const int n_tokens, const int n_past, const int n_threads, const char * cgraph_fname) { @@ -3420,7 +3420,7 @@ int llama_eval( int n_tokens, int n_past, int n_threads) { - if (!llama_eval_internal(*ctx, tokens, n_tokens, nullptr, n_past, n_threads, nullptr)) { + if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } @@ -3442,7 +3442,7 @@ int llama_eval_embd( int n_tokens, int n_past, int n_threads) { - if (!llama_eval_internal(*ctx, nullptr, n_tokens, embd, n_past, n_threads, nullptr)) { + if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } @@ -3463,7 +3463,7 @@ int llama_eval_export(struct 
llama_context * ctx, const char * fname) {
 
     const std::vector<llama_token> tmp(n_batch, llama_token_bos());
 
-    if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), nullptr, n_ctx, 1, fname)) {
+    if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }

From 7abb513e9413c58d4dde0d6fc26b9137900a6950 Mon Sep 17 00:00:00 2001
From: ningshanwutuobang
Date: Tue, 27 Jun 2023 20:38:52 +0800
Subject: [PATCH 17/17] fix ci error

---
 examples/embd-input/embd-input-lib.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp
index 83fcd065ceb8e..37de52ad6e37c 100644
--- a/examples/embd-input/embd-input-lib.cpp
+++ b/examples/embd-input/embd-input-lib.cpp
@@ -34,7 +34,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
     }
     fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
 
-    llama_init_backend();
+    llama_init_backend(params.numa);
 
     llama_model * model;
     llama_context * ctx;
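Taken together, the series adds one new public entry point, `llama_eval_embd`, next to `llama_eval`. Below is a minimal sketch of how a caller might push pre-computed embeddings through it; it is illustrative only, assumes an already-initialized `llama_context`, and uses a zero-filled buffer in place of real projected features (`feed_embeddings` and `example` are hypothetical helper names, not part of the patch).

```
// Minimal sketch only (not part of the patch series): feed pre-computed float
// embeddings into an already-initialized llama_context via llama_eval_embd.
#include "llama.h"

#include <vector>

// Push n_tokens rows of n_embd floats into the context; n_past tracks how much
// of the context window has been consumed so far.
static bool feed_embeddings(llama_context * ctx, const float * embd, int n_tokens, int & n_past, int n_threads) {
    if (llama_eval_embd(ctx, embd, n_tokens, n_past, n_threads) != 0) {
        return false; // evaluation failed
    }
    n_past += n_tokens; // the embeddings now occupy n_tokens positions of the context
    return true;
}

static void example(llama_context * ctx, int & n_past) {
    const int n_embd = llama_n_embd(ctx);
    std::vector<float> feats(8 * n_embd, 0.0f); // stand-in for 8 "tokens" of projected features
    if (feed_embeddings(ctx, feats.data(), 8, n_past, /*n_threads=*/4)) {
        const float * logits = llama_get_logits(ctx); // next-token logits after the embedded input
        (void) logits;                                // sample from these as usual
    }
}
```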