From 50ce29667f9ec50a4fd730fb0b76a28267b4b587 Mon Sep 17 00:00:00 2001 From: ningshanwutuobang Date: Sat, 3 Jun 2023 18:51:58 +0800 Subject: [PATCH 01/17] add interface for float input --- llama.cpp | 224 +++++++++++++++++++++++++++++++++++++++++++++--------- llama.h | 7 ++ 2 files changed, 197 insertions(+), 34 deletions(-) diff --git a/llama.cpp b/llama.cpp index 47b4c8dd7ffb2..fff90a1432bd3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1194,27 +1194,14 @@ static bool llama_model_load( } } -// evaluate the transformer -// -// - lctx: llama context -// - tokens: new batch of tokens to process -// - n_past: the context size so far -// - n_threads: number of threads to use -// -static bool llama_eval_internal( - llama_context & lctx, - const llama_token * tokens, +static bool llama_eval_internal_tensor( + llama_context& lctx, + ggml_context* ctx0, + ggml_tensor* inpL, const int n_tokens, const int n_past, - const int n_threads) { - - // enforce that the first token is BOS - if (n_past == 0 && tokens[0] != llama_token_bos()) { - fprintf(stderr, "%s: first token must be BOS\n", __func__); - return false; - } - - const int64_t t_start_us = ggml_time_us(); + const int n_threads, + const int64_t t_start_us) { const int N = n_tokens; @@ -1223,8 +1210,6 @@ static bool llama_eval_internal( const auto & kv_self = model.kv_self; - LLAMA_ASSERT(!!kv_self.ctx); - const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; const int n_ctx = hparams.n_ctx; @@ -1233,26 +1218,14 @@ static bool llama_eval_internal( const int n_rot = hparams.n_embd/hparams.n_head; auto & mem_per_token = lctx.mem_per_token; - auto & buf_compute = lctx.buf_compute; - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.addr, - /*.no_alloc =*/ false, - }; - - struct ggml_context * ctx0 = ggml_init(params); + LLAMA_ASSERT(!!kv_self.ctx); // for big prompts, if BLAS is enabled, it is better to use only one thread // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance ggml_cgraph gf = {}; gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 
1 : n_threads; - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_set_name(embd, "embd"); - memcpy(embd->data, tokens, N*ggml_element_size(embd)); - - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -1494,6 +1467,52 @@ static bool llama_eval_internal( return true; } + +// evaluate the transformer +// +// - lctx: llama context +// - tokens: new batch of tokens to process +// - n_past: the context size so far +// - n_threads: number of threads to use +// +static bool llama_eval_internal( + llama_context & lctx, + const llama_token * tokens, + const int n_tokens, + const int n_past, + const int n_threads) { + + // enforce that the first token is BOS + if (n_past == 0 && tokens[0] != llama_token_bos()) { + fprintf(stderr, "%s: first token must be BOS\n", __func__); + return false; + } + + const auto & model = lctx.model; + + const int64_t t_start_us = ggml_time_us(); + + const int N = n_tokens; + + auto & buf_compute = lctx.buf_compute; + + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute.size, + /*.mem_buffer =*/ buf_compute.addr, + /*.no_alloc =*/ false, + }; + + struct ggml_context * ctx0 = ggml_init(params); + + + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_set_name(embd, "embd"); + memcpy(embd->data, tokens, N*ggml_element_size(embd)); + + struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd); + return llama_eval_internal_tensor(lctx, ctx0, inpL, N, n_past, n_threads, t_start_us); +} + // // tokenizer // @@ -2214,6 +2233,97 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } + +ggml_tensor *quantize_float_tensor(ggml_context *ctx0, ggml_tensor* tensor, + llama_ftype ftype, int nthread) { + + ggml_type quantized_type; + switch (ftype) { + case LLAMA_FTYPE_MOSTLY_Q4_0: + quantized_type = GGML_TYPE_Q4_0; + break; + case LLAMA_FTYPE_MOSTLY_Q4_1: + quantized_type = GGML_TYPE_Q4_1; + break; + case LLAMA_FTYPE_MOSTLY_Q5_0: + quantized_type = GGML_TYPE_Q5_0; + break; + case LLAMA_FTYPE_MOSTLY_Q5_1: + quantized_type = GGML_TYPE_Q5_1; + break; + case LLAMA_FTYPE_MOSTLY_Q8_0: + quantized_type = GGML_TYPE_Q8_0; + break; + default: + throw format("invalid output file type %d\n", ftype); + }; + void *new_data; + size_t new_size; + llama_buffer work; + float *f32_data; + size_t nelements = tensor->ne[0] * tensor->ne[1]; + llama_buffer f32_conv_buf; + f32_data = (float *)tensor->data; + work.resize(nelements * 4); + new_data = work.addr; + std::vector hist_cur(1 << 4, 0); + std::vector workers; + std::mutex mutex; + enum ggml_type new_type = quantized_type; + + int chunk_size = 32 * 512; + const int nchunk = (nelements + chunk_size - 1) / chunk_size; + const int nthread_use = + nthread > 1 ? 
std::max(1, std::min(nthread, nchunk)) : 1; + if (nthread_use < 2) { + new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, + hist_cur.data()); + } else { + size_t counter = 0; + new_size = 0; + auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, + new_data, nelements, chunk_size]() { + std::vector local_hist; + size_t local_size = 0; + while (true) { + std::unique_lock lock(mutex); + size_t first = counter; + counter += chunk_size; + if (first >= nelements) { + if (!local_hist.empty()) { + for (int j = 0; j < int(local_hist.size()); ++j) { + hist_cur[j] += local_hist[j]; + } + new_size += local_size; + } + break; + } + lock.unlock(); + size_t last = std::min(nelements, first + chunk_size); + if (local_hist.empty()) { + local_hist.resize(hist_cur.size(), 0); + } + local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, + last - first, local_hist.data()); + } + }; + if ((int)workers.size() < nthread_use - 1) { + workers.resize(nthread_use - 1); + } + for (int it = 0; it < nthread_use - 1; ++it) { + workers[it] = std::thread(compute); + } + compute(); + for (int it = 0; it < nthread_use - 1; ++it) { + workers[it].join(); + } + } + ggml_tensor *ret = + ggml_new_tensor_2d(ctx0, new_type, tensor->ne[0], tensor->ne[1]); + memcpy(ret->data, new_data, new_size); + return ret; +} + // // interface implementation // @@ -2921,6 +3031,52 @@ int llama_eval( return 0; } +int llama_eval_float( + struct llama_context * ctx, + const float * input, + int n_tokens, + int n_past, + int n_threads) { + const auto & model = ctx->model; + + const int64_t t_start_us = ggml_time_us(); + + const int N = n_tokens; + + auto & buf_compute = ctx->buf_compute; + + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute.size, + /*.mem_buffer =*/ buf_compute.addr, + /*.no_alloc =*/ false, + }; + + struct ggml_context * ctx0 = ggml_init(params); + + + struct ggml_tensor *input_f = + ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, N * model.hparams.n_embd); + memcpy(input_f->data, input, + N * model.hparams.n_embd * ggml_element_size(input_f)); + struct ggml_tensor *inpL = + quantize_float_tensor(ctx0, input_f, model.hparams.ftype, n_threads); + + ; + if (!llama_eval_internal_tensor(*ctx, ctx0, inpL, N, n_past, n_threads, t_start_us)) { + fprintf(stderr, "%s: failed to eval\n", __func__); + return 1; + } + + // get a more accurate load time, upon first eval + // TODO: fix this + if (!ctx->has_evaluated_once) { + ctx->t_load_us = ggml_time_us() - ctx->t_start_us; + ctx->has_evaluated_once = true; + } + + return 0; +} + int llama_tokenize( struct llama_context * ctx, const char * text, diff --git a/llama.h b/llama.h index c6b0a2889f8de..3b984845c8d67 100644 --- a/llama.h +++ b/llama.h @@ -173,6 +173,13 @@ extern "C" { int n_past, int n_threads); + LLAMA_API int llama_eval_float( + struct llama_context * ctx, + const float * embds, + int n_tokens, + int n_past, + int n_threads); + // Convert the provided text into tokens. // The tokens pointer must be large enough to hold the resulting tokens. 
// Returns the number of tokens on success, no more than n_max_tokens From 5673a8de37b453ce1646a12bf3f0442956b68c02 Mon Sep 17 00:00:00 2001 From: ningshanwutuobang Date: Mon, 5 Jun 2023 21:39:35 +0800 Subject: [PATCH 02/17] fixed inpL shape and type --- llama.cpp | 101 ++---------------------------------------------------- 1 file changed, 3 insertions(+), 98 deletions(-) diff --git a/llama.cpp b/llama.cpp index fff90a1432bd3..c76b198123dae 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2234,95 +2234,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } -ggml_tensor *quantize_float_tensor(ggml_context *ctx0, ggml_tensor* tensor, - llama_ftype ftype, int nthread) { - - ggml_type quantized_type; - switch (ftype) { - case LLAMA_FTYPE_MOSTLY_Q4_0: - quantized_type = GGML_TYPE_Q4_0; - break; - case LLAMA_FTYPE_MOSTLY_Q4_1: - quantized_type = GGML_TYPE_Q4_1; - break; - case LLAMA_FTYPE_MOSTLY_Q5_0: - quantized_type = GGML_TYPE_Q5_0; - break; - case LLAMA_FTYPE_MOSTLY_Q5_1: - quantized_type = GGML_TYPE_Q5_1; - break; - case LLAMA_FTYPE_MOSTLY_Q8_0: - quantized_type = GGML_TYPE_Q8_0; - break; - default: - throw format("invalid output file type %d\n", ftype); - }; - void *new_data; - size_t new_size; - llama_buffer work; - float *f32_data; - size_t nelements = tensor->ne[0] * tensor->ne[1]; - llama_buffer f32_conv_buf; - f32_data = (float *)tensor->data; - work.resize(nelements * 4); - new_data = work.addr; - std::vector hist_cur(1 << 4, 0); - std::vector workers; - std::mutex mutex; - enum ggml_type new_type = quantized_type; - - int chunk_size = 32 * 512; - const int nchunk = (nelements + chunk_size - 1) / chunk_size; - const int nthread_use = - nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1; - if (nthread_use < 2) { - new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, - hist_cur.data()); - } else { - size_t counter = 0; - new_size = 0; - auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, - new_data, nelements, chunk_size]() { - std::vector local_hist; - size_t local_size = 0; - while (true) { - std::unique_lock lock(mutex); - size_t first = counter; - counter += chunk_size; - if (first >= nelements) { - if (!local_hist.empty()) { - for (int j = 0; j < int(local_hist.size()); ++j) { - hist_cur[j] += local_hist[j]; - } - new_size += local_size; - } - break; - } - lock.unlock(); - size_t last = std::min(nelements, first + chunk_size); - if (local_hist.empty()) { - local_hist.resize(hist_cur.size(), 0); - } - local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, - last - first, local_hist.data()); - } - }; - if ((int)workers.size() < nthread_use - 1) { - workers.resize(nthread_use - 1); - } - for (int it = 0; it < nthread_use - 1; ++it) { - workers[it] = std::thread(compute); - } - compute(); - for (int it = 0; it < nthread_use - 1; ++it) { - workers[it].join(); - } - } - ggml_tensor *ret = - ggml_new_tensor_2d(ctx0, new_type, tensor->ne[0], tensor->ne[1]); - memcpy(ret->data, new_data, new_size); - return ret; -} // // interface implementation @@ -3053,15 +2964,9 @@ int llama_eval_float( struct ggml_context * ctx0 = ggml_init(params); - - struct ggml_tensor *input_f = - ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, N * model.hparams.n_embd); - memcpy(input_f->data, input, - N * model.hparams.n_embd * ggml_element_size(input_f)); - struct ggml_tensor *inpL = - quantize_float_tensor(ctx0, input_f, model.hparams.ftype, n_threads); - - ; + struct ggml_tensor *inpL = + ggml_new_tensor_2d(ctx0, 
GGML_TYPE_F32, model.hparams.n_embd, N); + memcpy(inpL->data, input, N * model.hparams.n_embd * ggml_element_size(inpL)); if (!llama_eval_internal_tensor(*ctx, ctx0, inpL, N, n_past, n_threads, t_start_us)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; From 20d5eef8160d8e0d020bd4167f01b9be99bb19b2 Mon Sep 17 00:00:00 2001 From: ningshanwutuobang Date: Mon, 5 Jun 2023 22:32:36 +0800 Subject: [PATCH 03/17] add examples of input floats --- examples/embd_input/embd_input.h | 28 +++ examples/embd_input/embd_input_lib.cpp | 267 ++++++++++++++++++++++++ examples/embd_input/embd_input_test.cpp | 29 +++ 3 files changed, 324 insertions(+) create mode 100644 examples/embd_input/embd_input.h create mode 100644 examples/embd_input/embd_input_lib.cpp create mode 100644 examples/embd_input/embd_input_test.cpp diff --git a/examples/embd_input/embd_input.h b/examples/embd_input/embd_input.h new file mode 100644 index 0000000000000..f5deb52775044 --- /dev/null +++ b/examples/embd_input/embd_input.h @@ -0,0 +1,28 @@ +#ifndef _EMBD_INPUT_H_ +#define _EMBD_INPUT_H_ 1 + +#include "common.h" +#include "llama.h" +#include "build-info.h" + + +extern "C" { + +typedef struct MyModel { + llama_context* ctx; + gpt_params params; +} MyModel; + + +struct MyModel* create_mymodel(int argc, char ** argv); + +bool eval_float(void* model, float* input, int N); +bool eval_tokens(void* model, std::vector tokens); +bool eval_id(struct MyModel* mymodel, int id); +bool eval_string(struct MyModel* mymodel, const char* str); +const char* sampling(struct MyModel* mymodel); +llama_token sampling_id(struct MyModel* mymodel); + +} + +#endif diff --git a/examples/embd_input/embd_input_lib.cpp b/examples/embd_input/embd_input_lib.cpp new file mode 100644 index 0000000000000..a9edc120e4a28 --- /dev/null +++ b/examples/embd_input/embd_input_lib.cpp @@ -0,0 +1,267 @@ +// Defines sigaction on msys: +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include "embd_input.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) +#include +#include +#elif defined (_WIN32) +#define WIN32_LEAN_AND_MEAN +#define NOMINMAX +#include +#include +#endif + +static console_state con_st; +static llama_context ** g_ctx; + +static bool is_interacting = false; + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) +void sigint_handler(int signo) { + if (signo == SIGINT) { + if (!is_interacting) { + is_interacting=true; + } else { + console_cleanup(con_st); + printf("\n"); + llama_print_timings(*g_ctx); + _exit(130); + } + } +} +#endif + + +extern "C" { + +struct MyModel* create_mymodel(int argc, char ** argv) { + gpt_params params; + + if (gpt_params_parse(argc, argv, params) == false) { + return nullptr; + } + + + if (params.n_ctx > 2048) { + fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);" + "expect poor results\n", __func__, params.n_ctx); + } + + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + + if (params.seed < 0) { + params.seed = time(NULL); + } + + fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.random_prompt) { + params.prompt = gpt_random_prompt(rng); + } + + llama_init_backend(); + + llama_context * ctx; + g_ctx = &ctx; + + // load the model and apply lora adapter, if any + ctx = llama_init_from_gpt_params(params); + if 
(ctx == NULL) { + fprintf(stderr, "%s: error: unable to load model\n", __func__); + return nullptr; + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", + params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); + } + struct MyModel* ret= new MyModel(); + ret->ctx = ctx; + ret->params = params; + // printf("ctx: %d\n", ret->ctx); + return ret; +} + + +bool eval_float(void* model, float* input, int N){ + MyModel* mymodel = (MyModel* )model; + llama_context* ctx = mymodel->ctx; + gpt_params params = mymodel->params; + int n_emb = llama_n_embd(ctx); + int n_past = 0; + for (int i = 0; i < (int) N; i += params.n_batch) { + int n_eval = (int) N - i; + if (n_eval > params.n_batch) { + n_eval = params.n_batch; + } + if (llama_eval_float(ctx, (input+i*n_emb), n_eval, n_past, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return false; + } + n_past += n_eval; + } + return true; +} + + + + + +bool eval_tokens(void* model, std::vector tokens) { + MyModel* mymodel = (MyModel* )model; + // printf("model: %d\n", mymodel); + llama_context* ctx;// = mymodel->ctx; + // printf("ctx2: %d\n", ctx); + // printf("ctx2: %d\n", mymodel->ctx); + ctx = mymodel->ctx; + // printf("ctx2: %d\n", ctx); + gpt_params params = mymodel->params; + // printf("\n%d\n", params); + int n_past = 1; + for (int i = 0; i < (int) tokens.size(); i += params.n_batch) { + int n_eval = (int) tokens.size() - i; + if (n_eval > params.n_batch) { + n_eval = params.n_batch; + } + // printf("%d, %d, %d\n", i, n_eval, n_past); + if (llama_eval(ctx, &tokens[i], n_eval, n_past, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return false; + } + n_past += n_eval; + } + return true; +} + +bool eval_id(struct MyModel* mymodel, int id) { + // printf("%d\n", id); + std::vector tokens; + tokens.push_back(id); + // printf("%d\n", tokens.size()); + // printf("%d\n", tokens[0]); + return eval_tokens(mymodel, tokens); +} + + +bool eval_string(struct MyModel* mymodel,const char* str){ + // std::cout << "eval " << std::endl; + // printf("%s", str); + llama_context* ctx = mymodel->ctx; + std::string str2 = str; + // printf("%s", str2.c_str()); + std::cout << str2 << std::endl; + std::vector embd_inp = ::llama_tokenize(ctx, str2, true); + eval_tokens(mymodel, embd_inp); + return true; +} + + + + +llama_token sampling_id(struct MyModel* mymodel) { + llama_context* ctx = mymodel->ctx; + gpt_params params = mymodel->params; + // int n_ctx = llama_n_ctx(ctx); + + + // out of user input, sample next token + const float temp = params.temp; + const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; + const float top_p = params.top_p; + const float tfs_z = params.tfs_z; + const float typical_p = params.typical_p; + // const int32_t repeat_last_n = params.repeat_last_n < 0 ? 
n_ctx : params.repeat_last_n; + // const float repeat_penalty = params.repeat_penalty; + // const float alpha_presence = params.presence_penalty; + // const float alpha_frequency = params.frequency_penalty; + const int mirostat = params.mirostat; + const float mirostat_tau = params.mirostat_tau; + const float mirostat_eta = params.mirostat_eta; + // const bool penalize_nl = params.penalize_nl; + + llama_token id = 0; + + { + auto logits = llama_get_logits(ctx); + auto n_vocab = llama_n_vocab(ctx); + + // Apply params.logit_bias map + for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { + logits[it->first] += it->second; + } + + std::vector candidates; + candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + + // Apply penalties +// float nl_logit = logits[llama_token_nl()]; +// auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); +// llama_sample_repetition_penalty(ctx, &candidates_p, +// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, +// last_n_repeat, repeat_penalty); +// llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, +// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, +// last_n_repeat, alpha_frequency, alpha_presence); +// if (!penalize_nl) { +// logits[llama_token_nl()] = nl_logit; +// } + + if (temp <= 0) { + // Greedy sampling + id = llama_sample_token_greedy(ctx, &candidates_p); + } else { + if (mirostat == 1) { + static float mirostat_mu = 2.0f * mirostat_tau; + const int mirostat_m = 100; + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); + } else if (mirostat == 2) { + static float mirostat_mu = 2.0f * mirostat_tau; + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); + } else { + // Temperature sampling + llama_sample_top_k(ctx, &candidates_p, top_k, 1); + llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); + llama_sample_typical(ctx, &candidates_p, typical_p, 1); + llama_sample_top_p(ctx, &candidates_p, top_p, 1); + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token(ctx, &candidates_p); + } + } + + } + return id; +} + +const char* sampling(struct MyModel* mymodel) { + llama_context* ctx = mymodel->ctx; + int id = sampling_id(mymodel); + std::string ret = llama_token_to_str(ctx, id); + return ret.c_str(); +} + +} diff --git a/examples/embd_input/embd_input_test.cpp b/examples/embd_input/embd_input_test.cpp new file mode 100644 index 0000000000000..96ce130fde6b2 --- /dev/null +++ b/examples/embd_input/embd_input_test.cpp @@ -0,0 +1,29 @@ +#include "embd_input.h" +#include +#include + +int main(int argc, char** argv) { + + auto mymodel = create_mymodel(argc, argv); + int N = 10; + int n_embd = llama_n_embd(mymodel->ctx); + float* data = new float[N*n_embd]; + std::default_random_engine e; + std::uniform_real_distribution u(0,1); + for (int i=0;iparams.prompt.c_str()); + for (int i=0;i < 500; i++) { + int id = sampling_id(mymodel); + printf("%s", llama_token_to_str(mymodel->ctx, id)); + eval_id(mymodel, id); + } + printf("\n"); + return 0; +} From a91487093bfb4c13bf2aaafce8ab600f06c7cf4d Mon Sep 17 00:00:00 2001 From: 
ningshanwutuobang Date: Tue, 6 Jun 2023 22:06:51 +0800 Subject: [PATCH 04/17] add test example for embd input --- .gitignore | 1 + Makefile | 11 +++++- examples/embd_input/embd_input.h | 1 + examples/embd_input/embd_input.py | 47 +++++++++++++++++++++++++ examples/embd_input/embd_input_lib.cpp | 15 +++++--- examples/embd_input/embd_input_test.cpp | 10 +++--- 6 files changed, 75 insertions(+), 10 deletions(-) create mode 100644 examples/embd_input/embd_input.py diff --git a/.gitignore b/.gitignore index d231f3ff8ed36..88bf142b954c3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ *.o *.a +*.so .DS_Store .build/ .cache/ diff --git a/Makefile b/Makefile index 8e8d426c5d6bf..7685003c26ae9 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Define the default target now so that it is always the first target -BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot +BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot libembd_input.so embd_input_test ifdef LLAMA_BUILD_SERVER BUILD_TARGETS += server @@ -250,6 +250,15 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml. server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) +libembd_input.so: examples/embd_input/embd_input.h examples/embd_input/embd_input_lib.cpp examples/embd_input/embd_input_test.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) --shared $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) + + +embd_input_test: libembd_input.so examples/embd_input/embd_input_test.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.so,$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -Wl,-rpath=./ -lembd_input + + + build-info.h: $(wildcard .git/index) scripts/build-info.sh @sh scripts/build-info.sh > $@.tmp @if ! 
cmp -s $@.tmp $@; then \ diff --git a/examples/embd_input/embd_input.h b/examples/embd_input/embd_input.h index f5deb52775044..f45cee32dac4d 100644 --- a/examples/embd_input/embd_input.h +++ b/examples/embd_input/embd_input.h @@ -11,6 +11,7 @@ extern "C" { typedef struct MyModel { llama_context* ctx; gpt_params params; + int n_past = 0; } MyModel; diff --git a/examples/embd_input/embd_input.py b/examples/embd_input/embd_input.py new file mode 100644 index 0000000000000..6d1abf736e143 --- /dev/null +++ b/examples/embd_input/embd_input.py @@ -0,0 +1,47 @@ +import ctypes +from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int +import numpy as np + +libc = cdll.LoadLibrary("./libembd_input.so") +libc.sampling.restype=c_char_p +libc.create_mymodel.restype=c_void_p +libc.eval_string.argtypes=[c_void_p, c_char_p] +libc.sampling.argtypes=[c_void_p] +libc.eval_float.argtypes=[c_void_p, POINTER(c_float), c_int] + + +class MyModel: + def __init__(self, args): + argc = len(args) + c_str = [c_char_p(i.encode()) for i in args] + args_c = (c_char_p * argc)(*c_str) + self.model = c_void_p(libc.create_mymodel(argc, args_c)) + print("self.model", self.model) + + def eval_float(self, x): + libc.eval_float(self.model, x.astype(np.float32).ctypes.data_as(POINTER(c_float)), x.shape[0]) + + def eval_string(self, x): + libc.eval_string(self.model, x.encode()) # c_char_p(x.encode())) + + def eval_token(self, x): + libc.eval_id(self.model, x) + + def sampling(self): + s = libc.sampling(self.model) + return s + + +model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin"]) +print(model) +model.eval_string("""There is a better way to deal with the formula, """) +# model.eval_token(100) +x = np.random.random((10,5120))# , dtype=np.float32) +# print(x[0,0], x[0,1],x[1,0]) +model.eval_float(x) +print(libc) + +for i in range(100): + print(model.sampling().decode(), end="") + + diff --git a/examples/embd_input/embd_input_lib.cpp b/examples/embd_input/embd_input_lib.cpp index a9edc120e4a28..cb7e5d189e162 100644 --- a/examples/embd_input/embd_input_lib.cpp +++ b/examples/embd_input/embd_input_lib.cpp @@ -96,6 +96,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) { struct MyModel* ret= new MyModel(); ret->ctx = ctx; ret->params = params; + ret->n_past = 0; // printf("ctx: %d\n", ret->ctx); return ret; } @@ -106,11 +107,13 @@ bool eval_float(void* model, float* input, int N){ llama_context* ctx = mymodel->ctx; gpt_params params = mymodel->params; int n_emb = llama_n_embd(ctx); - int n_past = 0; - for (int i = 0; i < (int) N; i += params.n_batch) { + int n_past = mymodel->n_past; + // printf("%f,%f\n", *input, *(input+1)); + int n_batch = N; // params.n_batch; + for (int i = 0; i < (int) N; i += n_batch) { int n_eval = (int) N - i; - if (n_eval > params.n_batch) { - n_eval = params.n_batch; + if (n_eval > n_batch) { + n_eval = n_batch; } if (llama_eval_float(ctx, (input+i*n_emb), n_eval, n_past, params.n_threads)) { fprintf(stderr, "%s : failed to eval\n", __func__); @@ -118,6 +121,7 @@ bool eval_float(void* model, float* input, int N){ } n_past += n_eval; } + mymodel->n_past = n_past; return true; } @@ -135,7 +139,7 @@ bool eval_tokens(void* model, std::vector tokens) { // printf("ctx2: %d\n", ctx); gpt_params params = mymodel->params; // printf("\n%d\n", params); - int n_past = 1; + int n_past = mymodel->n_past; for (int i = 0; i < (int) tokens.size(); i += params.n_batch) { int n_eval = (int) tokens.size() - i; if (n_eval > params.n_batch) { @@ -148,6 +152,7 @@ bool 
eval_tokens(void* model, std::vector tokens) { } n_past += n_eval; } + mymodel->n_past = n_past; return true; } diff --git a/examples/embd_input/embd_input_test.cpp b/examples/embd_input/embd_input_test.cpp index 96ce130fde6b2..7cd094e352f66 100644 --- a/examples/embd_input/embd_input_test.cpp +++ b/examples/embd_input/embd_input_test.cpp @@ -14,14 +14,16 @@ int main(int argc, char** argv) { data[i] = u(e); } - eval_string(mymodel, "111"); - printf("eval float"); + eval_string(mymodel, "user: what is the color of the flag of UN?"); + // printf("eval float"); eval_float(mymodel, data, N); - printf("eval float end\n"); + eval_string(mymodel, "assistant:"); + // printf("eval float end\n"); eval_string(mymodel, mymodel->params.prompt.c_str()); - for (int i=0;i < 500; i++) { + for (int i=0;i < 50; i++) { int id = sampling_id(mymodel); printf("%s", llama_token_to_str(mymodel->ctx, id)); + fflush(stdout); eval_id(mymodel, id); } printf("\n"); From 9c6117cd8df3efcf1631a09b311cb1696c3b87a5 Mon Sep 17 00:00:00 2001 From: ningshanwutuobang Date: Tue, 6 Jun 2023 22:29:34 +0800 Subject: [PATCH 05/17] fixed sampling --- .gitignore | 1 + examples/embd_input/embd_input.py | 23 +++++++++++------------ examples/embd_input/embd_input_lib.cpp | 1 + examples/embd_input/embd_input_test.cpp | 6 +++--- 4 files changed, 16 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index 88bf142b954c3..35c77554ea8c0 100644 --- a/.gitignore +++ b/.gitignore @@ -34,6 +34,7 @@ models/* /benchmark-matmult /vdot /Pipfile +/embd_input_test build-info.h arm_neon.h diff --git a/examples/embd_input/embd_input.py b/examples/embd_input/embd_input.py index 6d1abf736e143..742bd60884094 100644 --- a/examples/embd_input/embd_input.py +++ b/examples/embd_input/embd_input.py @@ -16,7 +16,7 @@ def __init__(self, args): c_str = [c_char_p(i.encode()) for i in args] args_c = (c_char_p * argc)(*c_str) self.model = c_void_p(libc.create_mymodel(argc, args_c)) - print("self.model", self.model) +# print("self.model", self.model) def eval_float(self, x): libc.eval_float(self.model, x.astype(np.float32).ctypes.data_as(POINTER(c_float)), x.shape[0]) @@ -31,17 +31,16 @@ def sampling(self): s = libc.sampling(self.model) return s - -model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin"]) -print(model) -model.eval_string("""There is a better way to deal with the formula, """) +model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"]) +# print(model) +model.eval_string("""user: what is the color of the flag of UN?""") # model.eval_token(100) -x = np.random.random((10,5120))# , dtype=np.float32) -# print(x[0,0], x[0,1],x[1,0]) +x = np.random.random((10, 5120))# , dtype=np.float32) model.eval_float(x) -print(libc) - -for i in range(100): - print(model.sampling().decode(), end="") - +model.eval_string("""assistant:""") +# print(x[0,0], x[0,1],x[1,0]) +# model.eval_float(x) +# print(libc) +for i in range(50): + print(model.sampling().decode(), end="", flush=True) diff --git a/examples/embd_input/embd_input_lib.cpp b/examples/embd_input/embd_input_lib.cpp index cb7e5d189e162..5cbc81709049f 100644 --- a/examples/embd_input/embd_input_lib.cpp +++ b/examples/embd_input/embd_input_lib.cpp @@ -266,6 +266,7 @@ const char* sampling(struct MyModel* mymodel) { llama_context* ctx = mymodel->ctx; int id = sampling_id(mymodel); std::string ret = llama_token_to_str(ctx, id); + eval_id(mymodel, id); return ret.c_str(); } diff --git a/examples/embd_input/embd_input_test.cpp 
b/examples/embd_input/embd_input_test.cpp index 7cd094e352f66..3d86f03d7900d 100644 --- a/examples/embd_input/embd_input_test.cpp +++ b/examples/embd_input/embd_input_test.cpp @@ -21,10 +21,10 @@ int main(int argc, char** argv) { // printf("eval float end\n"); eval_string(mymodel, mymodel->params.prompt.c_str()); for (int i=0;i < 50; i++) { - int id = sampling_id(mymodel); - printf("%s", llama_token_to_str(mymodel->ctx, id)); + // int id = sampling_id(mymodel); + printf("%s", sampling(mymodel)); // llama_token_to_str(mymodel->ctx, id)); fflush(stdout); - eval_id(mymodel, id); + // eval_id(mymodel, id); } printf("\n"); return 0; From ba1f617d7d6d81cf97760489e1df8487ca5bc277 Mon Sep 17 00:00:00 2001 From: ningshanwutuobang Date: Tue, 6 Jun 2023 22:57:04 +0800 Subject: [PATCH 06/17] add free for context --- examples/embd_input/embd_input.h | 1 + examples/embd_input/embd_input.py | 2 ++ examples/embd_input/embd_input_lib.cpp | 7 +++++++ examples/embd_input/embd_input_test.cpp | 1 + 4 files changed, 11 insertions(+) diff --git a/examples/embd_input/embd_input.h b/examples/embd_input/embd_input.h index f45cee32dac4d..4fefabd425c76 100644 --- a/examples/embd_input/embd_input.h +++ b/examples/embd_input/embd_input.h @@ -23,6 +23,7 @@ bool eval_id(struct MyModel* mymodel, int id); bool eval_string(struct MyModel* mymodel, const char* str); const char* sampling(struct MyModel* mymodel); llama_token sampling_id(struct MyModel* mymodel); +void free_mymodel(struct MyModel* mymodel); } diff --git a/examples/embd_input/embd_input.py b/examples/embd_input/embd_input.py index 742bd60884094..d4831d46abe49 100644 --- a/examples/embd_input/embd_input.py +++ b/examples/embd_input/embd_input.py @@ -17,6 +17,8 @@ def __init__(self, args): args_c = (c_char_p * argc)(*c_str) self.model = c_void_p(libc.create_mymodel(argc, args_c)) # print("self.model", self.model) + def __del__(self): + libc.free_mymodel(self.model) def eval_float(self, x): libc.eval_float(self.model, x.astype(np.float32).ctypes.data_as(POINTER(c_float)), x.shape[0]) diff --git a/examples/embd_input/embd_input_lib.cpp b/examples/embd_input/embd_input_lib.cpp index 5cbc81709049f..7ad98dcb9e3d0 100644 --- a/examples/embd_input/embd_input_lib.cpp +++ b/examples/embd_input/embd_input_lib.cpp @@ -101,6 +101,13 @@ struct MyModel* create_mymodel(int argc, char ** argv) { return ret; } +void free_mymodel(struct MyModel* mymodel) { + llama_context* ctx = mymodel->ctx; + llama_print_timings(ctx); + llama_free(ctx); + delete mymodel; +} + bool eval_float(void* model, float* input, int N){ MyModel* mymodel = (MyModel* )model; diff --git a/examples/embd_input/embd_input_test.cpp b/examples/embd_input/embd_input_test.cpp index 3d86f03d7900d..94287e37fdd25 100644 --- a/examples/embd_input/embd_input_test.cpp +++ b/examples/embd_input/embd_input_test.cpp @@ -27,5 +27,6 @@ int main(int argc, char** argv) { // eval_id(mymodel, id); } printf("\n"); + free_mymodel(mymodel); return 0; } From 6ed4893391ec3135e59b8e7c726fd4847fbd1a18 Mon Sep 17 00:00:00 2001 From: ningshanwutuobang Date: Wed, 7 Jun 2023 23:44:54 +0800 Subject: [PATCH 07/17] fixed add end condition for generating --- examples/embd_input/embd_input.py | 30 ++++++++++++++----------- examples/embd_input/embd_input_test.cpp | 8 +++++-- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/examples/embd_input/embd_input.py b/examples/embd_input/embd_input.py index d4831d46abe49..ebce1bb4562d7 100644 --- a/examples/embd_input/embd_input.py +++ b/examples/embd_input/embd_input.py @@ -33,16 +33,20 @@ 
def sampling(self): s = libc.sampling(self.model) return s -model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"]) -# print(model) -model.eval_string("""user: what is the color of the flag of UN?""") -# model.eval_token(100) -x = np.random.random((10, 5120))# , dtype=np.float32) -model.eval_float(x) -model.eval_string("""assistant:""") -# print(x[0,0], x[0,1],x[1,0]) -# model.eval_float(x) -# print(libc) - -for i in range(50): - print(model.sampling().decode(), end="", flush=True) +if __name__ == "__main__": + model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"]) + # print(model) + model.eval_string("""user: what is the color of the flag of UN?""") + # model.eval_token(100) + x = np.random.random((10, 5120))# , dtype=np.float32) + model.eval_float(x) + model.eval_string("""assistant:""") + # print(x[0,0], x[0,1],x[1,0]) + # model.eval_float(x) + # print(libc) + + for i in range(500): + tmp = model.sampling().decode() + if tmp == "": + break + print(tmp, end="", flush=True) diff --git a/examples/embd_input/embd_input_test.cpp b/examples/embd_input/embd_input_test.cpp index 94287e37fdd25..d83febeb23371 100644 --- a/examples/embd_input/embd_input_test.cpp +++ b/examples/embd_input/embd_input_test.cpp @@ -1,6 +1,7 @@ #include "embd_input.h" #include #include +#include int main(int argc, char** argv) { @@ -20,9 +21,12 @@ int main(int argc, char** argv) { eval_string(mymodel, "assistant:"); // printf("eval float end\n"); eval_string(mymodel, mymodel->params.prompt.c_str()); - for (int i=0;i < 50; i++) { + const char* tmp; + for (int i=0;i < 500; i++) { // int id = sampling_id(mymodel); - printf("%s", sampling(mymodel)); // llama_token_to_str(mymodel->ctx, id)); + tmp = sampling(mymodel); + if (strlen(tmp) == 0) break; + printf("%s", tmp); // llama_token_to_str(mymodel->ctx, id)); fflush(stdout); // eval_id(mymodel, id); } From 8cea3ab9e56c3ddda4e502060daaaf35060cacee Mon Sep 17 00:00:00 2001 From: ningshanwutuobang Date: Thu, 8 Jun 2023 04:50:31 +0800 Subject: [PATCH 08/17] add examples for llava.py --- examples/embd_input/embd_input.py | 4 +-- examples/embd_input/llava.py | 46 +++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 2 deletions(-) create mode 100644 examples/embd_input/llava.py diff --git a/examples/embd_input/embd_input.py b/examples/embd_input/embd_input.py index ebce1bb4562d7..db5cd0fdb528e 100644 --- a/examples/embd_input/embd_input.py +++ b/examples/embd_input/embd_input.py @@ -21,7 +21,7 @@ def __del__(self): libc.free_mymodel(self.model) def eval_float(self, x): - libc.eval_float(self.model, x.astype(np.float32).ctypes.data_as(POINTER(c_float)), x.shape[0]) + libc.eval_float(self.model, x.astype(np.float32).ctypes.data_as(POINTER(c_float)), x.shape[1]) def eval_string(self, x): libc.eval_string(self.model, x.encode()) # c_char_p(x.encode())) @@ -38,7 +38,7 @@ def sampling(self): # print(model) model.eval_string("""user: what is the color of the flag of UN?""") # model.eval_token(100) - x = np.random.random((10, 5120))# , dtype=np.float32) + x = np.random.random((5120,10))# , dtype=np.float32) model.eval_float(x) model.eval_string("""assistant:""") # print(x[0,0], x[0,1],x[1,0]) diff --git a/examples/embd_input/llava.py b/examples/embd_input/llava.py new file mode 100644 index 0000000000000..914c7ef00b762 --- /dev/null +++ b/examples/embd_input/llava.py @@ -0,0 +1,46 @@ +import sys +import os +sys.path.insert(0, os.path.dirname(__file__)) +from embd_input import MyModel +import 
numpy as np +from torch import nn +import torch +from transformers import CLIPVisionModel, CLIPImageProcessor +from PIL import Image +vision_tower = "openai/clip-vit-large-patch14" + +class Llava: + def __init__(self): + self.image_processor = CLIPImageProcessor.from_pretrained(vision_tower) + self.vision_tower = CLIPVisionModel.from_pretrained(vision_tower) + self.mm_projector = nn.Linear(1024, 5120) + self.model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"]) + + def chat_with_image(self, image, question): + with torch.no_grad(): + embd_image = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] + image_forward_out = self.vision_tower(embd_image.unsqueeze(0), output_hidden_states=True) + select_hidden_state_layer = -2 + select_hidden_state = image_forward_out.hidden_states[select_hidden_state_layer] + image_feature = select_hidden_state[:, 1:] + embd_image = self.mm_projector(image_feature) + embd_image = embd_image.cpu().numpy() + self.model.eval_string("user: ") + # print(embd_image.shape) + self.model.eval_float(embd_image.T) + self.model.eval_string(question) + self.model.eval_string("\nassistant: ") + ret = "" + for _ in range(500): + tmp = self.model.sampling().decode() + if tmp == "": + break + ret += tmp + return ret + +a = Llava() +state = torch.load(os.path.dirname(__file__) + "/a.pth") +a.mm_projector.load_state_dict({"weight": state["model.mm_projector.weight"], "bias": state["model.mm_projector.bias"]}) +print(a.chat_with_image(Image.open("./media/llama1-logo.png").convert('RGB'), "what is the text in the picture?")) + + From 4f1aa3cc763f99f44421157716bf00a0d0de1ade Mon Sep 17 00:00:00 2001 From: ningshanwutuobang Date: Sat, 17 Jun 2023 16:41:37 +0800 Subject: [PATCH 09/17] add READMD for llava.py --- examples/embd_input/README.md | 20 +++++++++ examples/embd_input/embd_input_lib.cpp | 5 ++- examples/embd_input/embd_input_test.cpp | 2 +- examples/embd_input/llava.py | 58 +++++++++++++++++++------ 4 files changed, 70 insertions(+), 15 deletions(-) create mode 100644 examples/embd_input/README.md diff --git a/examples/embd_input/README.md b/examples/embd_input/README.md new file mode 100644 index 0000000000000..1d23d086d00e3 --- /dev/null +++ b/examples/embd_input/README.md @@ -0,0 +1,20 @@ +### Examples for input embedding directly + +## LLAVA example (llava.py) + +1. obtian llava model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/) +2. convert it to ggml format +3. 
llava_projection.pth is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin) + +``` +import torch + +bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin" +pth_path = "./examples/embd_input/llava_projection.pth" + +dic = torch.load(bin_path) +used_key = ["model.mm_projector.weight","model.mm_projector.bias"] +torch.save({k: dic[k] for k in used_key}, pth_path) +``` + + diff --git a/examples/embd_input/embd_input_lib.cpp b/examples/embd_input/embd_input_lib.cpp index 7ad98dcb9e3d0..bbdf6d645b17f 100644 --- a/examples/embd_input/embd_input_lib.cpp +++ b/examples/embd_input/embd_input_lib.cpp @@ -272,7 +272,10 @@ llama_token sampling_id(struct MyModel* mymodel) { const char* sampling(struct MyModel* mymodel) { llama_context* ctx = mymodel->ctx; int id = sampling_id(mymodel); - std::string ret = llama_token_to_str(ctx, id); + + std::string ret; + if (id == llama_token_eos()) ret = ""; + else ret = llama_token_to_str(ctx, id); eval_id(mymodel, id); return ret.c_str(); } diff --git a/examples/embd_input/embd_input_test.cpp b/examples/embd_input/embd_input_test.cpp index d83febeb23371..e14141497c35d 100644 --- a/examples/embd_input/embd_input_test.cpp +++ b/examples/embd_input/embd_input_test.cpp @@ -25,7 +25,7 @@ int main(int argc, char** argv) { for (int i=0;i < 500; i++) { // int id = sampling_id(mymodel); tmp = sampling(mymodel); - if (strlen(tmp) == 0) break; + if (strcmp(tmp, "")==0) break; printf("%s", tmp); // llama_token_to_str(mymodel->ctx, id)); fflush(stdout); // eval_id(mymodel, id); diff --git a/examples/embd_input/llava.py b/examples/embd_input/llava.py index 914c7ef00b762..8489f792795a8 100644 --- a/examples/embd_input/llava.py +++ b/examples/embd_input/llava.py @@ -7,40 +7,72 @@ import torch from transformers import CLIPVisionModel, CLIPImageProcessor from PIL import Image + +# model parameters from 'liuhaotian/LLaVA-13b-delta-v1-1' vision_tower = "openai/clip-vit-large-patch14" +select_hidden_state_layer = -2 +# (vision_config.image_size // vision_config.patch_size) ** 2 +image_token_len = (224//14)**2 class Llava: - def __init__(self): + def __init__(self, args): self.image_processor = CLIPImageProcessor.from_pretrained(vision_tower) self.vision_tower = CLIPVisionModel.from_pretrained(vision_tower) self.mm_projector = nn.Linear(1024, 5120) - self.model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"]) + self.model = MyModel(["main", *args]) + + def load_projection(self, path): + state = torch.load(path) + self.mm_projector.load_state_dict({ + "weight": state["model.mm_projector.weight"], + "bias": state["model.mm_projector.bias"]}) + + def chat(self, question): + self.model.eval_string("user: ") + self.model.eval_string(question) + self.model.eval_string("\nassistant: ") + return self.sampling() def chat_with_image(self, image, question): with torch.no_grad(): embd_image = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] image_forward_out = self.vision_tower(embd_image.unsqueeze(0), output_hidden_states=True) - select_hidden_state_layer = -2 select_hidden_state = image_forward_out.hidden_states[select_hidden_state_layer] image_feature = select_hidden_state[:, 1:] embd_image = self.mm_projector(image_feature) - embd_image = embd_image.cpu().numpy() + embd_image = embd_image.cpu().numpy()[0] self.model.eval_string("user: ") - # print(embd_image.shape) + self.model.eval_token(32003-2) # im_start 
self.model.eval_float(embd_image.T) + for i in range(image_token_len-embd_image.shape[0]): + self.model.eval_token(32003-3) # im_patch + self.model.eval_token(32003-1) # im_end self.model.eval_string(question) self.model.eval_string("\nassistant: ") - ret = "" + return self.sampling() + + def sampling(self): + ret = b"" for _ in range(500): - tmp = self.model.sampling().decode() - if tmp == "": + tmp = self.model.sampling() # .decode() + if tmp == b"": break ret += tmp - return ret + return ret.decode() + +if __name__=="__main__": + # model form liuhaotian/LLaVA-13b-delta-v1-1 + a = Llava(["--model", "./models/ggml-llava-13b-v1.1.bin", "-c", "2048"]) + # Extract from https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin. + # Also here can use pytorch_model-00003-of-00003.bin directly. + a.load_projection(os.path.join( + os.path.dirname(__file__) , + "llava_projetion.pth")) + respose = a.chat_with_image( + Image.open("./media/llama1-logo.png").convert('RGB'), + "what is the text in the picture?") + print(respose) + print(a.chat("what is the color of it?")) -a = Llava() -state = torch.load(os.path.dirname(__file__) + "/a.pth") -a.mm_projector.load_state_dict({"weight": state["model.mm_projector.weight"], "bias": state["model.mm_projector.bias"]}) -print(a.chat_with_image(Image.open("./media/llama1-logo.png").convert('RGB'), "what is the text in the picture?")) From 93c57a057175951182b2646ee2b842294fe8e6c5 Mon Sep 17 00:00:00 2001 From: ningshanwutuobang Date: Sat, 17 Jun 2023 16:43:36 +0800 Subject: [PATCH 10/17] add READMD for llava.py --- examples/embd_input/README.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/embd_input/README.md b/examples/embd_input/README.md index 1d23d086d00e3..56db072cf695d 100644 --- a/examples/embd_input/README.md +++ b/examples/embd_input/README.md @@ -3,8 +3,12 @@ ## LLAVA example (llava.py) 1. obtian llava model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/) -2. convert it to ggml format -3. llava_projection.pth is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin) +2. build `libembd_input.so` +``` +make +``` +3. convert it to ggml format +4. 
llava_projection.pth is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin) ``` import torch @@ -17,4 +21,3 @@ used_key = ["model.mm_projector.weight","model.mm_projector.bias"] torch.save({k: dic[k] for k in used_key}, pth_path) ``` - From 53dfbbf553e16264a41f71d23d166bc2f79e323b Mon Sep 17 00:00:00 2001 From: ningshanwutuobang Date: Tue, 20 Jun 2023 22:57:21 +0800 Subject: [PATCH 11/17] add example of PandaGPT --- convert-lora-to-ggml.py | 6 +- examples/embd_input/README.md | 36 +++++++++-- examples/embd_input/embd_input.py | 29 +++++++++ examples/embd_input/llava.py | 12 +--- examples/embd_input/panda_gpt.py | 100 ++++++++++++++++++++++++++++++ 5 files changed, 166 insertions(+), 17 deletions(-) create mode 100644 examples/embd_input/panda_gpt.py diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py index 9090e8d6dd55a..f43c836f577a6 100644 --- a/convert-lora-to-ggml.py +++ b/convert-lora-to-ggml.py @@ -113,6 +113,10 @@ def write_tensor_header( write_file_header(fout, params) for k, v in model.items(): + if k.endswith(".default.weight"): + k = k.replace(".default.weight", ".weight") + if k in ["llama_proj.weight", "llama_proj.bias"]: + continue if k.endswith("lora_A.weight"): if v.dtype != torch.float16 and v.dtype != torch.float32: v = v.float() @@ -120,7 +124,7 @@ def write_tensor_header( else: v = v.float() - t = v.numpy() + t = v.detach().numpy() tname = translate_tensor_name(k) print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB") write_tensor_header(fout, tname, t.shape, t.dtype) diff --git a/examples/embd_input/README.md b/examples/embd_input/README.md index 56db072cf695d..c180d541aa6c4 100644 --- a/examples/embd_input/README.md +++ b/examples/embd_input/README.md @@ -1,14 +1,17 @@ ### Examples for input embedding directly -## LLAVA example (llava.py) - -1. obtian llava model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/) -2. build `libembd_input.so` +## Requirement +build `libembd_input.so` +run the following comman in main dir (../../). ``` make ``` -3. convert it to ggml format -4. llava_projection.pth is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin) + +## LLAVA example (llava.py) + +1. obtian llava model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/) +2. convert it to ggml format +3. llava_projection.pth is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin) ``` import torch @@ -21,3 +24,24 @@ used_key = ["model.mm_projector.weight","model.mm_projector.bias"] torch.save({k: dic[k] for k in used_key}, pth_path) ``` +## PandaGPT example (panda_gpt.py) + +1. Obtian PandaGPT lora model. Rename the file to `adapter_model.bin`. Use [convert-lora-to-ggml.py](../../convert-lora-to-ggml.py) to convert it to ggml format. +The `adapter_config.json` is +``` +{ + "peft_type": "LORA", + "fan_in_fan_out": false, + "bias": null, + "modules_to_save": null, + "r": 32, + "lora_alpha": 32, + "lora_dropout": 0.1, + "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"] +} +``` +2. papare the `vicuna` v0 model. +3. obtain the [ImageBind](https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth) model. +4. Clone the PandaGPT source. +5. 
check the path of PandaGPT source, ImageBind model, lora model and vicuna model in panda_gpt.py. + diff --git a/examples/embd_input/embd_input.py b/examples/embd_input/embd_input.py index db5cd0fdb528e..ce057a89d504d 100644 --- a/examples/embd_input/embd_input.py +++ b/examples/embd_input/embd_input.py @@ -33,6 +33,35 @@ def sampling(self): s = libc.sampling(self.model) return s + def generate(self, end=""): + ret = b"" + end = end.encode() + for _ in range(500): + tmp = self.sampling() # .decode() + if (ret+tmp).endswith(end): + break + ret += tmp + return ret.decode() + + def stream_generate(self, end=""): + ret = b"" + end = end.encode() + head = b"" + for _ in range(500): + tmp = self.sampling() # .decode() + ret += tmp + try: + text = (head + tmp).decode() + print(text, end="") + head = b"" + except: + head += text + if ret.endswith(end): + break + print("") + return ret.decode() + + if __name__ == "__main__": model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"]) # print(model) diff --git a/examples/embd_input/llava.py b/examples/embd_input/llava.py index 8489f792795a8..a1efaddf654c4 100644 --- a/examples/embd_input/llava.py +++ b/examples/embd_input/llava.py @@ -31,7 +31,7 @@ def chat(self, question): self.model.eval_string("user: ") self.model.eval_string(question) self.model.eval_string("\nassistant: ") - return self.sampling() + return self.model.generate() def chat_with_image(self, image, question): with torch.no_grad(): @@ -49,16 +49,8 @@ def chat_with_image(self, image, question): self.model.eval_token(32003-1) # im_end self.model.eval_string(question) self.model.eval_string("\nassistant: ") - return self.sampling() + return self.model.generate() - def sampling(self): - ret = b"" - for _ in range(500): - tmp = self.model.sampling() # .decode() - if tmp == b"": - break - ret += tmp - return ret.decode() if __name__=="__main__": # model form liuhaotian/LLaVA-13b-delta-v1-1 diff --git a/examples/embd_input/panda_gpt.py b/examples/embd_input/panda_gpt.py new file mode 100644 index 0000000000000..b1199b95df7e5 --- /dev/null +++ b/examples/embd_input/panda_gpt.py @@ -0,0 +1,100 @@ +import sys +import os +sys.path.insert(0, os.path.dirname(__file__)) +from embd_input import MyModel +import numpy as np +from torch import nn +import torch + +# use PandaGPT path +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "PandaGPT","code","model")) +from ImageBind.models import imagebind_model +from ImageBind import data + +imagebind_ckpt_path = "./models/panda_gpt/" +ModalityType = imagebind_model.ModalityType +max_tgt_len = 400 + +class PandaGPT: + def __init__(self, args): + self.visual_encoder,_ = imagebind_model.imagebind_huge(pretrained=True, store_path=imagebind_ckpt_path) + self.visual_encoder.eval() + self.llama_proj = nn.Linear(1024, 5120) # self.visual_hidden_size, 5120) + self.max_tgt_len = max_tgt_len + self.model = MyModel(["main", *args]) + self.generated_text = "" + self.device = "cpu" + + def load_projection(self, path): + state = torch.load(path, map_location="cpu") + self.llama_proj.load_state_dict({ + "weight": state["llama_proj.weight"], + "bias": state["llama_proj.bias"]}) + + def chat(self, question): + if self.generated_text == "": + self.model.eval_string("###") + self.model.eval_string(" Human: ") + self.model.eval_string(question) + self.model.eval_string("\n### Assistant:") + ret = self.model.stream_generate(end="###") + self.generated_text += ret + return ret + + def chat_with_image(self, inputs, question): + if 
self.generated_text == "": + self.model.eval_string("###") + self.model.eval_string(" Human: ") + embds = self.extract_multimoal_feature(inputs) + for i in embds: + self.model.eval_float(i.T) + self.model.eval_string(" " + question + "\n### Assistant:") + ret = self.model.stream_generate(end="###") + self.generated_text += ret + return ret + + def extract_multimoal_feature(self, inputs): + features = [] + for key in ["image", "audio", "video", "thermal"]: + if key + "_paths" in inputs: + embeds = self.encode_data(key, inputs[key+"_paths"]) + features.append(embeds) + return features + + def encode_data(self, data_type, data_paths): + + type_map = { + "image": ModalityType.VISION, + "audio": ModalityType.AUDIO, + "video": ModalityType.VISION, + "thermal": ModalityType.THERMAL, + } + load_map = { + "image": data.load_and_transform_vision_data, + "audio": data.load_and_transform_audio_data, + "video": data.load_and_transform_video_data, + "thermal": data.load_and_transform_thermal_data + } + + load_function = load_map[data_type] + key = type_map[data_type] + + inputs = {key: load_function(data_paths, self.device)} + with torch.no_grad(): + embeddings = self.visual_encoder(inputs) + embeds = embeddings[key] + embeds = self.llama_proj(embeds).cpu().numpy() + return embeds + + +if __name__=="__main__": + # model form liuhaotian/LLaVA-13b-delta-v1-1 + a = PandaGPT(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048", "--lora", "./models/panda_gpt/ggml-adapter-model.bin","--temp", "0"]) + # Extract from https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin. + # Also here can use pytorch_model-00003-of-00003.bin directly. + a.load_projection("./models/panda_gpt/adapter_model.bin") + a.chat_with_image( + {"image_paths": ["./media/llama1-logo.png"]}, + "what is the text in the picture? 
'llama' or 'lambda'?") + a.chat("what is the color of it?") + From 9d866118c4169fecf052d9b541f963176cfacc91 Mon Sep 17 00:00:00 2001 From: ningshanwutuobang Date: Sun, 25 Jun 2023 01:21:54 +0800 Subject: [PATCH 12/17] refactor the interface and fixed the styles --- Makefile | 10 +- examples/{embd_input => embd-input}/README.md | 2 +- examples/embd-input/embd-input-lib.cpp | 218 ++++++++++++++ .../embd-input-test.cpp} | 15 +- .../embd_input.h => embd-input/embd-input.h} | 0 .../{embd_input => embd-input}/embd_input.py | 56 ++-- examples/{embd_input => embd-input}/llava.py | 8 +- .../{embd_input => embd-input}/panda_gpt.py | 36 ++- examples/embd_input/embd_input_lib.cpp | 283 ------------------ llama.cpp | 141 ++++----- llama.h | 4 +- 11 files changed, 333 insertions(+), 440 deletions(-) rename examples/{embd_input => embd-input}/README.md (98%) create mode 100644 examples/embd-input/embd-input-lib.cpp rename examples/{embd_input/embd_input_test.cpp => embd-input/embd-input-test.cpp} (71%) rename examples/{embd_input/embd_input.h => embd-input/embd-input.h} (100%) rename examples/{embd_input => embd-input}/embd_input.py (64%) rename examples/{embd_input => embd-input}/llava.py (94%) rename examples/{embd_input => embd-input}/panda_gpt.py (81%) delete mode 100644 examples/embd_input/embd_input_lib.cpp diff --git a/Makefile b/Makefile index 41fb6d7c76ca1..2fc4424471600 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Define the default target now so that it is always the first target -BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple libembd_input.so embd_input_test +BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple libembdinput.so embd-input-test ifdef LLAMA_BUILD_SERVER BUILD_TARGETS += server @@ -302,12 +302,12 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml. server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) -libembd_input.so: examples/embd_input/embd_input.h examples/embd_input/embd_input_lib.cpp examples/embd_input/embd_input_test.cpp build-info.h ggml.o llama.o common.o $(OBJS) - $(CXX) --shared $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) +libembdinput.so: examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) -embd_input_test: libembd_input.so examples/embd_input/embd_input_test.cpp build-info.h ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.so,$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -Wl,-rpath=./ -lembd_input +embd-input-test: libembdinput.so examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.so,$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. 
-lembdinput train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) diff --git a/examples/embd_input/README.md b/examples/embd-input/README.md similarity index 98% rename from examples/embd_input/README.md rename to examples/embd-input/README.md index c180d541aa6c4..eb1095f24fe96 100644 --- a/examples/embd_input/README.md +++ b/examples/embd-input/README.md @@ -1,6 +1,6 @@ ### Examples for input embedding directly -## Requirement +## Requirement build `libembd_input.so` run the following comman in main dir (../../). ``` diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp new file mode 100644 index 0000000000000..37a5b52086ccf --- /dev/null +++ b/examples/embd-input/embd-input-lib.cpp @@ -0,0 +1,218 @@ +// Defines sigaction on msys: +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include "embd-input.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static llama_context ** g_ctx; + +extern "C" { + +struct MyModel* create_mymodel(int argc, char ** argv) { + gpt_params params; + + if (gpt_params_parse(argc, argv, params) == false) { + return nullptr; + } + + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + + if (params.seed < 0) { + params.seed = time(NULL); + } + fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); + + llama_init_backend(); + + llama_context * ctx; + g_ctx = &ctx; + + // load the model and apply lora adapter, if any + ctx = llama_init_from_gpt_params(params); + if (ctx == NULL) { + fprintf(stderr, "%s: error: unable to load model\n", __func__); + return nullptr; + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", + params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); + } + struct MyModel * ret = new MyModel(); + ret->ctx = ctx; + ret->params = params; + ret->n_past = 0; + // printf("ctx: %d\n", ret->ctx); + return ret; +} + +void free_mymodel(struct MyModel * mymodel) { + llama_context * ctx = mymodel->ctx; + llama_print_timings(ctx); + llama_free(ctx); + delete mymodel; +} + + +bool eval_float(void * model, float * input, int N){ + MyModel * mymodel = (MyModel*)model; + llama_context * ctx = mymodel->ctx; + gpt_params params = mymodel->params; + int n_emb = llama_n_embd(ctx); + int n_past = mymodel->n_past; + int n_batch = N; // params.n_batch; + + for (int i = 0; i < (int) N; i += n_batch) { + int n_eval = (int) N - i; + if (n_eval > n_batch) { + n_eval = n_batch; + } + if (llama_eval_embd(ctx, (input+i*n_emb), n_eval, n_past, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return false; + } + n_past += n_eval; + } + mymodel->n_past = n_past; + return true; +} + +bool eval_tokens(void * model, std::vector tokens) { + MyModel * mymodel = (MyModel* )model; + llama_context * ctx; + ctx = mymodel->ctx; + gpt_params params = mymodel->params; + int n_past = mymodel->n_past; + for (int i = 0; i < (int) tokens.size(); i += params.n_batch) { + int n_eval = (int) tokens.size() - i; + if (n_eval > params.n_batch) { + n_eval = params.n_batch; + } + if (llama_eval(ctx, &tokens[i], n_eval, n_past, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return false; + } + n_past += n_eval; + } + mymodel->n_past = n_past; + return true; +} + +bool eval_id(struct MyModel* 
mymodel, int id) { + std::vector tokens; + tokens.push_back(id); + return eval_tokens(mymodel, tokens); +} + +bool eval_string(struct MyModel * mymodel,const char* str){ + llama_context * ctx = mymodel->ctx; + std::string str2 = str; + std::vector embd_inp = ::llama_tokenize(ctx, str2, true); + eval_tokens(mymodel, embd_inp); + return true; +} + +llama_token sampling_id(struct MyModel* mymodel) { + llama_context* ctx = mymodel->ctx; + gpt_params params = mymodel->params; + // int n_ctx = llama_n_ctx(ctx); + + // out of user input, sample next token + const float temp = params.temp; + const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; + const float top_p = params.top_p; + const float tfs_z = params.tfs_z; + const float typical_p = params.typical_p; + // const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n; + // const float repeat_penalty = params.repeat_penalty; + // const float alpha_presence = params.presence_penalty; + // const float alpha_frequency = params.frequency_penalty; + const int mirostat = params.mirostat; + const float mirostat_tau = params.mirostat_tau; + const float mirostat_eta = params.mirostat_eta; + // const bool penalize_nl = params.penalize_nl; + + llama_token id = 0; + { + auto logits = llama_get_logits(ctx); + auto n_vocab = llama_n_vocab(ctx); + + // Apply params.logit_bias map + for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { + logits[it->first] += it->second; + } + + std::vector candidates; + candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + + // TODO: Apply penalties + // float nl_logit = logits[llama_token_nl()]; + // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); + // llama_sample_repetition_penalty(ctx, &candidates_p, + // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + // last_n_repeat, repeat_penalty); + // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, + // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + // last_n_repeat, alpha_frequency, alpha_presence); + // if (!penalize_nl) { + // logits[llama_token_nl()] = nl_logit; + // } + + if (temp <= 0) { + // Greedy sampling + id = llama_sample_token_greedy(ctx, &candidates_p); + } else { + if (mirostat == 1) { + static float mirostat_mu = 2.0f * mirostat_tau; + const int mirostat_m = 100; + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); + } else if (mirostat == 2) { + static float mirostat_mu = 2.0f * mirostat_tau; + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); + } else { + // Temperature sampling + llama_sample_top_k(ctx, &candidates_p, top_k, 1); + llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); + llama_sample_typical(ctx, &candidates_p, typical_p, 1); + llama_sample_top_p(ctx, &candidates_p, top_p, 1); + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token(ctx, &candidates_p); + } + } + } + + return id; +} + +const char * sampling(struct MyModel * mymodel) { + llama_context * ctx = mymodel->ctx; + int id = sampling_id(mymodel); + std::string ret; + if (id == 
llama_token_eos()) ret = ""; + else ret = llama_token_to_str(ctx, id); + eval_id(mymodel, id); + return ret.c_str(); +} + +} diff --git a/examples/embd_input/embd_input_test.cpp b/examples/embd-input/embd-input-test.cpp similarity index 71% rename from examples/embd_input/embd_input_test.cpp rename to examples/embd-input/embd-input-test.cpp index e14141497c35d..e5e040f62a60a 100644 --- a/examples/embd_input/embd_input_test.cpp +++ b/examples/embd-input/embd-input-test.cpp @@ -1,4 +1,4 @@ -#include "embd_input.h" +#include "embd-input.h" #include #include #include @@ -7,8 +7,11 @@ int main(int argc, char** argv) { auto mymodel = create_mymodel(argc, argv); int N = 10; + int max_tgt_len = 500; int n_embd = llama_n_embd(mymodel->ctx); - float* data = new float[N*n_embd]; + + // add random float embd to test evaluation + float * data = new float[N*n_embd]; std::default_random_engine e; std::uniform_real_distribution u(0,1); for (int i=0;iparams.prompt.c_str()); const char* tmp; - for (int i=0;i < 500; i++) { - // int id = sampling_id(mymodel); + for (int i=0; i")==0) break; - printf("%s", tmp); // llama_token_to_str(mymodel->ctx, id)); + printf("%s", tmp); fflush(stdout); - // eval_id(mymodel, id); } printf("\n"); free_mymodel(mymodel); diff --git a/examples/embd_input/embd_input.h b/examples/embd-input/embd-input.h similarity index 100% rename from examples/embd_input/embd_input.h rename to examples/embd-input/embd-input.h diff --git a/examples/embd_input/embd_input.py b/examples/embd-input/embd_input.py similarity index 64% rename from examples/embd_input/embd_input.py rename to examples/embd-input/embd_input.py index ce057a89d504d..be2896614e9b3 100644 --- a/examples/embd_input/embd_input.py +++ b/examples/embd-input/embd_input.py @@ -1,8 +1,9 @@ import ctypes from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int import numpy as np +import os -libc = cdll.LoadLibrary("./libembd_input.so") +libc = cdll.LoadLibrary("./libembdinput.so") libc.sampling.restype=c_char_p libc.create_mymodel.restype=c_void_p libc.eval_string.argtypes=[c_void_p, c_char_p] @@ -16,7 +17,9 @@ def __init__(self, args): c_str = [c_char_p(i.encode()) for i in args] args_c = (c_char_p * argc)(*c_str) self.model = c_void_p(libc.create_mymodel(argc, args_c)) -# print("self.model", self.model) + self.max_tgt_len = 512 + self.print_string_eval = True + def __del__(self): libc.free_mymodel(self.model) @@ -25,6 +28,8 @@ def eval_float(self, x): def eval_string(self, x): libc.eval_string(self.model, x.encode()) # c_char_p(x.encode())) + if self.print_string_eval: + print(x) def eval_token(self, x): libc.eval_id(self.model, x) @@ -33,49 +38,34 @@ def sampling(self): s = libc.sampling(self.model) return s - def generate(self, end=""): - ret = b"" - end = end.encode() - for _ in range(500): - tmp = self.sampling() # .decode() - if (ret+tmp).endswith(end): - break - ret += tmp - return ret.decode() - def stream_generate(self, end=""): ret = b"" end = end.encode() - head = b"" - for _ in range(500): - tmp = self.sampling() # .decode() + for _ in range(self.max_tgt_len): + tmp = self.sampling() ret += tmp - try: - text = (head + tmp).decode() - print(text, end="") - head = b"" - except: - head += text + yield tmp if ret.endswith(end): break + + def generate_with_print(self, end=""): + ret = b"" + for i in self.stream_generate(end=end): + ret += i + print(i.decode(errors="replace"), end="", flush=True) print("") - return ret.decode() + return ret.decode(errors="replace") + def generate(self, end=""): + text = 
b"".join(self.stream_generate(end=end)) + return text.decode(errors="replace") + if __name__ == "__main__": model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"]) - # print(model) model.eval_string("""user: what is the color of the flag of UN?""") - # model.eval_token(100) x = np.random.random((5120,10))# , dtype=np.float32) model.eval_float(x) model.eval_string("""assistant:""") - # print(x[0,0], x[0,1],x[1,0]) - # model.eval_float(x) - # print(libc) - - for i in range(500): - tmp = model.sampling().decode() - if tmp == "": - break - print(tmp, end="", flush=True) + for i in model.generate(): + print(i.decode(errors="replace"), end="", flush=True) diff --git a/examples/embd_input/llava.py b/examples/embd-input/llava.py similarity index 94% rename from examples/embd_input/llava.py rename to examples/embd-input/llava.py index a1efaddf654c4..2f20cb7225b20 100644 --- a/examples/embd_input/llava.py +++ b/examples/embd-input/llava.py @@ -31,7 +31,7 @@ def chat(self, question): self.model.eval_string("user: ") self.model.eval_string(question) self.model.eval_string("\nassistant: ") - return self.model.generate() + return self.model.generate_with_print() def chat_with_image(self, image, question): with torch.no_grad(): @@ -49,7 +49,7 @@ def chat_with_image(self, image, question): self.model.eval_token(32003-1) # im_end self.model.eval_string(question) self.model.eval_string("\nassistant: ") - return self.model.generate() + return self.model.generate_with_print() if __name__=="__main__": @@ -63,8 +63,8 @@ def chat_with_image(self, image, question): respose = a.chat_with_image( Image.open("./media/llama1-logo.png").convert('RGB'), "what is the text in the picture?") - print(respose) - print(a.chat("what is the color of it?")) + respose + a.chat("what is the color of it?") diff --git a/examples/embd_input/panda_gpt.py b/examples/embd-input/panda_gpt.py similarity index 81% rename from examples/embd_input/panda_gpt.py rename to examples/embd-input/panda_gpt.py index b1199b95df7e5..0cfac5f32adf2 100644 --- a/examples/embd_input/panda_gpt.py +++ b/examples/embd-input/panda_gpt.py @@ -7,11 +7,13 @@ import torch # use PandaGPT path -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "PandaGPT","code","model")) +panda_gpt_path = os.path.join(os.path.dirname(__file__), "PandaGPT") +imagebind_ckpt_path = "./models/panda_gpt/" + +sys.path.insert(0, os.path.join(panda_gpt_path,"code","model")) from ImageBind.models import imagebind_model from ImageBind import data -imagebind_ckpt_path = "./models/panda_gpt/" ModalityType = imagebind_model.ModalityType max_tgt_len = 400 @@ -31,25 +33,25 @@ def load_projection(self, path): "weight": state["llama_proj.weight"], "bias": state["llama_proj.bias"]}) + def eval_inputs(self, inputs): + self.model.eval_string("") + embds = self.extract_multimoal_feature(inputs) + for i in embds: + self.model.eval_float(i.T) + self.model.eval_string(" ") + def chat(self, question): + return self.chat_with_image(None, question) + + def chat_with_image(self, inputs, question): if self.generated_text == "": self.model.eval_string("###") self.model.eval_string(" Human: ") + if inputs: + self.eval_inputs(inputs) self.model.eval_string(question) self.model.eval_string("\n### Assistant:") - ret = self.model.stream_generate(end="###") - self.generated_text += ret - return ret - - def chat_with_image(self, inputs, question): - if self.generated_text == "": - self.model.eval_string("###") - self.model.eval_string(" Human: ") - embds = 
self.extract_multimoal_feature(inputs) - for i in embds: - self.model.eval_float(i.T) - self.model.eval_string(" " + question + "\n### Assistant:") - ret = self.model.stream_generate(end="###") + ret = self.model.generate_with_print(end="###") self.generated_text += ret return ret @@ -88,13 +90,9 @@ def encode_data(self, data_type, data_paths): if __name__=="__main__": - # model form liuhaotian/LLaVA-13b-delta-v1-1 a = PandaGPT(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048", "--lora", "./models/panda_gpt/ggml-adapter-model.bin","--temp", "0"]) - # Extract from https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin. - # Also here can use pytorch_model-00003-of-00003.bin directly. a.load_projection("./models/panda_gpt/adapter_model.bin") a.chat_with_image( {"image_paths": ["./media/llama1-logo.png"]}, "what is the text in the picture? 'llama' or 'lambda'?") a.chat("what is the color of it?") - diff --git a/examples/embd_input/embd_input_lib.cpp b/examples/embd_input/embd_input_lib.cpp deleted file mode 100644 index bbdf6d645b17f..0000000000000 --- a/examples/embd_input/embd_input_lib.cpp +++ /dev/null @@ -1,283 +0,0 @@ -// Defines sigaction on msys: -#ifndef _GNU_SOURCE -#define _GNU_SOURCE -#endif - -#include "embd_input.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) -#include -#include -#elif defined (_WIN32) -#define WIN32_LEAN_AND_MEAN -#define NOMINMAX -#include -#include -#endif - -static console_state con_st; -static llama_context ** g_ctx; - -static bool is_interacting = false; - -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) -void sigint_handler(int signo) { - if (signo == SIGINT) { - if (!is_interacting) { - is_interacting=true; - } else { - console_cleanup(con_st); - printf("\n"); - llama_print_timings(*g_ctx); - _exit(130); - } - } -} -#endif - - -extern "C" { - -struct MyModel* create_mymodel(int argc, char ** argv) { - gpt_params params; - - if (gpt_params_parse(argc, argv, params) == false) { - return nullptr; - } - - - if (params.n_ctx > 2048) { - fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);" - "expect poor results\n", __func__, params.n_ctx); - } - - fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); - - if (params.seed < 0) { - params.seed = time(NULL); - } - - fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); - - std::mt19937 rng(params.seed); - if (params.random_prompt) { - params.prompt = gpt_random_prompt(rng); - } - - llama_init_backend(); - - llama_context * ctx; - g_ctx = &ctx; - - // load the model and apply lora adapter, if any - ctx = llama_init_from_gpt_params(params); - if (ctx == NULL) { - fprintf(stderr, "%s: error: unable to load model\n", __func__); - return nullptr; - } - - // print system information - { - fprintf(stderr, "\n"); - fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", - params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); - } - struct MyModel* ret= new MyModel(); - ret->ctx = ctx; - ret->params = params; - ret->n_past = 0; - // printf("ctx: %d\n", ret->ctx); - return ret; -} - -void free_mymodel(struct MyModel* mymodel) { - llama_context* ctx = mymodel->ctx; - llama_print_timings(ctx); - llama_free(ctx); - delete mymodel; -} - - -bool eval_float(void* model, float* input, int 
N){ - MyModel* mymodel = (MyModel* )model; - llama_context* ctx = mymodel->ctx; - gpt_params params = mymodel->params; - int n_emb = llama_n_embd(ctx); - int n_past = mymodel->n_past; - // printf("%f,%f\n", *input, *(input+1)); - int n_batch = N; // params.n_batch; - for (int i = 0; i < (int) N; i += n_batch) { - int n_eval = (int) N - i; - if (n_eval > n_batch) { - n_eval = n_batch; - } - if (llama_eval_float(ctx, (input+i*n_emb), n_eval, n_past, params.n_threads)) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return false; - } - n_past += n_eval; - } - mymodel->n_past = n_past; - return true; -} - - - - - -bool eval_tokens(void* model, std::vector tokens) { - MyModel* mymodel = (MyModel* )model; - // printf("model: %d\n", mymodel); - llama_context* ctx;// = mymodel->ctx; - // printf("ctx2: %d\n", ctx); - // printf("ctx2: %d\n", mymodel->ctx); - ctx = mymodel->ctx; - // printf("ctx2: %d\n", ctx); - gpt_params params = mymodel->params; - // printf("\n%d\n", params); - int n_past = mymodel->n_past; - for (int i = 0; i < (int) tokens.size(); i += params.n_batch) { - int n_eval = (int) tokens.size() - i; - if (n_eval > params.n_batch) { - n_eval = params.n_batch; - } - // printf("%d, %d, %d\n", i, n_eval, n_past); - if (llama_eval(ctx, &tokens[i], n_eval, n_past, params.n_threads)) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return false; - } - n_past += n_eval; - } - mymodel->n_past = n_past; - return true; -} - -bool eval_id(struct MyModel* mymodel, int id) { - // printf("%d\n", id); - std::vector tokens; - tokens.push_back(id); - // printf("%d\n", tokens.size()); - // printf("%d\n", tokens[0]); - return eval_tokens(mymodel, tokens); -} - - -bool eval_string(struct MyModel* mymodel,const char* str){ - // std::cout << "eval " << std::endl; - // printf("%s", str); - llama_context* ctx = mymodel->ctx; - std::string str2 = str; - // printf("%s", str2.c_str()); - std::cout << str2 << std::endl; - std::vector embd_inp = ::llama_tokenize(ctx, str2, true); - eval_tokens(mymodel, embd_inp); - return true; -} - - - - -llama_token sampling_id(struct MyModel* mymodel) { - llama_context* ctx = mymodel->ctx; - gpt_params params = mymodel->params; - // int n_ctx = llama_n_ctx(ctx); - - - // out of user input, sample next token - const float temp = params.temp; - const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; - const float top_p = params.top_p; - const float tfs_z = params.tfs_z; - const float typical_p = params.typical_p; - // const int32_t repeat_last_n = params.repeat_last_n < 0 ? 
n_ctx : params.repeat_last_n; - // const float repeat_penalty = params.repeat_penalty; - // const float alpha_presence = params.presence_penalty; - // const float alpha_frequency = params.frequency_penalty; - const int mirostat = params.mirostat; - const float mirostat_tau = params.mirostat_tau; - const float mirostat_eta = params.mirostat_eta; - // const bool penalize_nl = params.penalize_nl; - - llama_token id = 0; - - { - auto logits = llama_get_logits(ctx); - auto n_vocab = llama_n_vocab(ctx); - - // Apply params.logit_bias map - for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { - logits[it->first] += it->second; - } - - std::vector candidates; - candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); - } - - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - - // Apply penalties -// float nl_logit = logits[llama_token_nl()]; -// auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); -// llama_sample_repetition_penalty(ctx, &candidates_p, -// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, -// last_n_repeat, repeat_penalty); -// llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, -// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, -// last_n_repeat, alpha_frequency, alpha_presence); -// if (!penalize_nl) { -// logits[llama_token_nl()] = nl_logit; -// } - - if (temp <= 0) { - // Greedy sampling - id = llama_sample_token_greedy(ctx, &candidates_p); - } else { - if (mirostat == 1) { - static float mirostat_mu = 2.0f * mirostat_tau; - const int mirostat_m = 100; - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); - } else if (mirostat == 2) { - static float mirostat_mu = 2.0f * mirostat_tau; - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); - } else { - // Temperature sampling - llama_sample_top_k(ctx, &candidates_p, top_k, 1); - llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); - llama_sample_typical(ctx, &candidates_p, typical_p, 1); - llama_sample_top_p(ctx, &candidates_p, top_p, 1); - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token(ctx, &candidates_p); - } - } - - } - return id; -} - -const char* sampling(struct MyModel* mymodel) { - llama_context* ctx = mymodel->ctx; - int id = sampling_id(mymodel); - - std::string ret; - if (id == llama_token_eos()) ret = ""; - else ret = llama_token_to_str(ctx, id); - eval_id(mymodel, id); - return ret.c_str(); -} - -} diff --git a/llama.cpp b/llama.cpp index 65890663b5967..9c983da3a6916 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1342,15 +1342,33 @@ static bool llama_model_load( } } -static bool llama_eval_internal_tensor( - llama_context& lctx, - ggml_context* ctx0, - ggml_tensor* inpL, - const int n_tokens, - const int n_past, - const int n_threads, - const char * cgraph_fname, - const int64_t t_start_us) { +// evaluate the transformer +// +// - lctx: llama context +// - tokens: new batch of tokens to process +// - n_tokens number of tokens +// - embd embeddings input +// - n_past: the context size so far +// - n_threads: number of threads to use +// +static bool llama_eval_internal( + llama_context & lctx, + const llama_token * tokens, + 
const int n_tokens, + const float * embd, + const int n_past, + const int n_threads, + const char * cgraph_fname) { + + LLAMA_ASSERT((!tokens && embd) || (tokens && !embd)); + + // enforce that the first token is BOS + if (tokens && n_past == 0 && tokens[0] != llama_token_bos()) { + fprintf(stderr, "%s: first token must be BOS\n", __func__); + return false; + } + + const int64_t t_start_us = ggml_time_us(); const int N = n_tokens; @@ -1359,7 +1377,6 @@ static bool llama_eval_internal_tensor( const auto & kv_self = model.kv_self; - LLAMA_ASSERT(!!kv_self.ctx); const int n_embd = hparams.n_embd; @@ -1371,6 +1388,15 @@ static bool llama_eval_internal_tensor( const int n_gpu_layers = model.n_gpu_layers; auto & mem_per_token = lctx.mem_per_token; + auto & buf_compute = lctx.buf_compute; + + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute.size, + /*.mem_buffer =*/ buf_compute.addr, + /*.no_alloc =*/ false, + }; + + struct ggml_context * ctx0 = ggml_init(params); // for big prompts, if BLAS is enabled, it is better to use only one thread // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance @@ -1378,6 +1404,17 @@ static bool llama_eval_internal_tensor( gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + if (tokens) { + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_set_name(embd, "embd"); + memcpy(embd->data, tokens, N*ggml_element_size(embd)); + inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd); + } else { + inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); + memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); + } const int i_gpu_start = n_layer - n_gpu_layers; (void) i_gpu_start; @@ -1746,53 +1783,6 @@ static bool llama_eval_internal_tensor( return true; } - -// evaluate the transformer -// -// - lctx: llama context -// - tokens: new batch of tokens to process -// - n_past: the context size so far -// - n_threads: number of threads to use -// -static bool llama_eval_internal( - llama_context & lctx, - const llama_token * tokens, - const int n_tokens, - const int n_past, - const int n_threads, - const char * cgraph_fname) { - - // enforce that the first token is BOS - if (n_past == 0 && tokens[0] != llama_token_bos()) { - fprintf(stderr, "%s: first token must be BOS\n", __func__); - return false; - } - - const auto & model = lctx.model; - - const int64_t t_start_us = ggml_time_us(); - - const int N = n_tokens; - - auto & buf_compute = lctx.buf_compute; - - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.addr, - /*.no_alloc =*/ false, - }; - - struct ggml_context * ctx0 = ggml_init(params); - - - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_set_name(embd, "embd"); - memcpy(embd->data, tokens, N*ggml_element_size(embd)); - - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd); - return llama_eval_internal_tensor(lctx, ctx0, inpL, N, n_past, n_threads, cgraph_fname, t_start_us); -} - // // tokenizer // @@ -3357,7 +3347,7 @@ int llama_eval( int n_tokens, int n_past, int n_threads) { - if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) { + if (!llama_eval_internal(*ctx, tokens, n_tokens, nullptr, n_past, n_threads, nullptr)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } @@ -3373,32 +3363,13 @@ int llama_eval( } -int 
llama_eval_float( - struct llama_context * ctx, - const float * input, - int n_tokens, - int n_past, - int n_threads) { - const auto & model = ctx->model; - - const int64_t t_start_us = ggml_time_us(); - - const int N = n_tokens; - - auto & buf_compute = ctx->buf_compute; - - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.addr, - /*.no_alloc =*/ false, - }; - - struct ggml_context * ctx0 = ggml_init(params); - - struct ggml_tensor *inpL = - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, model.hparams.n_embd, N); - memcpy(inpL->data, input, N * model.hparams.n_embd * ggml_element_size(inpL)); - if (!llama_eval_internal_tensor(*ctx, ctx0, inpL, N, n_past, n_threads, nullptr, t_start_us)) { +int llama_eval_embd( + struct llama_context * ctx, + const float * embd, + int n_tokens, + int n_past, + int n_threads) { + if (!llama_eval_internal(*ctx, nullptr, n_tokens, embd, n_past, n_threads, nullptr)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } @@ -3419,7 +3390,7 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) { const std::vector tmp(n_batch, llama_token_bos()); - if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) { + if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), nullptr, n_ctx, 1, fname)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } diff --git a/llama.h b/llama.h index ad975166bd28e..2183b12fa1ce1 100644 --- a/llama.h +++ b/llama.h @@ -200,9 +200,9 @@ extern "C" { int n_threads); // Same as llama_eval, but use float matrix input directly. - LLAMA_API int llama_eval_float( + LLAMA_API int llama_eval_embd( struct llama_context * ctx, - const float * embds, + const float * embd, int n_tokens, int n_past, int n_threads); From beca5a61859b3af64bd773408d660f2a954370d8 Mon Sep 17 00:00:00 2001 From: ningshanwutuobang Date: Sun, 25 Jun 2023 06:15:13 +0800 Subject: [PATCH 13/17] add cmake build for embd-input --- Makefile | 2 +- examples/CMakeLists.txt | 1 + examples/embd-input/embd-input-lib.cpp | 6 ++++-- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 058a234f88b42..b69301f478543 100644 --- a/Makefile +++ b/Makefile @@ -265,7 +265,7 @@ libllama.so: llama.o ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) clean: - rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot train-text-from-scratch build-info.h + rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot train-text-from-scratch embd-input-test build-info.h # # Examples diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index cf9c4a2231337..161960bb853cc 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -39,6 +39,7 @@ else() add_subdirectory(baby-llama) add_subdirectory(train-text-from-scratch) add_subdirectory(simple) + add_subdirectory(embd-input) if (LLAMA_METAL) add_subdirectory(metal) endif() diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp index 37a5b52086ccf..83fcd065ceb8e 100644 --- a/examples/embd-input/embd-input-lib.cpp +++ b/examples/embd-input/embd-input-lib.cpp @@ -36,12 +36,14 @@ struct MyModel* create_mymodel(int argc, char ** argv) { llama_init_backend(); + llama_model * model; llama_context * ctx; + g_ctx = &ctx; // load the model and apply lora adapter, if any - ctx = llama_init_from_gpt_params(params); - if (ctx == NULL) { + std::tie(model, 
ctx) = llama_init_from_gpt_params(params); + if (model == NULL) { fprintf(stderr, "%s: error: unable to load model\n", __func__); return nullptr; } From 6cb62ca2873c7f79c0e7e5189accb63866597c7a Mon Sep 17 00:00:00 2001 From: ningshanwutuobang Date: Sun, 25 Jun 2023 06:17:30 +0800 Subject: [PATCH 14/17] add cmake build for embd-input --- examples/embd-input/CMakeLists.txt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 examples/embd-input/CMakeLists.txt diff --git a/examples/embd-input/CMakeLists.txt b/examples/embd-input/CMakeLists.txt new file mode 100644 index 0000000000000..2b623953e8061 --- /dev/null +++ b/examples/embd-input/CMakeLists.txt @@ -0,0 +1,15 @@ +set(TARGET embdinput) +add_library(${TARGET} embd-input-lib.cpp embd-input.h) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() + +set(TARGET embd-input-test) +add_executable(${TARGET} embd-input-test.cpp) +target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() From 40340d82af4ca1fe9ca519b0e7cc42dff427406f Mon Sep 17 00:00:00 2001 From: ningshanwutuobang Date: Sun, 25 Jun 2023 23:42:26 +0800 Subject: [PATCH 15/17] Add MiniGPT-4 example --- .gitignore | 2 +- examples/embd-input/.gitignore | 4 + examples/embd-input/README.md | 36 ++++++--- examples/embd-input/minigpt4.py | 128 ++++++++++++++++++++++++++++++++ 4 files changed, 159 insertions(+), 11 deletions(-) create mode 100644 examples/embd-input/.gitignore create mode 100644 examples/embd-input/minigpt4.py diff --git a/.gitignore b/.gitignore index 9116729fd5f12..4fccec31b8114 100644 --- a/.gitignore +++ b/.gitignore @@ -40,7 +40,7 @@ models/* /vdot /server /Pipfile -/embd_input_test +/embd-input-test /libllama.so build-info.h arm_neon.h diff --git a/examples/embd-input/.gitignore b/examples/embd-input/.gitignore new file mode 100644 index 0000000000000..87ef68771de5e --- /dev/null +++ b/examples/embd-input/.gitignore @@ -0,0 +1,4 @@ +PandaGPT +MiniGPT-4 +*.pth + diff --git a/examples/embd-input/README.md b/examples/embd-input/README.md index eb1095f24fe96..02d028f261f17 100644 --- a/examples/embd-input/README.md +++ b/examples/embd-input/README.md @@ -1,17 +1,17 @@ ### Examples for input embedding directly ## Requirement -build `libembd_input.so` +build `libembdinput.so` run the following comman in main dir (../../). ``` make ``` -## LLAVA example (llava.py) +## [LLaVA](https://github.com/haotian-liu/LLaVA/) example (llava.py) -1. obtian llava model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/) -2. convert it to ggml format -3. llava_projection.pth is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin) +1. Obtian LLaVA model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/). +2. Convert it to ggml format. +3. `llava_projection.pth` is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin). 
 ```
 import torch
 
@@ -23,10 +23,12 @@ dic = torch.load(bin_path)
 used_key = ["model.mm_projector.weight","model.mm_projector.bias"]
 torch.save({k: dic[k] for k in used_key}, pth_path)
 ```
+4. Check the path of the LLaVA model and `llava_projection.pth` in `llava.py`.
 
-## PandaGPT example (panda_gpt.py)
-1. Obtian PandaGPT lora model. Rename the file to `adapter_model.bin`. Use [convert-lora-to-ggml.py](../../convert-lora-to-ggml.py) to convert it to ggml format.
+## [PandaGPT](https://github.com/yxuansu/PandaGPT) example (panda_gpt.py)
+
+1. Obtain the PandaGPT lora model from https://github.com/yxuansu/PandaGPT. Rename the file to `adapter_model.bin`. Use [convert-lora-to-ggml.py](../../convert-lora-to-ggml.py) to convert it to ggml format.
 The `adapter_config.json` is
 ```
 {
@@ -40,8 +42,22 @@ The `adapter_config.json` is
   "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"]
 }
 ```
-2. papare the `vicuna` v0 model.
-3. obtain the [ImageBind](https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth) model.
+2. Prepare the `vicuna` v0 model.
+3. Obtain the [ImageBind](https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth) model.
 4. Clone the PandaGPT source.
-5. check the path of PandaGPT source, ImageBind model, lora model and vicuna model in panda_gpt.py.
+```
+git clone https://github.com/yxuansu/PandaGPT
+```
+5. Install the requirements of PandaGPT.
+6. Check the paths of the PandaGPT source, ImageBind model, lora model and vicuna model in panda_gpt.py.
 
+## [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4/) example (minigpt4.py)
+
+1. Obtain the MiniGPT-4 model from https://github.com/Vision-CAIR/MiniGPT-4/ and put it in `embd-input`.
+2. Clone the MiniGPT-4 source.
+```
+git clone https://github.com/Vision-CAIR/MiniGPT-4/
+```
+3. Install the requirements of MiniGPT-4.
+4. Prepare the `vicuna` v0 model.
+5. Check the paths of the MiniGPT-4 source, MiniGPT-4 model and vicuna model in `minigpt4.py`.
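All three wrappers documented above go through the small C interface exported by `libembdinput.so` (`create_mymodel`, `eval_string`, `eval_float`, `sampling`, `free_mymodel`). The following is a minimal C++ sketch of that flow; it is illustrative only and not part of the patch series — the model path is a placeholder, the float buffer stands in for real projected features, and `embd-input.h` is assumed to pull in the llama/common declarations it needs.

```
// Minimal sketch only (not part of the patch series). Assumes embd-input.h also
// exposes llama.h/common.h declarations, and that a ggml model exists at the
// placeholder path below.
#include "embd-input.h"

#include <cstdio>
#include <vector>

int main() {
    const char * args[] = {"embd-input-test", "--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048"};
    MyModel * mymodel = create_mymodel(5, const_cast<char **>(args));
    if (mymodel == nullptr) {
        return 1;
    }

    const int n_embd   = llama_n_embd(mymodel->ctx);
    const int n_tokens = 10;
    std::vector<float> feats(n_tokens * n_embd, 0.1f); // stand-in for projected image/audio features

    eval_string(mymodel, "### Human: ");
    eval_float(mymodel, feats.data(), n_tokens);       // inject float embeddings directly
    eval_string(mymodel, " describe the input.\n### Assistant:");

    for (int i = 0; i < 64; i++) {                     // sample a short reply
        printf("%s", sampling(mymodel));
    }
    printf("\n");

    free_mymodel(mymodel);
    return 0;
}
```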
diff --git a/examples/embd-input/minigpt4.py b/examples/embd-input/minigpt4.py new file mode 100644 index 0000000000000..8e98f85179c4e --- /dev/null +++ b/examples/embd-input/minigpt4.py @@ -0,0 +1,128 @@ +import sys +import os +sys.path.insert(0, os.path.dirname(__file__)) +from embd_input import MyModel +import numpy as np +from torch import nn +import torch +from PIL import Image + +minigpt4_path = os.path.join(os.path.dirname(__file__), "MiniGPT-4") +sys.path.insert(0, minigpt4_path) +from minigpt4.models.blip2 import Blip2Base +from minigpt4.processors.blip_processors import Blip2ImageEvalProcessor + + +class MiniGPT4(Blip2Base): + """ + MiniGPT4 model from https://github.com/Vision-CAIR/MiniGPT-4 + """ + def __init__(self, + args, + vit_model="eva_clip_g", + q_former_model="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth", + img_size=224, + drop_path_rate=0, + use_grad_checkpoint=False, + vit_precision="fp32", + freeze_vit=True, + freeze_qformer=True, + num_query_token=32, + llama_model="", + prompt_path="", + prompt_template="", + max_txt_len=32, + end_sym='\n', + low_resource=False, # use 8 bit and put vit in cpu + device_8bit=0 + ): + super().__init__() + self.img_size = img_size + self.low_resource = low_resource + self.preprocessor = Blip2ImageEvalProcessor(img_size) + + print('Loading VIT') + self.visual_encoder, self.ln_vision = self.init_vision_encoder( + vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision + ) + print('Loading VIT Done') + print('Loading Q-Former') + self.Qformer, self.query_tokens = self.init_Qformer( + num_query_token, self.visual_encoder.num_features + ) + self.Qformer.cls = None + self.Qformer.bert.embeddings.word_embeddings = None + self.Qformer.bert.embeddings.position_embeddings = None + for layer in self.Qformer.bert.encoder.layer: + layer.output = None + layer.intermediate = None + self.load_from_pretrained(url_or_filename=q_former_model) + print('Loading Q-Former Done') + self.llama_proj = nn.Linear( + self.Qformer.config.hidden_size, 5120 # self.llama_model.config.hidden_size + ) + self.max_txt_len = max_txt_len + self.end_sym = end_sym + self.model = MyModel(["main", *args]) + # system promt + self.model.eval_string("Give the following image: ImageContent. " + "You will be able to see the image once I provide it to you. Please answer my questions." 
+ "###") + + def encode_img(self, image): + image = self.preprocessor(image) + image = image.unsqueeze(0) + device = image.device + if self.low_resource: + self.vit_to_cpu() + image = image.to("cpu") + + with self.maybe_autocast(): + image_embeds = self.ln_vision(self.visual_encoder(image)).to(device) + image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(device) + + query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) + query_output = self.Qformer.bert( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=True, + ) + + inputs_llama = self.llama_proj(query_output.last_hidden_state) + # atts_llama = torch.ones(inputs_llama.size()[:-1], dtype=torch.long).to(image.device) + return inputs_llama + + def load_projection(self, path): + state = torch.load(path)["model"] + self.llama_proj.load_state_dict({ + "weight": state["llama_proj.weight"], + "bias": state["llama_proj.bias"]}) + + def chat(self, question): + self.model.eval_string("Human: ") + self.model.eval_string(question) + self.model.eval_string("\n### Assistant:") + return self.model.generate_with_print(end="###") + + def chat_with_image(self, image, question): + with torch.no_grad(): + embd_image = self.encode_img(image) + embd_image = embd_image.cpu().numpy()[0] + self.model.eval_string("Human: ") + self.model.eval_float(embd_image.T) + self.model.eval_string(" ") + self.model.eval_string(question) + self.model.eval_string("\n### Assistant:") + return self.model.generate_with_print(end="###") + + +if __name__=="__main__": + a = MiniGPT4(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048"]) + a.load_projection(os.path.join( + os.path.dirname(__file__) , + "pretrained_minigpt4.pth")) + respose = a.chat_with_image( + Image.open("./media/llama1-logo.png").convert('RGB'), + "what is the text in the picture?") + a.chat("what is the color of it?") From 39011ad7c479c5dfa325940b9efac53a1011df3e Mon Sep 17 00:00:00 2001 From: ningshanwutuobang Date: Tue, 27 Jun 2023 04:06:20 +0800 Subject: [PATCH 16/17] change the order of the args of llama_eval_internal --- llama.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llama.cpp b/llama.cpp index 90758aedc9ef5..672270a7f2b41 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1363,16 +1363,16 @@ static bool llama_model_load( // // - lctx: llama context // - tokens: new batch of tokens to process -// - n_tokens number of tokens // - embd embeddings input +// - n_tokens number of tokens // - n_past: the context size so far // - n_threads: number of threads to use // static bool llama_eval_internal( llama_context & lctx, const llama_token * tokens, - const int n_tokens, const float * embd, + const int n_tokens, const int n_past, const int n_threads, const char * cgraph_fname) { @@ -3420,7 +3420,7 @@ int llama_eval( int n_tokens, int n_past, int n_threads) { - if (!llama_eval_internal(*ctx, tokens, n_tokens, nullptr, n_past, n_threads, nullptr)) { + if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } @@ -3442,7 +3442,7 @@ int llama_eval_embd( int n_tokens, int n_past, int n_threads) { - if (!llama_eval_internal(*ctx, nullptr, n_tokens, embd, n_past, n_threads, nullptr)) { + if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } @@ -3463,7 +3463,7 @@ int llama_eval_export(struct 
llama_context * ctx, const char * fname) {
 
     const std::vector<llama_token> tmp(n_batch, llama_token_bos());
 
-    if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), nullptr, n_ctx, 1, fname)) {
+    if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }

From 7abb513e9413c58d4dde0d6fc26b9137900a6950 Mon Sep 17 00:00:00 2001
From: ningshanwutuobang
Date: Tue, 27 Jun 2023 20:38:52 +0800
Subject: [PATCH 17/17] fix ci error

---
 examples/embd-input/embd-input-lib.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp
index 83fcd065ceb8e..37de52ad6e37c 100644
--- a/examples/embd-input/embd-input-lib.cpp
+++ b/examples/embd-input/embd-input-lib.cpp
@@ -34,7 +34,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
     }
     fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
 
-    llama_init_backend();
+    llama_init_backend(params.numa);
 
     llama_model * model;
     llama_context * ctx;
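Taken together, the series adds one new public entry point, `llama_eval_embd`, next to `llama_eval`. Below is a minimal sketch of how a caller might push pre-computed embeddings through it; it is illustrative only, assumes an already-initialized `llama_context`, and uses a zero-filled buffer in place of real projected features (`feed_embeddings` and `example` are hypothetical helper names, not part of the patch).

```
// Minimal sketch only (not part of the patch series): feed pre-computed float
// embeddings into an already-initialized llama_context via llama_eval_embd.
#include "llama.h"

#include <vector>

// Push n_tokens rows of n_embd floats into the context; n_past tracks how much
// of the context window has been consumed so far.
static bool feed_embeddings(llama_context * ctx, const float * embd, int n_tokens, int & n_past, int n_threads) {
    if (llama_eval_embd(ctx, embd, n_tokens, n_past, n_threads) != 0) {
        return false; // evaluation failed
    }
    n_past += n_tokens; // the embeddings now occupy n_tokens positions of the context
    return true;
}

static void example(llama_context * ctx, int & n_past) {
    const int n_embd = llama_n_embd(ctx);
    std::vector<float> feats(8 * n_embd, 0.0f); // stand-in for 8 "tokens" of projected features
    if (feed_embeddings(ctx, feats.data(), 8, n_past, /*n_threads=*/4)) {
        const float * logits = llama_get_logits(ctx); // next-token logits after the embedded input
        (void) logits;                                // sample from these as usual
    }
}
```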