From 93984987c0b8a4c71a538a0d29d840762e20d687 Mon Sep 17 00:00:00 2001 From: jhen Date: Fri, 15 Sep 2023 19:42:54 +0800 Subject: [PATCH 01/15] whisper : check state->ctx_metal not null --- whisper.cpp | 84 ++++++++++++++++++++++++++++------------------------- 1 file changed, 45 insertions(+), 39 deletions(-) diff --git a/whisper.cpp b/whisper.cpp index f1c3567df7a..5898c6bff30 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -2917,6 +2917,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) { } #ifdef GGML_USE_METAL + // TODO: Param for enable GPU state->ctx_metal = ggml_metal_init(1); if (!state->ctx_metal) { log("%s: ggml_metal_init() failed\n", __func__); @@ -2924,52 +2925,55 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) { return nullptr; } - log("%s: Metal context initialized\n", __func__); + if (state->ctx_metal) { + log("%s: Metal context initialized\n", __func__); - // this allocates all Metal resources and memory buffers + // this allocates all Metal resources and memory buffers - void * data_ptr = NULL; - size_t data_size = 0; + void * data_ptr = NULL; + size_t data_size = 0; - // TODO: add mmap support - //if (params.use_mmap) { - // data_ptr = ctx->model.mapping->addr; - // data_size = ctx->model.mapping->size; - //} else { - // data_ptr = ggml_get_mem_buffer(ctx->model.ctx); - // data_size = ggml_get_mem_size (ctx->model.ctx); - //} + // TODO: add mmap support + //if (params.use_mmap) { + // data_ptr = ctx->model.mapping->addr; + // data_size = ctx->model.mapping->size; + //} else { + // data_ptr = ggml_get_mem_buffer(ctx->model.ctx); + // data_size = ggml_get_mem_size (ctx->model.ctx); + //} - data_ptr = ggml_get_mem_buffer(ctx->model.ctx); - data_size = ggml_get_mem_size (ctx->model.ctx); + data_ptr = ggml_get_mem_buffer(ctx->model.ctx); + data_size = ggml_get_mem_size (ctx->model.ctx); - const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx); + const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx); - log("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); + log("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); #define WHISPER_METAL_CHECK_BUF(result) \ - if (!(result)) { \ - log("%s: failed to add metal buffer\n", __func__); \ - delete state; \ - return nullptr; \ - } + if (!(result)) { \ + log("%s: failed to add metal buffer\n", __func__); \ + delete state; \ + return nullptr; \ + } - WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "data", data_ptr, data_size, max_size)); + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "data", data_ptr, data_size, max_size)); - WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "meta_conv", state->alloc_conv.meta.data(), state->alloc_conv.meta.size(), 0)); - WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "meta_encode", state->alloc_encode.meta.data(), state->alloc_encode.meta.size(), 0)); - WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "meta_cross", state->alloc_cross.meta.data(), state->alloc_cross.meta.size(), 0)); - WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "meta_decode", state->alloc_decode.meta.data(), state->alloc_decode.meta.size(), 0)); + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "meta_conv", state->alloc_conv.meta.data(), state->alloc_conv.meta.size(), 0)); + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "meta_encode", state->alloc_encode.meta.data(), state->alloc_encode.meta.size(), 0)); + 
WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "meta_cross", state->alloc_cross.meta.data(), state->alloc_cross.meta.size(), 0)); + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "meta_decode", state->alloc_decode.meta.data(), state->alloc_decode.meta.size(), 0)); - WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "data_conv", state->alloc_conv.data.data(), state->alloc_conv.data.size(), 0)); - WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "data_encode", state->alloc_encode.data.data(), state->alloc_encode.data.size(), 0)); - WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "data_cross", state->alloc_cross.data.data(), state->alloc_cross.data.size(), 0)); - WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "data_decode", state->alloc_decode.data.data(), state->alloc_decode.data.size(), 0)); + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "data_conv", state->alloc_conv.data.data(), state->alloc_conv.data.size(), 0)); + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "data_encode", state->alloc_encode.data.data(), state->alloc_encode.data.size(), 0)); + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "data_cross", state->alloc_cross.data.data(), state->alloc_cross.data.size(), 0)); + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "data_decode", state->alloc_decode.data.data(), state->alloc_decode.data.size(), 0)); - WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "kv_cross", state->kv_cross.buf.data(), state->kv_cross.buf.size(), 0)); + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "kv_cross", state->kv_cross.buf.data(), state->kv_cross.buf.size(), 0)); - WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "kv_self_0", state->decoders[0].kv_self.buf.data(), state->decoders[0].kv_self.buf.size(), 0)); + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "kv_self_0", state->decoders[0].kv_self.buf.data(), state->decoders[0].kv_self.buf.size(), 0)); #undef WHISPER_METAL_CHECK_BUF + + } #endif state->rng = std::mt19937(0); @@ -4493,17 +4497,19 @@ int whisper_full_with_state( // TODO: not very clean - look for a better way and potentially merging with the init of decoder 0 #ifdef GGML_USE_METAL + if (state->ctx_metal) { #define WHISPER_METAL_CHECK_BUF(result) \ - if (!(result)) { \ - log("%s: failed to add metal buffer\n", __func__); \ - return 0; \ - } + if (!(result)) { \ + log("%s: failed to add metal buffer\n", __func__); \ + return 0; \ + } - const std::string kv_name = "kv_self_" + std::to_string(j); - auto & kv_self = decoder.kv_self; + const std::string kv_name = "kv_self_" + std::to_string(j); + auto & kv_self = decoder.kv_self; - WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, kv_name.c_str(), kv_self.buf.data(), kv_self.buf.size(), 0)); + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, kv_name.c_str(), kv_self.buf.data(), kv_self.buf.size(), 0)); #undef WHISPER_METAL_CHECK_BUF + } #endif } } From 3d1369e370986d8bc04d36a6ec60756346eea8a7 Mon Sep 17 00:00:00 2001 From: jhen Date: Fri, 15 Sep 2023 20:33:52 +0800 Subject: [PATCH 02/15] whisper : add whisper_context_params { use_gpu } --- examples/bench/bench.cpp | 8 ++++++-- examples/main/main.cpp | 7 ++++++- whisper.cpp | 37 ++++++++++++++++++++----------------- whisper.h | 15 +++++++++------ 4 files changed, 41 insertions(+), 26 deletions(-) diff --git a/examples/bench/bench.cpp 
b/examples/bench/bench.cpp index ac0e6bb959f..65510e7f428 100644 --- a/examples/bench/bench.cpp +++ b/examples/bench/bench.cpp @@ -10,6 +10,7 @@ struct whisper_params { int32_t what = 0; // what to benchmark: 0 - whisper ecoder, 1 - memcpy, 2 - ggml_mul_mat std::string model = "models/ggml-base.en.bin"; + bool use_gpu = true; }; void whisper_print_usage(int argc, char ** argv, const whisper_params & params); @@ -24,7 +25,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { } else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); } else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; } - else if (arg == "-w" || arg == "--what") { params.what = atoi(argv[++i]); } + else if (arg == "-w" || arg == "--what") { params.what = atoi(argv[++i]); } + else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); whisper_print_usage(argc, argv, params); @@ -53,7 +55,9 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para int whisper_bench_full(const whisper_params & params) { // whisper init - struct whisper_context * ctx = whisper_init_from_file(params.model.c_str()); + struct whisper_context_params cparams; + cparams.use_gpu = params.use_gpu; + struct whisper_context * ctx = whisper_init_from_file(params.model.c_str(), cparams); { fprintf(stderr, "\n"); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 60c1cca756a..921157dea85 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -102,6 +102,8 @@ struct whisper_params { std::vector fname_inp = {}; std::vector fname_out = {}; + + bool use_gpu = true; }; void whisper_print_usage(int argc, char ** argv, const whisper_params & params); @@ -163,6 +165,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(argv[++i]); } else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; } else if (arg == "-ls" || arg == "--log-score") { params.log_score = true; } + else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); whisper_print_usage(argc, argv, params); @@ -841,7 +844,9 @@ int main(int argc, char ** argv) { // whisper init - struct whisper_context * ctx = whisper_init_from_file(params.model.c_str()); + struct whisper_context_params cparams; + cparams.use_gpu = params.use_gpu; + struct whisper_context * ctx = whisper_init_from_file(params.model.c_str(), cparams); if (ctx == nullptr) { fprintf(stderr, "error: failed to initialize whisper context\n"); diff --git a/whisper.cpp b/whisper.cpp index 5898c6bff30..ba2b87f15c2 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -762,6 +762,7 @@ struct whisper_context { whisper_state * state = nullptr; std::string path_model; // populated by whisper_init_from_file() + whisper_context_params params; }; static void whisper_default_log(const char * text) { @@ -2917,12 +2918,13 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) { } #ifdef GGML_USE_METAL - // TODO: Param for enable GPU - state->ctx_metal = ggml_metal_init(1); - if (!state->ctx_metal) { - log("%s: ggml_metal_init() failed\n", __func__); - delete state; - return nullptr; + if (ctx->params.use_gpu) { + state->ctx_metal = ggml_metal_init(1); + if (!state->ctx_metal) { + log("%s: ggml_metal_init() failed\n", 
__func__); + delete state; + return nullptr; + } } if (state->ctx_metal) { @@ -3030,7 +3032,7 @@ int whisper_ctx_init_openvino_encoder( #endif } -struct whisper_context * whisper_init_from_file_no_state(const char * path_model) { +struct whisper_context * whisper_init_from_file_no_state(const char * path_model, whisper_context_params params) { log("%s: loading model from '%s'\n", __func__, path_model); auto fin = std::ifstream(path_model, std::ios::binary); @@ -3059,7 +3061,7 @@ struct whisper_context * whisper_init_from_file_no_state(const char * path_model fin->close(); }; - auto ctx = whisper_init_no_state(&loader); + auto ctx = whisper_init_no_state(&loader, params); if (ctx) { ctx->path_model = path_model; @@ -3068,7 +3070,7 @@ struct whisper_context * whisper_init_from_file_no_state(const char * path_model return ctx; } -struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size) { +struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size, whisper_context_params params) { struct buf_context { uint8_t* buffer; size_t size; @@ -3102,13 +3104,14 @@ struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t loader.close = [](void * /*ctx*/) { }; - return whisper_init_no_state(&loader); + return whisper_init_no_state(&loader, params); } -struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader) { +struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader, whisper_context_params params) { ggml_time_init(); whisper_context * ctx = new whisper_context; + ctx->params = params; if (!whisper_model_load(loader, *ctx)) { loader->close(loader->context); @@ -3122,8 +3125,8 @@ struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loa return ctx; } -struct whisper_context * whisper_init_from_file(const char * path_model) { - whisper_context * ctx = whisper_init_from_file_no_state(path_model); +struct whisper_context * whisper_init_from_file(const char * path_model, whisper_context_params params) { + whisper_context * ctx = whisper_init_from_file_no_state(path_model, params); if (!ctx) { return nullptr; } @@ -3137,8 +3140,8 @@ struct whisper_context * whisper_init_from_file(const char * path_model) { return ctx; } -struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size) { - whisper_context * ctx = whisper_init_from_buffer_no_state(buffer, buffer_size); +struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size, whisper_context_params params) { + whisper_context * ctx = whisper_init_from_buffer_no_state(buffer, buffer_size, params); if (!ctx) { return nullptr; } @@ -3152,8 +3155,8 @@ struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_s return ctx; } -struct whisper_context * whisper_init(struct whisper_model_loader * loader) { - whisper_context * ctx = whisper_init_no_state(loader); +struct whisper_context * whisper_init(struct whisper_model_loader * loader, whisper_context_params params) { + whisper_context * ctx = whisper_init_no_state(loader, params); if (!ctx) { return nullptr; } diff --git a/whisper.h b/whisper.h index 73ab4d799a2..144dea3a22e 100644 --- a/whisper.h +++ b/whisper.h @@ -65,6 +65,9 @@ extern "C" { // understanding of how the model works. 
// + struct whisper_context_params { + bool use_gpu = true; + }; struct whisper_context; struct whisper_state; struct whisper_full_params; @@ -99,15 +102,15 @@ extern "C" { // Various functions for loading a ggml whisper model. // Allocate (almost) all memory needed for the model. // Return NULL on failure - WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model); - WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size); - WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader); + WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model, whisper_context_params params); + WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size, whisper_context_params params); + WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader, whisper_context_params params); // These are the same as the above, but the internal state of the context is not allocated automatically // It is the responsibility of the caller to allocate the state using whisper_init_state() (#523) - WHISPER_API struct whisper_context * whisper_init_from_file_no_state(const char * path_model); - WHISPER_API struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size); - WHISPER_API struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader); + WHISPER_API struct whisper_context * whisper_init_from_file_no_state(const char * path_model, whisper_context_params params); + WHISPER_API struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size, whisper_context_params params); + WHISPER_API struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader, whisper_context_params params); WHISPER_API struct whisper_state * whisper_init_state(struct whisper_context * ctx); From f924c4160e8d49939946ee052935179416eea3dc Mon Sep 17 00:00:00 2001 From: jhen Date: Mon, 18 Sep 2023 09:34:37 +0800 Subject: [PATCH 03/15] whisper : new API with params & deprecate old API --- examples/bench/bench.cpp | 2 +- examples/main/main.cpp | 2 +- whisper.cpp | 53 +++++++++++++++++++++++++++++++--------- whisper.h | 47 +++++++++++++++++++++++++++++------ 4 files changed, 84 insertions(+), 20 deletions(-) diff --git a/examples/bench/bench.cpp b/examples/bench/bench.cpp index 65510e7f428..a92de45ab72 100644 --- a/examples/bench/bench.cpp +++ b/examples/bench/bench.cpp @@ -57,7 +57,7 @@ int whisper_bench_full(const whisper_params & params) { struct whisper_context_params cparams; cparams.use_gpu = params.use_gpu; - struct whisper_context * ctx = whisper_init_from_file(params.model.c_str(), cparams); + struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams); { fprintf(stderr, "\n"); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 921157dea85..d174d8d8e52 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -846,7 +846,7 @@ int main(int argc, char ** argv) { struct whisper_context_params cparams; cparams.use_gpu = params.use_gpu; - struct whisper_context * ctx = whisper_init_from_file(params.model.c_str(), cparams); + struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams); if (ctx == nullptr) { fprintf(stderr, "error: failed to initialize whisper context\n"); diff --git a/whisper.cpp b/whisper.cpp index ba2b87f15c2..b86aef3e0e2 100644 --- a/whisper.cpp 
+++ b/whisper.cpp @@ -3032,7 +3032,14 @@ int whisper_ctx_init_openvino_encoder( #endif } -struct whisper_context * whisper_init_from_file_no_state(const char * path_model, whisper_context_params params) { +struct whisper_context_params whisper_context_default_params() { + struct whisper_context_params result = { + /*.use_gpu =*/ true, + }; + return result; +} + +struct whisper_context * whisper_init_from_file_with_params_no_state(const char * path_model, struct whisper_context_params params) { log("%s: loading model from '%s'\n", __func__, path_model); auto fin = std::ifstream(path_model, std::ios::binary); @@ -3061,7 +3068,7 @@ struct whisper_context * whisper_init_from_file_no_state(const char * path_model fin->close(); }; - auto ctx = whisper_init_no_state(&loader, params); + auto ctx = whisper_init_with_params_no_state(&loader, params); if (ctx) { ctx->path_model = path_model; @@ -3070,7 +3077,7 @@ struct whisper_context * whisper_init_from_file_no_state(const char * path_model return ctx; } -struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size, whisper_context_params params) { +struct whisper_context * whisper_init_from_buffer_with_params_no_state(void * buffer, size_t buffer_size, struct whisper_context_params params) { struct buf_context { uint8_t* buffer; size_t size; @@ -3104,10 +3111,10 @@ struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t loader.close = [](void * /*ctx*/) { }; - return whisper_init_no_state(&loader, params); + return whisper_init_with_params_no_state(&loader, params); } -struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader, whisper_context_params params) { +struct whisper_context * whisper_init_with_params_no_state(struct whisper_model_loader * loader, struct whisper_context_params params) { ggml_time_init(); whisper_context * ctx = new whisper_context; @@ -3125,8 +3132,8 @@ struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loa return ctx; } -struct whisper_context * whisper_init_from_file(const char * path_model, whisper_context_params params) { - whisper_context * ctx = whisper_init_from_file_no_state(path_model, params); +struct whisper_context * whisper_init_from_file_with_params(const char * path_model, struct whisper_context_params params) { + whisper_context * ctx = whisper_init_from_file_with_params_no_state(path_model, params); if (!ctx) { return nullptr; } @@ -3140,8 +3147,8 @@ struct whisper_context * whisper_init_from_file(const char * path_model, whisper return ctx; } -struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size, whisper_context_params params) { - whisper_context * ctx = whisper_init_from_buffer_no_state(buffer, buffer_size, params); +struct whisper_context * whisper_init_from_buffer_with_params(void * buffer, size_t buffer_size, struct whisper_context_params params) { + whisper_context * ctx = whisper_init_from_buffer_with_params_no_state(buffer, buffer_size, params); if (!ctx) { return nullptr; } @@ -3155,8 +3162,8 @@ struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_s return ctx; } -struct whisper_context * whisper_init(struct whisper_model_loader * loader, whisper_context_params params) { - whisper_context * ctx = whisper_init_no_state(loader, params); +struct whisper_context * whisper_init_with_params(struct whisper_model_loader * loader, struct whisper_context_params params) { + whisper_context * ctx = whisper_init_with_params_no_state(loader, 
params); if (!ctx) { return nullptr; } @@ -3170,6 +3177,30 @@ struct whisper_context * whisper_init(struct whisper_model_loader * loader, whis return ctx; } +struct whisper_context * whisper_init_from_file(const char * path_model) { + return whisper_init_from_file_with_params(path_model, whisper_context_default_params()); +} + +struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size) { + return whisper_init_from_buffer_with_params(buffer, buffer_size, whisper_context_default_params()); +} + +struct whisper_context * whisper_init(struct whisper_model_loader * loader) { + return whisper_init_with_params(loader, whisper_context_default_params()); +} + +struct whisper_context * whisper_init_from_file_no_state(const char * path_model) { + return whisper_init_from_file_with_params_no_state(path_model, whisper_context_default_params()); +} + +struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size) { + return whisper_init_from_buffer_with_params_no_state(buffer, buffer_size, whisper_context_default_params()); +} + +struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader) { + return whisper_init_with_params_no_state(loader, whisper_context_default_params()); +} + void whisper_free_state(struct whisper_state * state) { if (state) { diff --git a/whisper.h b/whisper.h index 144dea3a22e..4b7035beaee 100644 --- a/whisper.h +++ b/whisper.h @@ -5,6 +5,14 @@ #include #include +#ifdef __GNUC__ +# define WHISPER_DEPRECATED(func, hint) func __attribute__((deprecated(hint))) +#elif defined(_MSC_VER) +# define WHISPER_DEPRECATED(func, hint) __declspec(deprecated(hint)) func +#else +# define WHISPER_DEPRECATED(func, hint) func +#endif + #ifdef WHISPER_SHARED # ifdef _WIN32 # ifdef WHISPER_BUILD @@ -66,7 +74,7 @@ extern "C" { // struct whisper_context_params { - bool use_gpu = true; + bool use_gpu; }; struct whisper_context; struct whisper_state; @@ -102,15 +110,40 @@ extern "C" { // Various functions for loading a ggml whisper model. // Allocate (almost) all memory needed for the model. 
// Return NULL on failure - WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model, whisper_context_params params); - WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size, whisper_context_params params); - WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader, whisper_context_params params); + WHISPER_API struct whisper_context * whisper_init_from_file_with_params(const char * path_model, struct whisper_context_params params); + WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params(void * buffer, size_t buffer_size, struct whisper_context_params params); + WHISPER_API struct whisper_context * whisper_init_with_params(struct whisper_model_loader * loader, struct whisper_context_params params); // These are the same as the above, but the internal state of the context is not allocated automatically // It is the responsibility of the caller to allocate the state using whisper_init_state() (#523) - WHISPER_API struct whisper_context * whisper_init_from_file_no_state(const char * path_model, whisper_context_params params); - WHISPER_API struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size, whisper_context_params params); - WHISPER_API struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader, whisper_context_params params); + WHISPER_API struct whisper_context * whisper_init_from_file_with_params_no_state(const char * path_model, struct whisper_context_params params); + WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params_no_state(void * buffer, size_t buffer_size, struct whisper_context_params params); + WHISPER_API struct whisper_context * whisper_init_with_params_no_state(struct whisper_model_loader * loader, struct whisper_context_params params); + + WHISPER_DEPRECATED( + WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model), + "use whisper_init_from_file_with_params instead" + ); + WHISPER_DEPRECATED( + WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size), + "use whisper_init_from_buffer_with_params instead" + ); + WHISPER_DEPRECATED( + WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader), + "use whisper_init_with_params instead" + ); + WHISPER_DEPRECATED( + WHISPER_API struct whisper_context * whisper_init_from_file_no_state(const char * path_model), + "use whisper_init_from_file_with_params_no_state instead" + ); + WHISPER_DEPRECATED( + WHISPER_API struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size), + "use whisper_init_from_buffer_with_params_no_state instead" + ); + WHISPER_DEPRECATED( + WHISPER_API struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader), + "use whisper_init_with_params_no_state instead" + ); WHISPER_API struct whisper_state * whisper_init_state(struct whisper_context * ctx); From 378bdb2a8fe541cddfa2c32d13fd91d59319bb91 Mon Sep 17 00:00:00 2001 From: Jhen Date: Fri, 6 Oct 2023 09:08:42 +0800 Subject: [PATCH 04/15] examples : use no-gpu param && whisper_init_from_file_with_params --- examples/addon.node/addon.cpp | 7 ++++++- examples/addon.node/index.js | 1 + examples/bench.wasm/emscripten.cpp | 2 +- examples/command.wasm/emscripten.cpp | 2 +- examples/command/command.cpp | 6 +++++- examples/lsp/lsp.cpp | 6 +++++- examples/stream.wasm/emscripten.cpp | 2 +- examples/stream/stream.cpp | 
6 +++++- examples/talk-llama/talk-llama.cpp | 9 ++++++++- examples/talk.wasm/emscripten.cpp | 2 +- examples/talk/talk.cpp | 7 +++++-- examples/whisper.wasm/emscripten.cpp | 2 +- whisper.h | 2 ++ 13 files changed, 42 insertions(+), 12 deletions(-) diff --git a/examples/addon.node/addon.cpp b/examples/addon.node/addon.cpp index 52e80ad8528..414a35f5573 100644 --- a/examples/addon.node/addon.cpp +++ b/examples/addon.node/addon.cpp @@ -40,6 +40,7 @@ struct whisper_params { std::string language = "en"; std::string prompt; std::string model = "../../ggml-large.bin"; + bool use_gpu = true; std::vector fname_inp = {}; std::vector fname_out = {}; @@ -153,7 +154,9 @@ int run(whisper_params ¶ms, std::vector> &result) { // whisper init - struct whisper_context * ctx = whisper_init_from_file(params.model.c_str()); + struct whisper_context_params cparams; + cparams.use_gpu = params.use_gpu; + struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams); if (ctx == nullptr) { fprintf(stderr, "error: failed to initialize whisper context\n"); @@ -315,10 +318,12 @@ Napi::Value whisper(const Napi::CallbackInfo& info) { std::string language = whisper_params.Get("language").As(); std::string model = whisper_params.Get("model").As(); std::string input = whisper_params.Get("fname_inp").As(); + std::string input = whisper_params.Get("use_gpu").As(); params.language = language; params.model = model; params.fname_inp.emplace_back(input); + params.use_gpu = use_gpu; Napi::Function callback = info[1].As(); Worker* worker = new Worker(callback, params); diff --git a/examples/addon.node/index.js b/examples/addon.node/index.js index d511cdc2b67..3c6429375ab 100644 --- a/examples/addon.node/index.js +++ b/examples/addon.node/index.js @@ -11,6 +11,7 @@ const whisperParams = { language: "en", model: path.join(__dirname, "../../models/ggml-base.en.bin"), fname_inp: "../../samples/jfk.wav", + use_gpu: true, }; const arguments = process.argv.slice(2); diff --git a/examples/bench.wasm/emscripten.cpp b/examples/bench.wasm/emscripten.cpp index 09e9d55d972..3624bbc48b1 100644 --- a/examples/bench.wasm/emscripten.cpp +++ b/examples/bench.wasm/emscripten.cpp @@ -57,7 +57,7 @@ EMSCRIPTEN_BINDINGS(bench) { emscripten::function("init", emscripten::optional_override([](const std::string & path_model) { for (size_t i = 0; i < g_contexts.size(); ++i) { if (g_contexts[i] == nullptr) { - g_contexts[i] = whisper_init_from_file(path_model.c_str()); + g_contexts[i] = whisper_init_from_file_with_params(path_model.c_str(), whisper_context_default_params()); if (g_contexts[i] != nullptr) { if (g_worker.joinable()) { g_worker.join(); diff --git a/examples/command.wasm/emscripten.cpp b/examples/command.wasm/emscripten.cpp index e739656dc6d..528ff6ab553 100644 --- a/examples/command.wasm/emscripten.cpp +++ b/examples/command.wasm/emscripten.cpp @@ -243,7 +243,7 @@ EMSCRIPTEN_BINDINGS(command) { emscripten::function("init", emscripten::optional_override([](const std::string & path_model) { for (size_t i = 0; i < g_contexts.size(); ++i) { if (g_contexts[i] == nullptr) { - g_contexts[i] = whisper_init_from_file(path_model.c_str()); + g_contexts[i] = whisper_init_from_file_with_params(path_model.c_str(), whisper_context_default_params()); if (g_contexts[i] != nullptr) { g_running = true; if (g_worker.joinable()) { diff --git a/examples/command/command.cpp b/examples/command/command.cpp index d39af7309a2..2bf1df0da56 100644 --- a/examples/command/command.cpp +++ b/examples/command/command.cpp @@ -41,6 +41,7 @@ struct 
whisper_params { std::string language = "en"; std::string model = "models/ggml-base.en.bin"; + bool use_gpu = true; std::string fname_out; std::string commands; std::string prompt; @@ -73,6 +74,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; } else if (arg == "-cmd" || arg == "--commands") { params.commands = argv[++i]; } else if (arg == "-p" || arg == "--prompt") { params.prompt = argv[++i]; } + else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); whisper_print_usage(argc, argv, params); @@ -610,7 +612,9 @@ int main(int argc, char ** argv) { // whisper init - struct whisper_context * ctx = whisper_init_from_file(params.model.c_str()); + struct whisper_context_params cparams; + cparams.use_gpu = params.use_gpu; + struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams); // print some info about the processing { diff --git a/examples/lsp/lsp.cpp b/examples/lsp/lsp.cpp index b8001b95702..d57cee5ecfd 100644 --- a/examples/lsp/lsp.cpp +++ b/examples/lsp/lsp.cpp @@ -33,6 +33,7 @@ struct whisper_params { std::string language = "en"; std::string model = "models/ggml-base.en.bin"; + bool use_gpu = true; }; struct command { std::vector tokens; @@ -74,6 +75,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; } else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; } else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; } + else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); whisper_print_usage(argc, argv, params); @@ -432,7 +434,9 @@ int main(int argc, char ** argv) { } // whisper init - struct whisper_context * ctx = whisper_init_from_file(params.model.c_str()); + struct whisper_context_params cparams; + cparams.use_gpu = params.use_gpu; + struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams); // init audio audio_async audio(30*1000); diff --git a/examples/stream.wasm/emscripten.cpp b/examples/stream.wasm/emscripten.cpp index 144a14d268f..71acffba296 100644 --- a/examples/stream.wasm/emscripten.cpp +++ b/examples/stream.wasm/emscripten.cpp @@ -132,7 +132,7 @@ EMSCRIPTEN_BINDINGS(stream) { emscripten::function("init", emscripten::optional_override([](const std::string & path_model) { for (size_t i = 0; i < g_contexts.size(); ++i) { if (g_contexts[i] == nullptr) { - g_contexts[i] = whisper_init_from_file(path_model.c_str()); + g_contexts[i] = whisper_init_from_file_with_params(path_model.c_str(), whisper_context_default_params()); if (g_contexts[i] != nullptr) { g_running = true; if (g_worker.joinable()) { diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp index c8a452d1267..4320cefd6de 100644 --- a/examples/stream/stream.cpp +++ b/examples/stream/stream.cpp @@ -51,6 +51,7 @@ struct whisper_params { std::string language = "en"; std::string model = "models/ggml-base.en.bin"; + bool use_gpu = true; std::string fname_out; bool save_audio = false; // save audio to wav file }; @@ -84,6 +85,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; } else if (arg == "-tdrz" || arg == 
"--tinydiarize") { params.tinydiarize = true; } else if (arg == "-sa" || arg == "--save-audio") { params.save_audio = true; } + else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); @@ -163,7 +165,9 @@ int main(int argc, char ** argv) { exit(0); } - struct whisper_context * ctx = whisper_init_from_file(params.model.c_str()); + struct whisper_context_params cparams; + cparams.use_gpu = params.use_gpu; + struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams); std::vector pcmf32 (n_samples_30s, 0.0f); std::vector pcmf32_old; diff --git a/examples/talk-llama/talk-llama.cpp b/examples/talk-llama/talk-llama.cpp index 68a29bda2fe..304cabae85b 100644 --- a/examples/talk-llama/talk-llama.cpp +++ b/examples/talk-llama/talk-llama.cpp @@ -61,6 +61,7 @@ struct whisper_params { std::string language = "en"; std::string model_wsp = "models/ggml-base.en.bin"; std::string model_llama = "models/ggml-llama-7B.bin"; + bool use_gpu = true; std::string speak = "./examples/talk-llama/speak"; std::string prompt = ""; std::string fname_out; @@ -103,6 +104,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { } } else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; } + else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); whisper_print_usage(argc, argv, params); @@ -245,7 +247,9 @@ int main(int argc, char ** argv) { // whisper init - struct whisper_context * ctx_wsp = whisper_init_from_file(params.model_wsp.c_str()); + struct whisper_context_params cparams; + cparams.use_gpu = params.use_gpu; + struct whisper_context * ctx_wsp = whisper_init_from_file_with_params(params.model_wsp.c_str(), cparams); // llama init @@ -257,6 +261,9 @@ int main(int argc, char ** argv) { lparams.n_ctx = 2048; lparams.seed = 1; lparams.f16_kv = true; + if (!params.use_gpu) { + lparams.n_gpu_layers = 0; + } struct llama_model * model_llama = llama_load_model_from_file(params.model_llama.c_str(), lparams); diff --git a/examples/talk.wasm/emscripten.cpp b/examples/talk.wasm/emscripten.cpp index 1ea970295ac..6d30b295832 100644 --- a/examples/talk.wasm/emscripten.cpp +++ b/examples/talk.wasm/emscripten.cpp @@ -271,7 +271,7 @@ EMSCRIPTEN_BINDINGS(talk) { emscripten::function("init", emscripten::optional_override([](const std::string & path_model) { for (size_t i = 0; i < g_contexts.size(); ++i) { if (g_contexts[i] == nullptr) { - g_contexts[i] = whisper_init_from_file(path_model.c_str()); + g_contexts[i] = whisper_init_from_file_with_params(path_model.c_str(), whisper_context_default_params()); if (g_contexts[i] != nullptr) { g_running = true; if (g_worker.joinable()) { diff --git a/examples/talk/talk.cpp b/examples/talk/talk.cpp index 346d9d483fe..4e9fca9c625 100644 --- a/examples/talk/talk.cpp +++ b/examples/talk/talk.cpp @@ -36,6 +36,7 @@ struct whisper_params { std::string language = "en"; std::string model_wsp = "models/ggml-base.en.bin"; std::string model_gpt = "models/ggml-gpt-2-117M.bin"; + bool use_gpu = true; std::string speak = "./examples/talk/speak"; std::string fname_out; }; @@ -67,6 +68,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { else if (arg == "-mg" || arg == "--model-gpt") { params.model_gpt = argv[++i]; } else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; } else if (arg == "-f" || arg == "--file") { 
params.fname_out = argv[++i]; } + else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); whisper_print_usage(argc, argv, params); @@ -181,8 +183,9 @@ int main(int argc, char ** argv) { } // whisper init - - struct whisper_context * ctx_wsp = whisper_init_from_file(params.model_wsp.c_str()); + struct whisper_context_params cparams; + cparams.use_gpu = params.use_gpu; + struct whisper_context * ctx_wsp = whisper_init_from_file_with_params(params.model_wsp.c_str(), cparams); // gpt init diff --git a/examples/whisper.wasm/emscripten.cpp b/examples/whisper.wasm/emscripten.cpp index db1ff789e5f..b84893dee73 100644 --- a/examples/whisper.wasm/emscripten.cpp +++ b/examples/whisper.wasm/emscripten.cpp @@ -24,7 +24,7 @@ EMSCRIPTEN_BINDINGS(whisper) { for (size_t i = 0; i < g_contexts.size(); ++i) { if (g_contexts[i] == nullptr) { - g_contexts[i] = whisper_init_from_file(path_model.c_str()); + g_contexts[i] = whisper_init_from_file_with_params(path_model.c_str(), whisper_context_default_params()); if (g_contexts[i] != nullptr) { return i + 1; } else { diff --git a/whisper.h b/whisper.h index 69fa8f9ff3e..e151e9cf27f 100644 --- a/whisper.h +++ b/whisper.h @@ -107,6 +107,8 @@ extern "C" { void (*close)(void * ctx); } whisper_model_loader; + WHISPER_API struct whisper_context_params whisper_context_default_params(void); + // Various functions for loading a ggml whisper model. // Allocate (almost) all memory needed for the model. // Return NULL on failure From a0aba3e5d9ed5761466dd0400fdb7b63d3ddedaa Mon Sep 17 00:00:00 2001 From: Jhen Date: Fri, 6 Oct 2023 09:33:30 +0800 Subject: [PATCH 05/15] whisper.objc : enable metal & disable on simulator --- .../whisper.objc/whisper.objc.xcodeproj/project.pbxproj | 4 ++-- examples/whisper.objc/whisper.objc/ViewController.m | 8 +++++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj b/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj index f34b9c5b8e7..9a26f65b297 100644 --- a/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj +++ b/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj @@ -17,8 +17,8 @@ 18627C8629052BE000BD2A04 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 18627C8529052BE000BD2A04 /* Assets.xcassets */; }; 18627C8929052BE000BD2A04 /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 18627C8729052BE000BD2A04 /* LaunchScreen.storyboard */; }; 18627C8C29052BE000BD2A04 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C8B29052BE000BD2A04 /* main.m */; }; - 18627C9429052C4900BD2A04 /* whisper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18627C9329052C4900BD2A04 /* whisper.cpp */; settings = {COMPILER_FLAGS = "-DWHISPER_USE_COREML"; }; }; - 18627C9629052C5800BD2A04 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 18627C9529052C5800BD2A04 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE"; }; }; + 18627C9429052C4900BD2A04 /* whisper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18627C9329052C4900BD2A04 /* whisper.cpp */; settings = {COMPILER_FLAGS = "-DWHISPER_USE_COREML -DWHISPER_COREML_ALLOW_FALLBACK -DGGML_USE_METAL"; }; }; + 18627C9629052C5800BD2A04 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 18627C9529052C5800BD2A04 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL"; }; }; 18627C9B29052CFF00BD2A04 /* ggml-base.en.bin in 
Resources */ = {isa = PBXBuildFile; fileRef = 18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */; }; 7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */; }; 7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342472A0C3FA20015A058 /* whisper-encoder.mm */; }; diff --git a/examples/whisper.objc/whisper.objc/ViewController.m b/examples/whisper.objc/whisper.objc/ViewController.m index 8a1e876c395..151b05d9c99 100644 --- a/examples/whisper.objc/whisper.objc/ViewController.m +++ b/examples/whisper.objc/whisper.objc/ViewController.m @@ -61,7 +61,13 @@ - (void)viewDidLoad { NSLog(@"Loading model from %@", modelPath); // create ggml context - stateInp.ctx = whisper_init_from_file([modelPath UTF8String]); + + struct whisper_context_params cparams = whisper_context_default_params(); +#if TARGET_OS_SIMULATOR + cparams.use_gpu = false; + NSLog(@"Running on simulator, using CPU"); +#endif + stateInp.ctx = whisper_init_from_file_with_params([modelPath UTF8String], cparams); // check if the model was loaded successfully if (stateInp.ctx == NULL) { From 41bc04492397e96d4d15f1346a9c458c600e2284 Mon Sep 17 00:00:00 2001 From: Jhen Date: Fri, 6 Oct 2023 10:34:09 +0800 Subject: [PATCH 06/15] whisper.swiftui, metal : enable metal & support load default.metallib --- .../whisper.cpp.swift/LibWhisper.swift | 7 +++- .../whisper.swiftui.xcodeproj/project.pbxproj | 14 ++++++-- ggml-metal.m | 34 ++++++++++++------- 3 files changed, 39 insertions(+), 16 deletions(-) diff --git a/examples/whisper.swiftui/whisper.cpp.swift/LibWhisper.swift b/examples/whisper.swiftui/whisper.cpp.swift/LibWhisper.swift index e9645b34f74..95e1aeefbc3 100644 --- a/examples/whisper.swiftui/whisper.cpp.swift/LibWhisper.swift +++ b/examples/whisper.swiftui/whisper.cpp.swift/LibWhisper.swift @@ -55,7 +55,12 @@ actor WhisperContext { } static func createContext(path: String) throws -> WhisperContext { - let context = whisper_init_from_file(path) + var params = whisper_context_default_params() +#if targetEnvironment(simulator) + params.use_gpu = false + print("Running on the simulator, using CPU") +#endif + let context = whisper_init_from_file_with_params(path, params) if let context { return WhisperContext(context: context) } else { diff --git a/examples/whisper.swiftui/whisper.swiftui.xcodeproj/project.pbxproj b/examples/whisper.swiftui/whisper.swiftui.xcodeproj/project.pbxproj index d2d0b05c586..5c1cbddeba4 100644 --- a/examples/whisper.swiftui/whisper.swiftui.xcodeproj/project.pbxproj +++ b/examples/whisper.swiftui/whisper.swiftui.xcodeproj/project.pbxproj @@ -16,11 +16,13 @@ 0AAC5D9D29539CCF003032C3 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5D9C29539CCF003032C3 /* ContentView.swift */; }; 0AAC5D9F29539CD0003032C3 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 0AAC5D9E29539CD0003032C3 /* Assets.xcassets */; }; 0AAC5DA329539CD0003032C3 /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 0AAC5DA229539CD0003032C3 /* Preview Assets.xcassets */; }; - 0AAC5DCB29539EB1003032C3 /* whisper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5DC729539EB0003032C3 /* whisper.cpp */; settings = {COMPILER_FLAGS = "-Wno-shorten-64-to-32"; }; }; - 0AAC5DCC29539EB1003032C3 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5DC929539EB0003032C3 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -Wno-shorten-64-to-32"; 
}; }; + 0AAC5DCB29539EB1003032C3 /* whisper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5DC729539EB0003032C3 /* whisper.cpp */; settings = {COMPILER_FLAGS = "-DGGML_USE_METAL -Wno-shorten-64-to-32"; }; }; + 0AAC5DCC29539EB1003032C3 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5DC929539EB0003032C3 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL -Wno-shorten-64-to-32"; }; }; 0AAC5DCE2953A05C003032C3 /* WhisperState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5DCD2953A05C003032C3 /* WhisperState.swift */; }; 0AAC5DD12953A394003032C3 /* LibWhisper.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5DD02953A394003032C3 /* LibWhisper.swift */; }; 18AED4812AB21F2B009D854F /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 18AED47F2AB21F2B009D854F /* ggml-alloc.c */; }; + 7FCB08262ACFA3A400AF3530 /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FCB08252ACFA3A400AF3530 /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-framework Foundation -framework Metal -framework MetalKit -fno-objc-arc"; }; }; + 7FCB08282ACFA48500AF3530 /* ggml-metal.metal in Sources */ = {isa = PBXBuildFile; fileRef = 7FCB08272ACFA48500AF3530 /* ggml-metal.metal */; }; /* End PBXBuildFile section */ /* Begin PBXFileReference section */ @@ -44,6 +46,9 @@ 0AAC5DD02953A394003032C3 /* LibWhisper.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibWhisper.swift; sourceTree = ""; }; 18AED47F2AB21F2B009D854F /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "ggml-alloc.c"; sourceTree = ""; }; 18AED4802AB21F2B009D854F /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "ggml-alloc.h"; sourceTree = ""; }; + 7FCB081E2ACFA04400AF3530 /* ggml-metal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "ggml-metal.h"; sourceTree = ""; }; + 7FCB08252ACFA3A400AF3530 /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "ggml-metal.m"; sourceTree = ""; }; + 7FCB08272ACFA48500AF3530 /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = "ggml-metal.metal"; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -127,6 +132,9 @@ 0AAC5DC529539E89003032C3 /* whisper.cpp */ = { isa = PBXGroup; children = ( + 7FCB08272ACFA48500AF3530 /* ggml-metal.metal */, + 7FCB081E2ACFA04400AF3530 /* ggml-metal.h */, + 7FCB08252ACFA3A400AF3530 /* ggml-metal.m */, 18AED47F2AB21F2B009D854F /* ggml-alloc.c */, 18AED4802AB21F2B009D854F /* ggml-alloc.h */, 0AAC5DC929539EB0003032C3 /* ggml.c */, @@ -243,10 +251,12 @@ 0AAC5D9B29539CCF003032C3 /* WhisperCppDemoApp.swift in Sources */, 0AAC5DCC29539EB1003032C3 /* ggml.c in Sources */, 0AAC5DCE2953A05C003032C3 /* WhisperState.swift in Sources */, + 7FCB08282ACFA48500AF3530 /* ggml-metal.metal in Sources */, 0AAC5DD12953A394003032C3 /* LibWhisper.swift in Sources */, 0AA7514C2953B569001EE061 /* RiffWaveUtils.swift in Sources */, 0AAC5DCB29539EB1003032C3 /* whisper.cpp in Sources */, 0AA7514E2953D958001EE061 /* Recorder.swift in Sources */, + 7FCB08262ACFA3A400AF3530 /* ggml-metal.m in Sources */, 18AED4812AB21F2B009D854F /* ggml-alloc.c in Sources */, ); runOnlyForDeploymentPostprocessing = 0; diff --git a/ggml-metal.m b/ggml-metal.m index 1139ee31146..8faf407938a 100644 --- 
a/ggml-metal.m +++ b/ggml-metal.m @@ -178,22 +178,30 @@ @implementation GGMLMetalClass //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"]; NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; - NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; - metal_printf("%s: loading '%s'\n", __func__, [path UTF8String]); - - NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error]; - if (error) { - metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); - return NULL; - } + NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"]; + if (libPath != nil) { + NSURL * libURL = [NSURL fileURLWithPath:libPath]; + metal_printf("%s: loading '%s'\n", __func__, [libPath UTF8String]); + ctx->library = [ctx->device newLibraryWithURL:libURL error:&error]; + } else { + metal_printf("%s: default.metallib not found, loading from source\n", __func__); + + NSString * sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; + metal_printf("%s: loading '%s'\n", __func__, [sourcePath UTF8String]); + NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error]; + if (error) { + metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); + return NULL; + } + MTLCompileOptions* options = nil; #ifdef GGML_QKK_64 - MTLCompileOptions* options = [MTLCompileOptions new]; - options.preprocessorMacros = @{ @"QK_K" : @(64) }; - ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error]; -#else - ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error]; + options = [MTLCompileOptions new]; + options.preprocessorMacros = @{ @"QK_K" : @(64) }; #endif + ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error]; + } + if (error) { metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; From 22ab80904f0ab5c8c2a8ae595ffe4e344b3278b1 Mon Sep 17 00:00:00 2001 From: Jhen Date: Fri, 6 Oct 2023 10:40:02 +0800 Subject: [PATCH 07/15] whisper.android : use new API --- examples/whisper.android/app/src/main/jni/whisper/jni.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/whisper.android/app/src/main/jni/whisper/jni.c b/examples/whisper.android/app/src/main/jni/whisper/jni.c index c437d0990f1..a8b3ded4a32 100644 --- a/examples/whisper.android/app/src/main/jni/whisper/jni.c +++ b/examples/whisper.android/app/src/main/jni/whisper/jni.c @@ -127,7 +127,7 @@ static struct whisper_context *whisper_init_from_asset( .close = &asset_close }; - return whisper_init(&loader); + return whisper_init_with_params(&loader, whisper_context_default_params()); } JNIEXPORT jlong JNICALL @@ -147,7 +147,7 @@ Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_initContext( UNUSED(thiz); struct whisper_context *context = NULL; const char *model_path_chars = (*env)->GetStringUTFChars(env, model_path_str, NULL); - context = whisper_init_from_file(model_path_chars); + context = whisper_init_from_file_with_params(model_path_chars, whisper_context_default_params()); (*env)->ReleaseStringUTFChars(env, model_path_str, model_path_chars); return (jlong) context; } From e0ebea2dfa4f263c8af5b366d9cdd4f714423fcd Mon Sep 17 00:00:00 2001 From: Jhen Date: Fri, 6 Oct 2023 10:44:31 +0800 Subject: [PATCH 08/15] bindings : use new API --- bindings/go/whisper.go | 2 +- 
.../ggerganov/whispercpp/WhisperCpp.java | 36 +++++++++++-------- .../whispercpp/WhisperCppJnaLibrary.java | 21 ++++++++++- bindings/javascript/emscripten.cpp | 2 +- bindings/ruby/ext/ruby_whisper.cpp | 2 +- whisper.cpp | 6 ++++ whisper.h | 1 + 7 files changed, 51 insertions(+), 19 deletions(-) diff --git a/bindings/go/whisper.go b/bindings/go/whisper.go index e605d8e0c85..b77e103c4e3 100644 --- a/bindings/go/whisper.go +++ b/bindings/go/whisper.go @@ -103,7 +103,7 @@ var ( func Whisper_init(path string) *Context { cPath := C.CString(path) defer C.free(unsafe.Pointer(cPath)) - if ctx := C.whisper_init_from_file(cPath); ctx != nil { + if ctx := C.whisper_init_from_file_with_params(cPath, C.whisper_context_default_params()); ctx != nil { return (*Context)(ctx) } else { return nil diff --git a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCpp.java b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCpp.java index 9bc1a8601a9..50c24793c2b 100644 --- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCpp.java +++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCpp.java @@ -15,8 +15,9 @@ public class WhisperCpp implements AutoCloseable { private WhisperCppJnaLibrary lib = WhisperCppJnaLibrary.instance; private Pointer ctx = null; - private Pointer greedyPointer = null; - private Pointer beamPointer = null; + private Pointer paramPointer = null; + private Pointer greedyParamsPointer = null; + private Pointer beamParamsPointer = null; public File modelDir() { String modelDirPath = System.getenv("XDG_CACHE_HOME"); @@ -43,7 +44,8 @@ public void initContext(String modelPath) throws FileNotFoundException { modelPath = new File(modelDir(), modelPath).getAbsolutePath(); } - ctx = lib.whisper_init_from_file(modelPath); + paramPointer = lib.whisper_context_default_params_by_ref(); + ctx = lib.whisper_init_from_file_with_params(modelPath, paramPointer); if (ctx == null) { throw new FileNotFoundException(modelPath); @@ -63,15 +65,15 @@ public WhisperFullParams getFullDefaultParams(WhisperSamplingStrategy strategy) // whisper_full_default_params_by_ref allocates memory which we need to delete, so only create max 1 pointer for each strategy. 
if (strategy == WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY) { - if (greedyPointer == null) { - greedyPointer = lib.whisper_full_default_params_by_ref(strategy.ordinal()); + if (greedyParamsPointer == null) { + greedyParamsPointer = lib.whisper_full_default_params_by_ref(strategy.ordinal()); } - pointer = greedyPointer; + pointer = greedyParamsPointer; } else { - if (beamPointer == null) { - beamPointer = lib.whisper_full_default_params_by_ref(strategy.ordinal()); + if (beamParamsPointer == null) { + beamParamsPointer = lib.whisper_full_default_params_by_ref(strategy.ordinal()); } - pointer = beamPointer; + pointer = beamParamsPointer; } WhisperFullParams params = new WhisperFullParams(pointer); @@ -93,13 +95,17 @@ private void freeContext() { } private void freeParams() { - if (greedyPointer != null) { - Native.free(Pointer.nativeValue(greedyPointer)); - greedyPointer = null; + if (paramPointer != null) { + Native.free(Pointer.nativeValue(paramPointer)); + paramPointer = null; } - if (beamPointer != null) { - Native.free(Pointer.nativeValue(beamPointer)); - beamPointer = null; + if (greedyParamsPointer != null) { + Native.free(Pointer.nativeValue(greedyParamsPointer)); + greedyParamsPointer = null; + } + if (beamParamsPointer != null) { + Native.free(Pointer.nativeValue(beamParamsPointer)); + beamParamsPointer = null; } } diff --git a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java index ad9faa0be70..7d88abb4978 100644 --- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java +++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java @@ -13,12 +13,31 @@ public interface WhisperCppJnaLibrary extends Library { String whisper_print_system_info(); /** - * Allocate (almost) all memory needed for the model by loading from a file. + * DEPRECATED. Allocate (almost) all memory needed for the model by loading from a file. * * @param path_model Path to the model file * @return Whisper context on success, null on failure */ Pointer whisper_init_from_file(String path_model); + + /** + * Provides default params which can be used with `whisper_init_from_file_with_params()` etc. + * Because this function allocates memory for the params, the caller must call either: + * - call `whisper_free_context_params()` + * - `Native.free(Pointer.nativeValue(pointer));` + */ + Pointer whisper_context_default_params_by_ref(); + + void whisper_free_context_params(Pointer params); + + /** + * Allocate (almost) all memory needed for the model by loading from a file. + * + * @param path_model Path to the model file + * @param params Pointer to whisper_context_params + * @return Whisper context on success, null on failure + */ + Pointer whisper_init_from_file_with_params(String path_model, Pointer params); /** * Allocate (almost) all memory needed for the model by loading from a buffer. 
diff --git a/bindings/javascript/emscripten.cpp b/bindings/javascript/emscripten.cpp index 789ad8b51f8..b442c1fcdbe 100644 --- a/bindings/javascript/emscripten.cpp +++ b/bindings/javascript/emscripten.cpp @@ -20,7 +20,7 @@ struct whisper_context * g_context; EMSCRIPTEN_BINDINGS(whisper) { emscripten::function("init", emscripten::optional_override([](const std::string & path_model) { if (g_context == nullptr) { - g_context = whisper_init_from_file(path_model.c_str()); + g_context = whisper_init_from_file_with_params(path_model.c_str(), whisper_context_default_params()); if (g_context != nullptr) { return true; } else { diff --git a/bindings/ruby/ext/ruby_whisper.cpp b/bindings/ruby/ext/ruby_whisper.cpp index 82027d42fa5..86af9391e2c 100644 --- a/bindings/ruby/ext/ruby_whisper.cpp +++ b/bindings/ruby/ext/ruby_whisper.cpp @@ -87,7 +87,7 @@ static VALUE ruby_whisper_initialize(int argc, VALUE *argv, VALUE self) { if (!rb_respond_to(whisper_model_file_path, rb_intern("to_s"))) { rb_raise(rb_eRuntimeError, "Expected file path to model to initialize Whisper::Context"); } - rw->context = whisper_init_from_file(StringValueCStr(whisper_model_file_path)); + rw->context = whisper_init_from_file_with_params(StringValueCStr(whisper_model_file_path), whisper_context_default_params()); if (rw->context == nullptr) { rb_raise(rb_eRuntimeError, "error: failed to initialize whisper context"); } diff --git a/whisper.cpp b/whisper.cpp index 4c7b244be7e..7af884ea70a 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -3267,6 +3267,12 @@ void whisper_free(struct whisper_context * ctx) { } } +void whisper_free_context_params(struct whisper_context_params * params) { + if (params) { + delete params; + } +} + void whisper_free_params(struct whisper_full_params * params) { if (params) { delete params; diff --git a/whisper.h b/whisper.h index e151e9cf27f..945b7db71c8 100644 --- a/whisper.h +++ b/whisper.h @@ -170,6 +170,7 @@ extern "C" { WHISPER_API void whisper_free (struct whisper_context * ctx); WHISPER_API void whisper_free_state(struct whisper_state * state); WHISPER_API void whisper_free_params(struct whisper_full_params * params); + WHISPER_API void whisper_free_context_params(struct whisper_context_params * params); // Convert RAW PCM audio to log mel spectrogram. // The resulting spectrogram is stored inside the default state of the provided whisper context. 
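A minimal usage sketch of the params-based C API that patches 02-08 converge on (the model path is illustrative and error handling is trimmed):

#include "whisper.h"

int main(void) {
    // start from the library defaults, then opt out of the GPU when needed
    struct whisper_context_params cparams = whisper_context_default_params();
    cparams.use_gpu = false;

    struct whisper_context * ctx =
        whisper_init_from_file_with_params("models/ggml-base.en.bin", cparams);
    if (ctx == NULL) {
        return 1;
    }

    // ... run whisper_full() / whisper_full_with_state() as before ...

    whisper_free(ctx);
    return 0;
}

Existing callers of whisper_init_from_file() and friends still compile, but now trigger the deprecation attribute and internally fall back to whisper_context_default_params().
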
From 851b2ce017074c8cd0082e2d024419aad9aefaff Mon Sep 17 00:00:00 2001 From: Jhen Date: Fri, 6 Oct 2023 11:00:09 +0800 Subject: [PATCH 09/15] addon.node : fix build & test --- examples/addon.node/__test__/whisper.spec.js | 1 + examples/addon.node/addon.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/addon.node/__test__/whisper.spec.js b/examples/addon.node/__test__/whisper.spec.js index 845af2f0790..d102fe7624e 100644 --- a/examples/addon.node/__test__/whisper.spec.js +++ b/examples/addon.node/__test__/whisper.spec.js @@ -11,6 +11,7 @@ const whisperParamsMock = { language: "en", model: path.join(__dirname, "../../../models/ggml-base.en.bin"), fname_inp: path.join(__dirname, "../../../samples/jfk.wav"), + use_gpu: true, }; describe("Run whisper.node", () => { diff --git a/examples/addon.node/addon.cpp b/examples/addon.node/addon.cpp index 414a35f5573..badd5587b54 100644 --- a/examples/addon.node/addon.cpp +++ b/examples/addon.node/addon.cpp @@ -318,7 +318,7 @@ Napi::Value whisper(const Napi::CallbackInfo& info) { std::string language = whisper_params.Get("language").As<Napi::String>(); std::string model = whisper_params.Get("model").As<Napi::String>(); std::string input = whisper_params.Get("fname_inp").As<Napi::String>(); - std::string input = whisper_params.Get("use_gpu").As<Napi::String>(); + bool use_gpu = whisper_params.Get("use_gpu").As<Napi::Boolean>(); params.language = language; params.model = model; From 0b0e368519ef5bfb0fb69fcf12d1de1b810c91fd Mon Sep 17 00:00:00 2001 From: Jhen Date: Fri, 6 Oct 2023 12:37:54 +0800 Subject: [PATCH 10/15] bindings : update java binding --- .../ggerganov/whispercpp/WhisperCpp.java | 37 ++++++++++++++++--- .../whispercpp/WhisperCppJnaLibrary.java | 3 +- .../params/WhisperContextParams.java | 31 ++++++++++++++++ 3 files changed, 64 insertions(+), 7 deletions(-) create mode 100644 bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperContextParams.java diff --git a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCpp.java b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCpp.java index 50c24793c2b..4a25040377c 100644 --- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCpp.java +++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCpp.java @@ -2,6 +2,7 @@ import com.sun.jna.Native; import com.sun.jna.Pointer; +import io.github.ggerganov.whispercpp.params.WhisperContextParams; import io.github.ggerganov.whispercpp.params.WhisperFullParams; import io.github.ggerganov.whispercpp.params.WhisperSamplingStrategy; @@ -15,7 +16,7 @@ public class WhisperCpp implements AutoCloseable { private WhisperCppJnaLibrary lib = WhisperCppJnaLibrary.instance; private Pointer ctx = null; - private Pointer paramPointer = null; + private Pointer paramsPointer = null; private Pointer greedyParamsPointer = null; private Pointer beamParamsPointer = null; @@ -32,6 +33,18 @@ public File modelDir() { * @param modelPath - absolute path, or just the name (eg: "base", "base-en" or "base.en") */ public void initContext(String modelPath) throws FileNotFoundException { + initContextImpl(modelPath, getContextDefaultParams()); + } + + /** + * @param modelPath - absolute path, or just the name (eg: "base", "base-en" or "base.en") + * @param params - params to use when initialising the context + */ + public void initContext(String modelPath, WhisperContextParams params) throws FileNotFoundException { + initContextImpl(modelPath, params); + } + + private void initContextImpl(String modelPath, WhisperContextParams params) throws
FileNotFoundException { if (ctx != null) { lib.whisper_free(ctx); } @@ -44,14 +57,26 @@ public void initContext(String modelPath) throws FileNotFoundException { modelPath = new File(modelDir(), modelPath).getAbsolutePath(); } - paramPointer = lib.whisper_context_default_params_by_ref(); - ctx = lib.whisper_init_from_file_with_params(modelPath, paramPointer); + ctx = lib.whisper_init_from_file_with_params(modelPath, params); if (ctx == null) { throw new FileNotFoundException(modelPath); } } + /** + * Provides default params which can be used with `whisper_init_from_file_with_params()` etc. + * Because this function allocates memory for the params, the caller must later free the returned pointer with either: + * - `whisper_free_context_params()` + * - `Native.free(Pointer.nativeValue(pointer));` + */ + public WhisperContextParams getContextDefaultParams() { + paramsPointer = lib.whisper_context_default_params_by_ref(); + WhisperContextParams params = new WhisperContextParams(paramsPointer); + params.read(); + return params; + } + /** * Provides default params which can be used with `whisper_full()` etc. * Because this function allocates memory for the params, the caller must call either: @@ -95,9 +120,9 @@ private void freeContext() { } private void freeParams() { - if (paramPointer != null) { - Native.free(Pointer.nativeValue(paramPointer)); - paramPointer = null; + if (paramsPointer != null) { + Native.free(Pointer.nativeValue(paramsPointer)); + paramsPointer = null; } if (greedyParamsPointer != null) { Native.free(Pointer.nativeValue(greedyParamsPointer)); diff --git a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java index 7d88abb4978..56a37380136 100644 --- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java +++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java @@ -5,6 +5,7 @@ import com.sun.jna.Pointer; import io.github.ggerganov.whispercpp.model.WhisperModelLoader; import io.github.ggerganov.whispercpp.model.WhisperTokenData; +import io.github.ggerganov.whispercpp.params.WhisperContextParams; import io.github.ggerganov.whispercpp.params.WhisperFullParams; public interface WhisperCppJnaLibrary extends Library { @@ -37,7 +38,7 @@ public interface WhisperCppJnaLibrary extends Library { * @param params Pointer to whisper_context_params * @return Whisper context on success, null on failure */ - Pointer whisper_init_from_file_with_params(String path_model, Pointer params); + Pointer whisper_init_from_file_with_params(String path_model, WhisperContextParams params); /** * Allocate (almost) all memory needed for the model by loading from a buffer. diff --git a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperContextParams.java b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperContextParams.java new file mode 100644 index 00000000000..cf98d2c3757 --- /dev/null +++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperContextParams.java @@ -0,0 +1,31 @@ +package io.github.ggerganov.whispercpp.params; + +import com.sun.jna.*; + +import java.util.Arrays; +import java.util.List; + +/** + * Parameters for the whisper_init_from_file_with_params() function.
+ * If you change the order or add new parameters, make sure to update the default values in whisper.cpp: + * whisper_context_default_params() + */ +public class WhisperContextParams extends Structure { + + public WhisperContextParams(Pointer p) { + super(p); + } + + /** Use GPU for inference (default = true) */ + public CBool use_gpu; + + /** Set whether to use GPU for inference (default = true) */ + public void useGpu(boolean enable) { + use_gpu = enable ? CBool.TRUE : CBool.FALSE; + } + + @Override + protected List<String> getFieldOrder() { + return Arrays.asList("use_gpu"); + } +} From 883e04fa43a0514950fec588c4a50adda3c93957 Mon Sep 17 00:00:00 2001 From: Jhen Date: Fri, 6 Oct 2023 13:50:53 +0800 Subject: [PATCH 11/15] bindings : add missing whisper_context_default_params_by_ref WHISPER_API for java --- .../github/ggerganov/whispercpp/WhisperContext.java | 4 +++- whisper.cpp | 12 ++++++++++-- whisper.h | 6 +++--- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperContext.java b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperContext.java index 22d4ce87fe6..0498eb4df81 100644 --- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperContext.java +++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperContext.java @@ -4,6 +4,7 @@ import com.sun.jna.ptr.PointerByReference; import io.github.ggerganov.whispercpp.ggml.GgmlType; import io.github.ggerganov.whispercpp.WhisperModel; +import io.github.ggerganov.whispercpp.params.WhisperContextParams; import java.util.List; @@ -23,8 +24,9 @@ public class WhisperContext extends Structure { public PointerByReference vocab; public PointerByReference state; - /** populated by whisper_init_from_file() */ + /** populated by whisper_init_from_file_with_params() */ String path_model; + WhisperContextParams params; // public static class ByReference extends WhisperContext implements Structure.ByReference { // } diff --git a/whisper.cpp b/whisper.cpp index 7af884ea70a..7a19356707f 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -735,7 +735,7 @@ struct whisper_state { int lang_id = 0; // english by default - std::string path_model; // populated by whisper_init_from_file() + std::string path_model; // populated by whisper_init_from_file_with_params() #ifdef WHISPER_USE_COREML whisper_coreml_context * ctx_coreml = nullptr; #endif @@ -769,7 +769,7 @@ struct whisper_context { whisper_vocab vocab; whisper_state * state = nullptr; - std::string path_model; // populated by whisper_init_from_file() + std::string path_model; // populated by whisper_init_from_file_with_params() whisper_context_params params; }; @@ -3741,6 +3741,14 @@ const char * whisper_print_system_info(void) { //////////////////////////////////////////////////////////////////////////// +struct whisper_context_params * whisper_context_default_params_by_ref() { + struct whisper_context_params params = whisper_context_default_params(); + + struct whisper_context_params* result = new whisper_context_params(); + *result = params; + return result; +} + struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sampling_strategy strategy) { struct whisper_full_params params = whisper_full_default_params(strategy); diff --git a/whisper.h b/whisper.h index 945b7db71c8..73862b6f818 100644 --- a/whisper.h +++ b/whisper.h @@ -107,8 +107,6 @@ extern "C" { void (*close)(void * ctx); } whisper_model_loader; - WHISPER_API struct whisper_context_params
whisper_context_default_params(void); - // Various functions for loading a ggml whisper model. // Allocate (almost) all memory needed for the model. // Return NULL on failure @@ -481,7 +479,9 @@ extern "C" { void * logits_filter_callback_user_data; }; - // NOTE: this function allocates memory, and it is the responsibility of the caller to free the pointer - see whisper_free_params() + // NOTE: this function allocates memory, and it is the responsibility of the caller to free the pointer - see whisper_free_context_params & whisper_free_params() + WHISPER_API struct whisper_context_params * whisper_context_default_params_by_ref(); + WHISPER_API struct whisper_context_params whisper_context_default_params(void); WHISPER_API struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sampling_strategy strategy); WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy); From 8e49f2a7ea86d775ed9f92d0f7a2db96b9a2c71d Mon Sep 17 00:00:00 2001 From: Jhen Date: Fri, 6 Oct 2023 14:24:45 +0800 Subject: [PATCH 12/15] metal : use SWIFTPM_MODULE_BUNDLE for GGML_SWIFT and reuse library load --- ggml-metal.m | 30 ++++++------------------------ 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 8faf407938a..92f73b0edb6 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -150,34 +150,17 @@ @implementation GGMLMetalClass ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT); -#ifdef GGML_SWIFT - // load the default.metallib file - { - NSError * error = nil; - - NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; - NSString * llamaBundlePath = [bundle pathForResource:@"llama_llama" ofType:@"bundle"]; - NSBundle * llamaBundle = [NSBundle bundleWithPath:llamaBundlePath]; - NSString * libPath = [llamaBundle pathForResource:@"default" ofType:@"metallib"]; - NSURL * libURL = [NSURL fileURLWithPath:libPath]; - - // Load the metallib file into a Metal library - ctx->library = [ctx->device newLibraryWithURL:libURL error:&error]; + UNUSED(msl_library_source); - if (error) { - metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); - return NULL; - } - } + NSBundle * bundle = nil; +#ifdef GGML_SWIFT + bundle = SWIFTPM_MODULE_BUNDLE; #else - UNUSED(msl_library_source); + bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; +#endif - // read the source from "ggml-metal.metal" into a string and use newLibraryWithSource { NSError * error = nil; - - //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"]; - NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"]; if (libPath != nil) { NSURL * libURL = [NSURL fileURLWithPath:libPath]; @@ -207,7 +190,6 @@ @implementation GGMLMetalClass return NULL; } } -#endif // load kernels { From bda4b59dc21db25df5b41a34de771d308fc6e2f6 Mon Sep 17 00:00:00 2001 From: jhen Date: Sat, 7 Oct 2023 11:11:03 +0800 Subject: [PATCH 13/15] metal : move bundle var into block --- ggml-metal.m | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 92f73b0edb6..93b8748d5ff 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -152,14 +152,14 @@ @implementation GGMLMetalClass UNUSED(msl_library_source); - NSBundle * bundle = nil; + // load library + { + NSBundle * bundle = nil; #ifdef GGML_SWIFT - bundle = SWIFTPM_MODULE_BUNDLE; + bundle = 
SWIFTPM_MODULE_BUNDLE; #else - bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; + bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; #endif - - { NSError * error = nil; NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"]; if (libPath != nil) { From 52c3dd64e11263f1e9f7c1735f20de3cf64d2fa8 Mon Sep 17 00:00:00 2001 From: jhen Date: Sat, 7 Oct 2023 12:55:35 +0800 Subject: [PATCH 14/15] metal : use SWIFT_PACKAGE instead of GGML_SWIFT --- ggml-metal.m | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 93b8748d5ff..529dca969ad 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -155,7 +155,7 @@ @implementation GGMLMetalClass // load library { NSBundle * bundle = nil; -#ifdef GGML_SWIFT +#ifdef SWIFT_PACKAGE bundle = SWIFTPM_MODULE_BUNDLE; #else bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; @@ -184,7 +184,7 @@ @implementation GGMLMetalClass #endif ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error]; } - + if (error) { metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; From d7b6b3514ac01f943fa27cd05800a989ef5f4bac Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 5 Nov 2023 20:28:15 +0200 Subject: [PATCH 15/15] style : minor updates --- examples/addon.node/addon.cpp | 2 +- examples/bench/bench.cpp | 11 ++-- examples/command/command.cpp | 6 ++- examples/lsp/lsp.cpp | 5 +- examples/main/main.cpp | 5 +- examples/stream/stream.cpp | 50 +++++++++--------- examples/talk-llama/talk-llama.cpp | 85 ++++++++++++++++-------------- examples/talk/talk.cpp | 6 ++- whisper.h | 7 +-- 9 files changed, 96 insertions(+), 81 deletions(-) diff --git a/examples/addon.node/addon.cpp b/examples/addon.node/addon.cpp index badd5587b54..30acbc6afd8 100644 --- a/examples/addon.node/addon.cpp +++ b/examples/addon.node/addon.cpp @@ -36,11 +36,11 @@ struct whisper_params { bool print_colors = false; bool print_progress = false; bool no_timestamps = false; + bool use_gpu = true; std::string language = "en"; std::string prompt; std::string model = "../../ggml-large.bin"; - bool use_gpu = true; std::vector<std::string> fname_inp = {}; std::vector<std::string> fname_out = {}; diff --git a/examples/bench/bench.cpp b/examples/bench/bench.cpp index 86b8afdd522..9f50b3b6224 100644 --- a/examples/bench/bench.cpp +++ b/examples/bench/bench.cpp @@ -11,6 +11,7 @@ struct whisper_params { int32_t what = 0; // what to benchmark: 0 - whisper encoder, 1 - memcpy, 2 - ggml_mul_mat std::string model = "models/ggml-base.en.bin"; + bool use_gpu = true; }; @@ -24,10 +25,10 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { whisper_print_usage(argc, argv, params); exit(0); } - else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); } - else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; } - else if (arg == "-w" || arg == "--what") { params.what = atoi(argv[++i]); } - else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } + else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); } + else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; } + else if (arg == "-w" || arg == "--what") { params.what = atoi(argv[++i]); } + else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); whisper_print_usage(argc, argv, params); @@ -47,6 +48,7 @@ void whisper_print_usage(int /*argc*/, char **
argv, const whisper_params & para fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads); fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str()); fprintf(stderr, " -w N, --what N [%-7d] what to benchmark:\n", params.what); + fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true"); fprintf(stderr, " %-7s 0 - whisper\n", ""); fprintf(stderr, " %-7s 1 - memcpy\n", ""); fprintf(stderr, " %-7s 2 - ggml_mul_mat\n", ""); @@ -58,6 +60,7 @@ int whisper_bench_full(const whisper_params & params) { struct whisper_context_params cparams; cparams.use_gpu = params.use_gpu; + struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams); { diff --git a/examples/command/command.cpp b/examples/command/command.cpp index 2bf1df0da56..7045f5ff81e 100644 --- a/examples/command/command.cpp +++ b/examples/command/command.cpp @@ -38,10 +38,10 @@ struct whisper_params { bool print_special = false; bool print_energy = false; bool no_timestamps = true; + bool use_gpu = true; std::string language = "en"; std::string model = "models/ggml-base.en.bin"; - bool use_gpu = true; std::string fname_out; std::string commands; std::string prompt; @@ -69,12 +69,12 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { else if (arg == "-tr" || arg == "--translate") { params.translate = true; } else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; } else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; } + else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; } else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; } else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; } else if (arg == "-cmd" || arg == "--commands") { params.commands = argv[++i]; } else if (arg == "-p" || arg == "--prompt") { params.prompt = argv[++i]; } - else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); whisper_print_usage(argc, argv, params); @@ -103,6 +103,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false"); fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false"); fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false"); + fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? 
"false" : "true"); fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str()); fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str()); fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str()); @@ -614,6 +615,7 @@ int main(int argc, char ** argv) { struct whisper_context_params cparams; cparams.use_gpu = params.use_gpu; + struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams); // print some info about the processing diff --git a/examples/lsp/lsp.cpp b/examples/lsp/lsp.cpp index d57cee5ecfd..8d8b6ffa238 100644 --- a/examples/lsp/lsp.cpp +++ b/examples/lsp/lsp.cpp @@ -30,10 +30,10 @@ struct whisper_params { bool translate = false; bool print_special = false; bool print_energy = false; + bool use_gpu = true; std::string language = "en"; std::string model = "models/ggml-base.en.bin"; - bool use_gpu = true; }; struct command { std::vector tokens; @@ -73,9 +73,9 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { else if (arg == "-tr" || arg == "--translate") { params.translate = true; } else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; } else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; } + else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; } else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; } - else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); whisper_print_usage(argc, argv, params); @@ -104,6 +104,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false"); fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false"); fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false"); + fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? 
"false" : "true"); fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str()); fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str()); fprintf(stderr, "\n"); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 0878db82e30..e43dfe3f948 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -90,6 +90,7 @@ struct whisper_params { bool print_progress = false; bool no_timestamps = false; bool log_score = false; + bool use_gpu = true; std::string language = "en"; std::string prompt; @@ -103,8 +104,6 @@ struct whisper_params { std::vector fname_inp = {}; std::vector fname_out = {}; - - bool use_gpu = true; }; void whisper_print_usage(int argc, char ** argv, const whisper_params & params); @@ -224,6 +223,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", ""); fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str()); fprintf(stderr, " -ls, --log-score [%-7s] log best decoder scores of tokens\n", params.log_score?"true":"false"); + fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true"); fprintf(stderr, "\n"); } @@ -882,6 +882,7 @@ int main(int argc, char ** argv) { struct whisper_context_params cparams; cparams.use_gpu = params.use_gpu; + struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams); if (ctx == nullptr) { diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp index 4320cefd6de..47f1780b4ea 100644 --- a/examples/stream/stream.cpp +++ b/examples/stream/stream.cpp @@ -48,12 +48,12 @@ struct whisper_params { bool no_context = true; bool no_timestamps = false; bool tinydiarize = false; + bool save_audio = false; // save audio to wav file + bool use_gpu = true; std::string language = "en"; std::string model = "models/ggml-base.en.bin"; - bool use_gpu = true; std::string fname_out; - bool save_audio = false; // save audio to wav file }; void whisper_print_usage(int argc, char ** argv, const whisper_params & params); @@ -66,26 +66,26 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { whisper_print_usage(argc, argv, params); exit(0); } - else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); } - else if ( arg == "--step") { params.step_ms = std::stoi(argv[++i]); } - else if ( arg == "--length") { params.length_ms = std::stoi(argv[++i]); } - else if ( arg == "--keep") { params.keep_ms = std::stoi(argv[++i]); } - else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); } - else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); } - else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); } - else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); } - else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); } - else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; } - else if (arg == "-tr" || arg == "--translate") { params.translate = true; } - else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; } - else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; } - else if (arg == "-kc" || arg == "--keep-context") { params.no_context = 
false; } - else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; } - else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; } - else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; } - else if (arg == "-tdrz" || arg == "--tinydiarize") { params.tinydiarize = true; } - else if (arg == "-sa" || arg == "--save-audio") { params.save_audio = true; } - else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } + else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); } + else if ( arg == "--step") { params.step_ms = std::stoi(argv[++i]); } + else if ( arg == "--length") { params.length_ms = std::stoi(argv[++i]); } + else if ( arg == "--keep") { params.keep_ms = std::stoi(argv[++i]); } + else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); } + else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); } + else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); } + else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); } + else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); } + else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; } + else if (arg == "-tr" || arg == "--translate") { params.translate = true; } + else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; } + else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; } + else if (arg == "-kc" || arg == "--keep-context") { params.no_context = false; } + else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; } + else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; } + else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; } + else if (arg == "-tdrz" || arg == "--tinydiarize") { params.tinydiarize = true; } + else if (arg == "-sa" || arg == "--save-audio") { params.save_audio = true; } + else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); @@ -120,8 +120,9 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str()); fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str()); fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str()); - fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false"); + fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false"); fprintf(stderr, " -sa, --save-audio [%-7s] save the recorded audio to a file\n", params.save_audio ? "true" : "false"); + fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU inference\n", params.use_gpu ? 
"false" : "true"); fprintf(stderr, "\n"); } @@ -167,6 +168,7 @@ int main(int argc, char ** argv) { struct whisper_context_params cparams; cparams.use_gpu = params.use_gpu; + struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams); std::vector pcmf32 (n_samples_30s, 0.0f); @@ -428,4 +430,4 @@ int main(int argc, char ** argv) { whisper_free(ctx); return 0; -} \ No newline at end of file +} diff --git a/examples/talk-llama/talk-llama.cpp b/examples/talk-llama/talk-llama.cpp index be3c2a16dca..6cc30c1653e 100644 --- a/examples/talk-llama/talk-llama.cpp +++ b/examples/talk-llama/talk-llama.cpp @@ -63,12 +63,12 @@ struct whisper_params { bool print_energy = false; bool no_timestamps = true; bool verbose_prompt = false; + bool use_gpu = true; std::string person = "Georgi"; std::string language = "en"; std::string model_wsp = "models/ggml-base.en.bin"; std::string model_llama = "models/ggml-llama-7B.bin"; - bool use_gpu = true; std::string speak = "./examples/talk-llama/speak"; std::string prompt = ""; std::string fname_out; @@ -85,25 +85,26 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { whisper_print_usage(argc, argv, params); exit(0); } - else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); } - else if (arg == "-vms" || arg == "--voice-ms") { params.voice_ms = std::stoi(argv[++i]); } - else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); } - else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); } - else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); } - else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); } - else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); } - else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; } - else if (arg == "-tr" || arg == "--translate") { params.translate = true; } - else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; } - else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; } - else if (arg == "--verbose-prompt") { params.verbose_prompt = true; } - else if (arg == "-p" || arg == "--person") { params.person = argv[++i]; } - else if (arg == "--session") { params.path_session = argv[++i];} - else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; } - else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; } - else if (arg == "-ml" || arg == "--model-llama") { params.model_llama = argv[++i]; } - else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; } - else if (arg == "--prompt-file") { + else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); } + else if (arg == "-vms" || arg == "--voice-ms") { params.voice_ms = std::stoi(argv[++i]); } + else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); } + else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); } + else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); } + else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); } + else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); } + else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; } + else if (arg == "-tr" || arg == 
"--translate") { params.translate = true; } + else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; } + else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; } + else if (arg == "-vp" || arg == "--verbose-prompt") { params.verbose_prompt = true; } + else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } + else if (arg == "-p" || arg == "--person") { params.person = argv[++i]; } + else if (arg == "--session") { params.path_session = argv[++i];} + else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; } + else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; } + else if (arg == "-ml" || arg == "--model-llama") { params.model_llama = argv[++i]; } + else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; } + else if (arg == "--prompt-file") { std::ifstream file(argv[++i]); std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); if (params.prompt.back() == '\n') { @@ -127,27 +128,28 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, "usage: %s [options]\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help [default] show this help message and exit\n"); - fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads); - fprintf(stderr, " -vms N, --voice-ms N [%-7d] voice duration in milliseconds\n", params.voice_ms); - fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id); - fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens); - fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx); - fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold); - fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold); - fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false"); - fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false"); - fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false"); - fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false"); - fprintf(stderr, " -p NAME, --person NAME [%-7s] person name (for prompt selection)\n", params.person.c_str()); - fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str()); - fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str()); - fprintf(stderr, " -ml FILE, --model-llama [%-7s] llama model file\n", params.model_llama.c_str()); - fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str()); - fprintf(stderr, " --prompt-file FNAME [%-7s] file with custom prompt to start dialog\n", ""); - fprintf(stderr, " --session FNAME file to cache model state in (may be large!) (default: none)\n"); - fprintf(stderr, " --verbose-prompt [%-7s] print prompt at start\n", params.verbose_prompt ? 
"true" : "false"); - fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str()); + fprintf(stderr, " -h, --help [default] show this help message and exit\n"); + fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads); + fprintf(stderr, " -vms N, --voice-ms N [%-7d] voice duration in milliseconds\n", params.voice_ms); + fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id); + fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens); + fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx); + fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold); + fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold); + fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false"); + fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false"); + fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false"); + fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false"); + fprintf(stderr, " -vp, --verbose-prompt [%-7s] print prompt at start\n", params.verbose_prompt ? "true" : "false"); + fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true"); + fprintf(stderr, " -p NAME, --person NAME [%-7s] person name (for prompt selection)\n", params.person.c_str()); + fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str()); + fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str()); + fprintf(stderr, " -ml FILE, --model-llama [%-7s] llama model file\n", params.model_llama.c_str()); + fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str()); + fprintf(stderr, " --prompt-file FNAME [%-7s] file with custom prompt to start dialog\n", ""); + fprintf(stderr, " --session FNAME file to cache model state in (may be large!) 
(default: none)\n"); + fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str()); fprintf(stderr, "\n"); } @@ -256,6 +258,7 @@ int main(int argc, char ** argv) { struct whisper_context_params cparams; cparams.use_gpu = params.use_gpu; + struct whisper_context * ctx_wsp = whisper_init_from_file_with_params(params.model_wsp.c_str(), cparams); // llama init diff --git a/examples/talk/talk.cpp b/examples/talk/talk.cpp index 4e9fca9c625..cdb1a230b7d 100644 --- a/examples/talk/talk.cpp +++ b/examples/talk/talk.cpp @@ -31,12 +31,12 @@ struct whisper_params { bool print_special = false; bool print_energy = false; bool no_timestamps = true; + bool use_gpu = true; std::string person = "Santa"; std::string language = "en"; std::string model_wsp = "models/ggml-base.en.bin"; std::string model_gpt = "models/ggml-gpt-2-117M.bin"; - bool use_gpu = true; std::string speak = "./examples/talk/speak"; std::string fname_out; }; @@ -62,13 +62,13 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { else if (arg == "-tr" || arg == "--translate") { params.translate = true; } else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; } else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; } + else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } else if (arg == "-p" || arg == "--person") { params.person = argv[++i]; } else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; } else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; } else if (arg == "-mg" || arg == "--model-gpt") { params.model_gpt = argv[++i]; } else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; } else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; } - else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); whisper_print_usage(argc, argv, params); @@ -96,6 +96,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false"); fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false"); fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false"); + fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true"); fprintf(stderr, " -p NAME, --person NAME [%-7s] person name (for prompt selection)\n", params.person.c_str()); fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str()); fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str()); @@ -185,6 +186,7 @@ int main(int argc, char ** argv) { // whisper init struct whisper_context_params cparams; cparams.use_gpu = params.use_gpu; + struct whisper_context * ctx_wsp = whisper_init_from_file_with_params(params.model_wsp.c_str(), cparams); // gpt init diff --git a/whisper.h b/whisper.h index 73862b6f818..300fc4bac37 100644 --- a/whisper.h +++ b/whisper.h @@ -73,15 +73,16 @@ extern "C" { // understanding of how the model works. 
// - struct whisper_context_params { - bool use_gpu; - }; struct whisper_context; struct whisper_state; struct whisper_full_params; typedef int whisper_token; + struct whisper_context_params { + bool use_gpu; + }; + typedef struct whisper_token_data { whisper_token id; // token id whisper_token tid; // forced timestamp token id