@@ -1,6 +1,8 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
+#include <cstdint>
+#include <cstdio>
 #endif
 
 #include "llama_util.h"
@@ -1759,8 +1761,7 @@ int llama_model_quantize(
     }
 }
 
-int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, int n_threads) {
-    // TODO: refactor all of this after PR #801
+int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
     fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
 
     auto & model = ctx->model;
@@ -1801,13 +1802,13 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
 
     // create a temporary ggml context to store the lora tensors
     // todo: calculate size from biggest possible tensor
-    std::vector<uint8_t> buf(1024ull * 1024ull * 1024ull);
+    std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
     struct ggml_init_params params;
-    params.mem_size   = buf.size();
-    params.mem_buffer = buf.data();
+    params.mem_size   = lora_buf.size();
+    params.mem_buffer = lora_buf.data();
     params.no_alloc   = false;
 
-    ggml_context* lora_ctx = ggml_init(params);
+    ggml_context * lora_ctx = ggml_init(params);
     std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
 
     // create a name -> tensor map of the model to accelerate lookups
@@ -1816,6 +1817,32 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
         model_tensors.insert(kv);
     }
 
+
+    // load base model
+    std::unique_ptr<llama_model_loader> model_loader;
+    ggml_context * base_ctx = NULL;
+    llama_buffer base_buf;
+    if (path_base_model) {
+        fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
+        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
+
+        size_t ctx_size, mmapped_size;
+        model_loader->calc_sizes(&ctx_size, &mmapped_size);
+        base_buf.resize(ctx_size);
+
+        ggml_init_params base_params;
+        base_params.mem_size   = base_buf.size;
+        base_params.mem_buffer = base_buf.addr;
+        base_params.no_alloc   = model_loader->use_mmap;
+
+        base_ctx = ggml_init(base_params);
+
+        model_loader->ggml_ctx = base_ctx;
+
+        // maybe this should be in llama_model_loader
+        model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, false));
+    }
+
     fprintf(stderr, "%s: ", __func__);
 
     // read tensors and apply
@@ -1892,13 +1919,31 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
         if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() &&
             lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
 
-            ggml_tensor * tensor = model_tensors[base_name];
+            ggml_tensor * dest_t = model_tensors[base_name];
+            ggml_tensor * base_t;
+            if (model_loader) {
+                // load from base model
+                if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
+                    fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
+                    return 1;
+                }
+                size_t idx = model_loader->tensors_map.name_to_idx[base_name];
+                llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
+                base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+                lt.data = (uint8_t *) lt.ggml_tensor->data;
+                model_loader->load_data_for(lt);
+                lt.ggml_tensor->data = lt.data;
+            }
+            else {
+                base_t = dest_t;
+            }
+
             ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
             ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
 
-            if (tensor->ne[0] != loraA->ne[1] || tensor->ne[1] != loraB->ne[1]) {
+            if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
                 fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
-                                " are you sure that this adapter is for this model?\n", __func__, tensor->ne[0], loraA->ne[1]);
+                                " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
                 return 1;
             }
 
@@ -1910,14 +1955,14 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
                 BA = ggml_scale(lora_ctx, BA, scale_tensor);
             }
 
-            // printf("%s: (B)(%d %d %d %d) x (A)(%d %d %d %d) => (BA)(%d %d %d %d) + (T)(%d %d %d %d)\n",
-            //     base_name.c_str(),
-            //     (int)loraB->ne[0], (int)loraB->ne[1], (int)loraB->ne[2], (int)loraB->ne[3],
-            //     (int)loraA->ne[0], (int)loraA->ne[1], (int)loraA->ne[2], (int)loraA->ne[3],
-            //     (int)BA->ne[0], (int)BA->ne[1], (int)BA->ne[2], (int)BA->ne[3],
-            //     (int)tensor->ne[0], (int)tensor->ne[1], (int)tensor->ne[2], (int)tensor->ne[3]
-            // );
-            ggml_tensor * r = ggml_add_inplace(lora_ctx, tensor, BA);
+            ggml_tensor * r;
+            if (base_t == dest_t) {
+                r = ggml_add_inplace(lora_ctx, dest_t, BA);
+            }
+            else {
+                r = ggml_add(lora_ctx, base_t, BA);
+                r = ggml_cpy(lora_ctx, r, dest_t);
+            }
 
             struct ggml_cgraph gf = ggml_build_forward(r);
             gf.n_threads = n_threads;
@@ -1934,14 +1979,27 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
         }
     }
 
+    // TODO: this should be in a destructor, it will leak on failure
     ggml_free(lora_ctx);
+    if (base_ctx) {
+        ggml_free(base_ctx);
+    }
 
     const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
     fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
 
     return 0;
 }
 
+int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+    try {
+        return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
+    } catch (const std::string & err) {
+        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
+        return 1;
+    }
+}
+
 // Returns the KV cache that will contain the context for the
 // ongoing prediction with the model.
 const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
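
For context, a minimal usage sketch of the public entry point after this change. The model path, adapter filename, and thread count are placeholders, and llama_init_from_file, llama_context_default_params, and llama_free are assumed to be available from llama.h; this is an illustration, not part of the diff.

#include <cstdio>

#include "llama.h"

int main() {
    // Load a (possibly quantized) model as usual.
    struct llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", llama_context_default_params());
    if (ctx == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // Apply the LoRA adapter. path_base_model may be NULL, in which case the
    // adapter is added to the loaded tensors in place (the base_t == dest_t
    // branch above); otherwise the base model's tensors are used as the source
    // and the result is copied into the loaded model. Paths and n_threads are
    // placeholders.
    if (llama_apply_lora_from_file(ctx, "lora-adapter.bin", "models/7B/ggml-model-f16.bin", /*n_threads*/ 4) != 0) {
        fprintf(stderr, "failed to apply lora adapter\n");
        llama_free(ctx);
        return 1;
    }

    // ... run inference ...

    llama_free(ctx);
    return 0;
}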