@@ -510,22 +510,9 @@ struct llama_state {
 // global state
 static llama_state g_state;
 
-template <typename T>
-static T checked_mul(T a, T b) {
-    T ret = a * b;
-    if (a != 0 && ret / a != b) {
-        throw std::runtime_error(format("overflow multiplying %llu * %llu",
-                     (unsigned long long) a, (unsigned long long) b));
-    }
-    return ret;
-}
-
-static size_t checked_div(size_t a, size_t b) {
-    if (b == 0 || a % b != 0) {
-        throw std::runtime_error(format("error dividing %zu / %zu", a, b));
-    }
-    return a / b;
-}
+//
+// model loading and saving
+//
 
 static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
     char buf[256];
@@ -536,14 +523,6 @@ static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
     return buf;
 }
 
-static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
-    size_t size = ggml_type_size(type);
-    for (uint32_t dim : ne) {
-        size = checked_mul<size_t>(size, dim);
-    }
-    return size / ggml_blck_size(type);
-}
-
 struct gguf_load_tensor {
     std::string name;
     enum ggml_type type = GGML_TYPE_F32;
@@ -573,20 +552,19 @@ struct gguf_file_loader {
 
     struct ggml_context * ctx_data = NULL;
 
-    gguf_file_loader(const char * fname, gguf_load_tensors_map & tensors_map)
-        : file(fname, "rb") {
+    gguf_file_loader(const char * fname, gguf_load_tensors_map & tensors_map) : file(fname, "rb") {
         fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
 
-            struct gguf_init_params params = {
-                /*.no_alloc = */ true,
-                /*.ctx      = */ &ctx_data,
-            };
+        struct gguf_init_params params = {
+            /*.no_alloc = */ true,
+            /*.ctx      = */ &ctx_data,
+        };
 
-            gguf_ctx = gguf_init_from_file(fname, params);
-            file_version = (enum gguf_file_version) gguf_get_version(gguf_ctx);
+        gguf_ctx = gguf_init_from_file(fname, params);
+        file_version = (enum gguf_file_version) gguf_get_version(gguf_ctx);
 
-            read_hparams();
-            read_vocab();
+        read_hparams();
+        read_vocab();
         read_tensor_metadata(tensors_map);
     }
 
@@ -636,18 +614,18 @@ struct gguf_file_loader {
 
     void read_vocab() {
         vocab.id_to_token.resize(hparams.n_vocab);
-        int token_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.tokens");
+
+        const int token_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.tokens");
         if (token_idx == -1) {
             throw std::runtime_error("cannot find token list in GGUF file\n");
         }
 
-        int score_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.scores");
+        const int score_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.scores");
         if (score_idx == -1) {
             throw std::runtime_error("cannot find token scores list in GGUF file\n");
         }
 
         for (uint32_t i = 0; i < hparams.n_vocab; i++) {
-
             std::string word = gguf_get_arr_str(gguf_ctx, token_idx, i);
 
             vocab.token_to_id[word] = i;
@@ -701,7 +679,7 @@ struct gguf_file_loader {
             tensor.file_off = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, i);
 
             tensor.name = name;
-            tensor.size = llama_calc_tensor_size(tensor.ne, tensor.type);
+            tensor.size = ggml_nbytes(cur);
 
             tensors_map.tensors.push_back(tensor);
             tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
@@ -786,7 +764,7 @@ struct gguf_file_saver {
             gguf_type arr_type;
             int n_arr;
 
-        switch (vtype) {
+            switch (vtype) {
                 case GGUF_TYPE_BOOL:
                     bool_val = gguf_get_val_bool(fl->gguf_ctx, i);
                     file.write_val<bool>(key, GGUF_TYPE_BOOL, bool_val);
@@ -809,7 +787,7 @@ struct gguf_file_saver {
                     break;
                 case GGUF_TYPE_STRING:
                     str_val = gguf_get_val_str(fl->gguf_ctx, i);
-                    file.write_val<std::string>(key, GGUF_TYPE_STRING, str_val);
+                    file.write_str(key, GGUF_TYPE_STRING, str_val);
                     break;
                 case GGUF_TYPE_UINT16:
                     u16_val = gguf_get_val_u16(fl->gguf_ctx, i);
@@ -825,7 +803,7 @@ struct gguf_file_saver {
                     break;
                 case GGUF_TYPE_ARRAY:
                     arr_type = gguf_get_arr_type(fl->gguf_ctx, i);
-                    n_arr = gguf_get_arr_n(fl->gguf_ctx, i);
+                    n_arr    = gguf_get_arr_n(fl->gguf_ctx, i);
                     if (arr_type == GGUF_TYPE_FLOAT32) {
                         write_hparam_arr_f32(key, arr_type, i, n_arr);
                     } else if (arr_type == GGUF_TYPE_STRING) {
@@ -922,20 +900,6 @@ struct llama_model_loader {
         }
     }
 
-    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
-        auto it = tensors_map.name_to_idx.find(name);
-        if (it == tensors_map.name_to_idx.end()) {
-            throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
-        }
-        gguf_load_tensor & lt = tensors_map.tensors.at(it->second);
-        if (lt.ne != ne) {
-            throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
-                     name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
-        }
-
-        return get_tensor_for(lt, backend);
-    }
-
     struct ggml_tensor * get_tensor_for(gguf_load_tensor & lt, ggml_backend backend) {
         struct ggml_tensor * tensor;
         if (backend != GGML_BACKEND_CPU) {
@@ -959,16 +923,41 @@ struct llama_model_loader {
         return tensor;
     }
 
+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
+        auto it = tensors_map.name_to_idx.find(name);
+        if (it == tensors_map.name_to_idx.end()) {
+            throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
+        }
+        gguf_load_tensor & lt = tensors_map.tensors.at(it->second);
+        if (lt.ne != ne) {
+            throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
+                     name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
+        }
+
+        return get_tensor_for(lt, backend);
+    }
+
     void done_getting_tensors() const {
         if (num_ggml_tensors_created != tensors_map.tensors.size()) {
             throw std::runtime_error(std::string("llama.cpp: file contained more tensors than expected"));
         }
     }
 
-    void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, gguf_mlock * lmlock) {
-        size_t data_size = 0;
+    void load_data_for(gguf_load_tensor & lt) const {
+        if (use_mmap) {
+            lt.data = (uint8_t *) mapping->addr + lt.file_off;
+        } else {
+            gguf_file & file = file_loader->file;
+            file.seek(lt.file_off, SEEK_SET);
+            file.read_raw(lt.data, lt.size);
+        }
+    }
+
+    void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, gguf_mlock * lmlock) {
+        size_t data_size     = 0;
         size_t prefetch_size = 0;
-        size_t lock_size = 0;
+        size_t lock_size     = 0;
+
         for (const gguf_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
             if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
@@ -1030,31 +1019,6 @@ struct llama_model_loader {
             done_size += lt.size;
         }
     }
-
-    void load_data_for(gguf_load_tensor & lt) {
-        if (use_mmap) {
-            lt.data = (uint8_t *) mapping->addr + lt.file_off;
-        } else {
-            gguf_file & file = file_loader->file;
-            file.seek(lt.file_off, SEEK_SET);
-            file.read_raw(lt.data, lt.size);
-        }
-
-        if (0) {
-            print_checksum(lt);
-        }
-    }
-
-    static void print_checksum(gguf_load_tensor & lt) {
-        uint32_t sum = 0;
-        for (size_t i = 0; i < lt.size; i++) {
-            uint8_t byte = lt.data[i];
-            sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash
-        }
-        fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum,
-                llama_format_tensor_shape(lt.ne).c_str(), lt.size);
-    }
-
 };
 
 //
@@ -1184,18 +1148,18 @@ int64_t llama_time_us() {
 }
 
 //
-// model loading
+// load LLaMA models
 //
 
-static const char *gguf_file_version_name(gguf_file_version version) {
+static const char * gguf_file_version_name(gguf_file_version version) {
     switch (version) {
         case GGUF_FILE_VERSION_V1: return "GGUF V1 (latest)";
-        }
+    }
 
     return "unknown";
 }
 
-static const char *llama_ftype_name(enum llama_ftype ftype) {
+static const char * llama_ftype_name(enum llama_ftype ftype) {
     switch (ftype) {
         case LLAMA_FTYPE_ALL_F32:     return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16:  return "mostly F16";
@@ -1206,24 +1170,26 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
         case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
+
         // K-quants
-        case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K";
+        case LLAMA_FTYPE_MOSTLY_Q2_K:   return "mostly Q2_K";
         case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large";
         case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K";
-        default: return "unknown, may not work";
+        case LLAMA_FTYPE_MOSTLY_Q6_K:   return "mostly Q6_K";
+
+        default: return "unknown, may not work";
     }
 }
 
-static const char *llama_model_type_name(e_model type) {
+static const char * llama_model_type_name(e_model type) {
     switch (type) {
-        case MODEL_3B: return "3B";
-        case MODEL_7B: return "7B";
+        case MODEL_3B:  return "3B";
+        case MODEL_7B:  return "7B";
         case MODEL_13B: return "13B";
         case MODEL_30B: return "30B";
         case MODEL_65B: return "65B";
@@ -1604,7 +1570,6 @@ static struct ggml_cgraph * llama_build_graph(
     const int64_t n_embd_head = hparams.n_embd_head();
     const int64_t n_embd_gqa  = hparams.n_embd_gqa();
 
-
     GGML_ASSERT(n_embd_head == hparams.n_rot);
 
     const float freq_base = hparams.rope_freq_base;
@@ -1713,7 +1678,7 @@ static struct ggml_cgraph * llama_build_graph(
 
         struct ggml_tensor * inpSA = inpL;
 
-        lctx.use_buf(ctx0, 0);
+        llama_context::use_buf(ctx0, 0);
 
         // norm
         {
@@ -1852,7 +1817,7 @@ static struct ggml_cgraph * llama_build_graph(
             ggml_set_name(cur, "result_wo");
         }
 
-        lctx.use_buf(ctx0, 1);
+        llama_context::use_buf(ctx0, 1);
 
         struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
         offload_func(inpFF);
@@ -1908,7 +1873,7 @@ static struct ggml_cgraph * llama_build_graph(
         inpL = cur;
     }
 
-    lctx.use_buf(ctx0, 0);
+    llama_context::use_buf(ctx0, 0);
 
     // norm
     {
@@ -1926,7 +1891,7 @@ static struct ggml_cgraph * llama_build_graph(
     cur = ggml_mul_mat(ctx0, model.output, cur);
     ggml_set_name(cur, "result_output");
 
-    lctx.use_buf(ctx0, -1);
+    llama_context::use_buf(ctx0, -1);
 
     // logits -> probs
     //cur = ggml_soft_max_inplace(ctx0, cur);
@@ -2996,9 +2961,8 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         }
     }
 
-    const auto rejects =
-        llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
-    for (auto & reject : rejects) {
+    const auto rejects = llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
+    for (const auto & reject : rejects) {
         candidates->data[reject.index].logit = -INFINITY;
     }
 
@@ -3725,7 +3689,7 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-        const llama_model_quantize_params *params) {
+        const llama_model_quantize_params * params) {
     try {
         llama_model_quantize_internal(fname_inp, fname_out, params);
         return 0;
@@ -4343,8 +4307,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
     GGML_UNUSED(n_token_capacity);
     GGML_UNUSED(n_token_count_out);
 
-
-    // TODO: implement with GGUF format
+    // TODO: implement with GGUF format
     return true;
 }
 
@@ -4389,7 +4352,6 @@ int llama_eval(
     return 0;
 }
 
-
 int llama_eval_embd(
         struct llama_context * ctx,
         const float * embd,