@@ -573,20 +573,19 @@ struct gguf_file_loader {

     struct ggml_context * ctx_data = NULL;

-    gguf_file_loader(const char * fname, gguf_load_tensors_map & tensors_map)
-        : file(fname, "rb") {
+    gguf_file_loader(const char * fname, gguf_load_tensors_map & tensors_map) : file(fname, "rb") {
         fprintf(stderr, "llama.cpp: loading model from %s\n", fname);

-        struct gguf_init_params params = {
-            /* .no_alloc = */ true,
-            /* .ctx      = */ &ctx_data,
-        };
+        struct gguf_init_params params = {
+            /* .no_alloc = */ true,
+            /* .ctx      = */ &ctx_data,
+        };

-        gguf_ctx = gguf_init_from_file(fname, params);
-        file_version = (enum gguf_file_version) gguf_get_version(gguf_ctx);
+        gguf_ctx = gguf_init_from_file(fname, params);
+        file_version = (enum gguf_file_version) gguf_get_version(gguf_ctx);

-        read_hparams();
-        read_vocab();
+        read_hparams();
+        read_vocab();
         read_tensor_metadata(tensors_map);
     }

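For orientation, a minimal, self-contained sketch of the gguf_init_from_file pattern used by the constructor above. The includes, file name, error handling, and cleanup are illustrative additions, not part of this commit; only the no_alloc/params usage mirrors the code in the diff.

// Sketch only: open a GGUF file and read metadata without allocating tensor data.
#include <cstdio>

#include "ggml.h" // assumed header exposing the gguf_* API used by this file

int main() {
    struct ggml_context * ctx_data = NULL;

    struct gguf_init_params params = {
        /* .no_alloc = */ true,      // metadata only; tensor data is not allocated
        /* .ctx      = */ &ctx_data,
    };

    // hypothetical model path, for illustration
    struct gguf_context * gguf_ctx = gguf_init_from_file("model.gguf", params);
    if (gguf_ctx == NULL) {
        fprintf(stderr, "failed to open GGUF file\n");
        return 1;
    }

    fprintf(stderr, "GGUF version: %d\n", (int) gguf_get_version(gguf_ctx));

    gguf_free(gguf_ctx);
    return 0;
}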
@@ -636,18 +635,18 @@ struct gguf_file_loader {

     void read_vocab() {
         vocab.id_to_token.resize(hparams.n_vocab);
-        int token_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.tokens");
+
+        const int token_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.tokens");
         if (token_idx == -1) {
             throw std::runtime_error("cannot find token list in GGUF file\n");
         }

-        int score_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.scores");
+        const int score_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.scores");
         if (score_idx == -1) {
             throw std::runtime_error("cannot find token scores list in GGUF file\n");
         }

         for (uint32_t i = 0; i < hparams.n_vocab; i++) {
-
             std::string word = gguf_get_arr_str(gguf_ctx, token_idx, i);

             vocab.token_to_id[word] = i;
@@ -786,7 +785,7 @@ struct gguf_file_saver {
         gguf_type arr_type;
         int n_arr;

-        switch (vtype) {
+        switch (vtype) {
             case GGUF_TYPE_BOOL:
                 bool_val = gguf_get_val_bool(fl->gguf_ctx, i);
                 file.write_val<bool>(key, GGUF_TYPE_BOOL, bool_val);
@@ -809,7 +808,7 @@ struct gguf_file_saver {
                 break;
             case GGUF_TYPE_STRING:
                 str_val = gguf_get_val_str(fl->gguf_ctx, i);
-                file.write_val<std::string>(key, GGUF_TYPE_STRING, str_val);
+                file.write_str(key, GGUF_TYPE_STRING, str_val);
                 break;
             case GGUF_TYPE_UINT16:
                 u16_val = gguf_get_val_u16(fl->gguf_ctx, i);
@@ -825,7 +824,7 @@ struct gguf_file_saver {
                 break;
             case GGUF_TYPE_ARRAY:
                 arr_type = gguf_get_arr_type(fl->gguf_ctx, i);
-                n_arr = gguf_get_arr_n(fl->gguf_ctx, i);
+                n_arr    = gguf_get_arr_n(fl->gguf_ctx, i);
                 if (arr_type == GGUF_TYPE_FLOAT32) {
                     write_hparam_arr_f32(key, arr_type, i, n_arr);
                 } else if (arr_type == GGUF_TYPE_STRING) {
@@ -922,20 +921,6 @@ struct llama_model_loader {
         }
     }

-    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
-        auto it = tensors_map.name_to_idx.find(name);
-        if (it == tensors_map.name_to_idx.end()) {
-            throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
-        }
-        gguf_load_tensor & lt = tensors_map.tensors.at(it->second);
-        if (lt.ne != ne) {
-            throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
-                name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
-        }
-
-        return get_tensor_for(lt, backend);
-    }
-
     struct ggml_tensor * get_tensor_for(gguf_load_tensor & lt, ggml_backend backend) {
         struct ggml_tensor * tensor;
         if (backend != GGML_BACKEND_CPU) {
@@ -959,16 +944,41 @@ struct llama_model_loader {
         return tensor;
     }

+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
+        auto it = tensors_map.name_to_idx.find(name);
+        if (it == tensors_map.name_to_idx.end()) {
+            throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
+        }
+        gguf_load_tensor & lt = tensors_map.tensors.at(it->second);
+        if (lt.ne != ne) {
+            throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
+                name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
+        }
+
+        return get_tensor_for(lt, backend);
+    }
+
     void done_getting_tensors() const {
         if (num_ggml_tensors_created != tensors_map.tensors.size()) {
             throw std::runtime_error(std::string("llama.cpp: file contained more tensors than expected"));
         }
     }

-    void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, gguf_mlock * lmlock) {
-        size_t data_size = 0;
+    void load_data_for(gguf_load_tensor & lt) const {
+        if (use_mmap) {
+            lt.data = (uint8_t *) mapping->addr + lt.file_off;
+        } else {
+            gguf_file & file = file_loader->file;
+            file.seek(lt.file_off, SEEK_SET);
+            file.read_raw(lt.data, lt.size);
+        }
+    }
+
+    void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, gguf_mlock * lmlock) {
+        size_t data_size     = 0;
         size_t prefetch_size = 0;
-        size_t lock_size = 0;
+        size_t lock_size     = 0;
+
         for (const gguf_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
             if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
@@ -1030,31 +1040,6 @@ struct llama_model_loader {
             done_size += lt.size;
         }
     }
-
-    void load_data_for(gguf_load_tensor & lt) {
-        if (use_mmap) {
-            lt.data = (uint8_t *) mapping->addr + lt.file_off;
-        } else {
-            gguf_file & file = file_loader->file;
-            file.seek(lt.file_off, SEEK_SET);
-            file.read_raw(lt.data, lt.size);
-        }
-
-        if (0) {
-            print_checksum(lt);
-        }
-    }
-
-    static void print_checksum(gguf_load_tensor & lt) {
-        uint32_t sum = 0;
-        for (size_t i = 0; i < lt.size; i++) {
-            uint8_t byte = lt.data[i];
-            sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash
-        }
-        fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum,
-                llama_format_tensor_shape(lt.ne).c_str(), lt.size);
-    }
-
 };

 //
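The llama_model_loader hunks above mostly move code: get_tensor now follows get_tensor_for, load_data_for moves ahead of load_all_data (and becomes const), and the unused print_checksum debugging helper is dropped. As rough orientation only, the loader's expected call order looks like the sketch below; the constructor arguments, tensor name, and variable names are placeholders, not taken from this commit.

// Illustrative only: how a caller is expected to drive llama_model_loader.
llama_model_loader ml(/* model path, use_mmap, ... (placeholders) */);

// 1) resolve each tensor by name and expected shape while building the model
struct ggml_tensor * tok_embd = ml.get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);

// 2) check that the number of tensors created matches the number found in the file
ml.done_getting_tensors();

// 3) mmap or read the raw tensor data, reporting progress through the callback
ml.load_all_data(progress_callback, progress_callback_user_data, &lmlock);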
@@ -1187,15 +1172,15 @@ int64_t llama_time_us() {
 // model loading
 //

-static const char *gguf_file_version_name(gguf_file_version version) {
+static const char * gguf_file_version_name(gguf_file_version version) {
     switch (version) {
         case GGUF_FILE_VERSION_V1: return "GGUF V1 (latest)";
-    }
+    }

     return "unknown";
 }

-static const char *llama_ftype_name(enum llama_ftype ftype) {
+static const char * llama_ftype_name(enum llama_ftype ftype) {
     switch (ftype) {
         case LLAMA_FTYPE_ALL_F32: return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
@@ -1220,10 +1205,10 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
     }
 }

-static const char *llama_model_type_name(e_model type) {
+static const char * llama_model_type_name(e_model type) {
     switch (type) {
-        case MODEL_3B: return "3B";
-        case MODEL_7B: return "7B";
+        case MODEL_3B:  return "3B";
+        case MODEL_7B:  return "7B";
         case MODEL_13B: return "13B";
         case MODEL_30B: return "30B";
         case MODEL_65B: return "65B";
@@ -2996,9 +2981,8 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         }
     }

-    const auto rejects =
-        llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
-    for (auto & reject : rejects) {
+    const auto rejects = llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
+    for (const auto & reject : rejects) {
         candidates->data[reject.index].logit = -INFINITY;
     }

@@ -3725,7 +3709,7 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-        const llama_model_quantize_params *params) {
+        const llama_model_quantize_params * params) {
     try {
         llama_model_quantize_internal(fname_inp, fname_out, params);
         return 0;
@@ -4343,8 +4327,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
     GGML_UNUSED(n_token_capacity);
     GGML_UNUSED(n_token_count_out);

-
-    // TODO: implement with GGUF format
+    // TODO: implement with GGUF format
     return true;
 }

@@ -4389,7 +4372,6 @@ int llama_eval(
     return 0;
 }

-
 int llama_eval_embd(
         struct llama_context * ctx,
         const float * embd,