@@ -695,172 +695,14 @@ struct gguf_file_loader {
 
             tensor.name = name;
             tensor.size = ggml_nbytes(cur);
+            tensor.ggml_tensor = cur;
 
             tensors_map.tensors.push_back(tensor);
             tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
         }
     }
 };
 
-struct gguf_file_saver {
-    // TODO
-    // this implementation now assumes that the data section is of the same length as the unquantized model.
-    // this is needed to write tensor metadata and weights in a single pass by seeking to appropriate positions in the file.
-    // this may not be true when we add quantization version and change ftype description (currently it's string according to the specs,
-    // but better to have it as uint32).
-    // we need to calculate the delta in number of bytes written with a counter as a struct member.
-
-    gguf_context * ctx; // loaded gguf context (used to re-write the KV section (good enough for now))
-
-    gguf_file file;
-    size_t info_offset;
-    size_t tensor_offset;
-
-    gguf_file_saver(const char * fname, gguf_context * ctx) : ctx(ctx), file(fname, "wb") {
-        LLAMA_LOG_INFO("%s: saving model to %s\n", __func__, fname);
-
-        write_header();
-        write_kv();
-    }
-
-    void write_header() {
-        file.write_i32(GGUF_MAGIC);
-        file.write_i32(GGUF_VERSION);
-        file.write_i32(gguf_get_n_tensors(ctx));
-        file.write_i32(gguf_get_n_kv(ctx));
-    }
-
-    void write_kv_arr_i32(const std::string & key, enum gguf_type type, int i, int n_arr) {
-        std::vector<int32_t> data(n_arr);
-
-        for (int j = 0; j < n_arr; ++j) {
-            int32_t val = gguf_get_arr_i32(ctx, i, j);
-            data[j] = val;
-        }
-
-        file.write_arr<int32_t>(key, type, data);
-    }
-
-    void write_kv_arr_f32(const std::string & key, enum gguf_type type, int i, int n_arr) {
-        std::vector<float> data(n_arr);
-
-        for (int j = 0; j < n_arr; ++j) {
-            float val = gguf_get_arr_f32(ctx, i, j);
-            data[j] = val;
-        }
-
-        file.write_arr<float>(key, type, data);
-    }
-
-    void write_kv_arr_str(const std::string & key, enum gguf_type type, int i, int n_arr) {
-        std::vector<std::string> data(n_arr);
-
-        for (int j = 0; j < n_arr; ++j) {
-            std::string val = gguf_get_arr_str(ctx, i, j);
-            data[j] = val;
-        }
-
-        file.write_arr(key, type, data);
-    }
-
-    // re-write the key-value section from the loaded file
-    void write_kv() {
-        const int32_t n_kv = gguf_get_n_kv(ctx);
-        for (int i = 0; i < n_kv; ++i) {
-            const char * key = gguf_get_key(ctx, i);
-            LLAMA_LOG_INFO("%s: writing key '%s'\n", __func__, key);
-
-            if (strcmp(key, "general.quantization_version") == 0) {
-                file.write_val<uint32_t>("general.quantization_version", GGUF_TYPE_UINT32, GGML_QNT_VERSION);
-            } else {
-                const gguf_type vtype = gguf_get_kv_type(ctx, i);
-
-                switch (vtype) {
-                    case GGUF_TYPE_BOOL:    file.write_val<bool>    (key, GGUF_TYPE_BOOL,    gguf_get_val_bool(ctx, i)); break;
-                    case GGUF_TYPE_FLOAT32: file.write_val<float>   (key, GGUF_TYPE_FLOAT32, gguf_get_val_f32(ctx, i));  break;
-                    case GGUF_TYPE_INT16:   file.write_val<int16_t> (key, GGUF_TYPE_INT16,   gguf_get_val_i16(ctx, i));  break;
-                    case GGUF_TYPE_INT32:   file.write_val<int32_t> (key, GGUF_TYPE_INT32,   gguf_get_val_i32(ctx, i));  break;
-                    case GGUF_TYPE_INT8:    file.write_val<int8_t>  (key, GGUF_TYPE_INT8,    gguf_get_val_i8(ctx, i));   break;
-                    case GGUF_TYPE_STRING:  file.write_str          (key, GGUF_TYPE_STRING,  gguf_get_val_str(ctx, i));  break;
-                    case GGUF_TYPE_UINT16:  file.write_val<uint16_t>(key, GGUF_TYPE_UINT16,  gguf_get_val_u16(ctx, i));  break;
-                    case GGUF_TYPE_UINT32:  file.write_val<uint32_t>(key, GGUF_TYPE_UINT32,  gguf_get_val_u32(ctx, i));  break;
-                    case GGUF_TYPE_UINT8:   file.write_val<uint8_t> (key, GGUF_TYPE_UINT8,   gguf_get_val_u8(ctx, i));   break;
-                    case GGUF_TYPE_ARRAY:
-                        {
-                            const gguf_type arr_type = gguf_get_arr_type(ctx, i);
-                            const int n_arr = gguf_get_arr_n(ctx, i);
-
-                            switch (arr_type) {
-                                case GGUF_TYPE_FLOAT32: write_kv_arr_f32(key, arr_type, i, n_arr); break;
-                                case GGUF_TYPE_INT32:   write_kv_arr_i32(key, arr_type, i, n_arr); break;
-                                case GGUF_TYPE_STRING:  write_kv_arr_str(key, arr_type, i, n_arr); break;
-                                default:
-                                    throw std::runtime_error(format("cannot recognize array type for key %s\n", key));
-                            }
-                        } break;
-                    default:
-                        throw std::runtime_error(format("cannot recognize value type for key %s\n", key));
-                }
-            }
-        }
-
-        info_offset = file.tell();
-
-        GGML_ASSERT(gguf_get_data_offset(ctx) >= info_offset);
-
-        const size_t count = gguf_get_data_offset(ctx) - info_offset;
-
-        file.write_zeros(count);
-        file.seek(info_offset, SEEK_SET);
-    }
-
-    size_t write_tensor_info(gguf_load_tensor & tensor, enum ggml_type type) {
-        size_t total_written = 0;
-        file.seek(info_offset, SEEK_SET);
-        total_written += file.write_str(tensor.name);
-
-        int32_t n_dims = tensor.ne.size();
-        total_written += file.write_i32(n_dims);
-        for (int32_t i = 0; i < n_dims; ++i) {
-            total_written += file.write_i32(tensor.ne[i]);
-        }
-
-        total_written += file.write_i32(type);
-        total_written += file.write_u64(tensor_offset);
-        info_offset += total_written; // position to write info of the next tensor
-
-        file.seek(0, SEEK_END);
-
-        return total_written;
-    }
-
-    void write_tensor(gguf_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) {
-        switch (new_type) {
-            case GGML_TYPE_F32:
-            case GGML_TYPE_F16:
-            case GGML_TYPE_Q4_0:
-            case GGML_TYPE_Q4_1:
-            case GGML_TYPE_Q5_0:
-            case GGML_TYPE_Q5_1:
-            case GGML_TYPE_Q8_0:
-            case GGML_TYPE_Q2_K:
-            case GGML_TYPE_Q3_K:
-            case GGML_TYPE_Q4_K:
-            case GGML_TYPE_Q5_K:
-            case GGML_TYPE_Q6_K:
-                break;
-            default: GGML_ASSERT(false);
-        }
-
-        write_tensor_info(tensor, new_type);
-        file.write_raw(new_data, new_size);
-        size_t padded_size = GGML_PAD(new_size, GGUF_DEFAULT_ALIGNMENT); // TODO: handle custom alignment
-        size_t pad = padded_size - new_size;
-        file.write_zeros(pad);
-        tensor_offset += padded_size; // offset of the next tensor
-    }
-};
-
 struct llama_model_loader {
     std::unique_ptr<gguf_file_loader> file_loader;
     gguf_load_tensors_map tensors_map;
@@ -897,7 +739,6 @@ struct llama_model_loader {
             tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
         }
         ggml_set_name(tensor, lt.name.c_str());
-        GGML_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
 
         if (backend != GGML_BACKEND_CPU) {
             ggml_set_no_alloc(ggml_ctx, use_mmap);
@@ -3245,7 +3086,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false));
-    gguf_file_saver file_saver(fname_out.c_str(), model_loader->file_loader->gguf_ctx);
+
+    struct gguf_context * ctx_out = gguf_init_empty();
+
+    // copy the KV pairs from the input file
+    gguf_set_kv     (ctx_out, model_loader->file_loader->gguf_ctx);
+    gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
 
 #ifdef GGML_USE_K_QUANTS
     int n_attention_wv = 0;
@@ -3279,6 +3125,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<uint8_t> read_data;
     std::vector<uint8_t> work;
 
+    std::vector<std::vector<uint8_t>> work_map(model_loader->tensors_map.tensors.size());
+
     for (gguf_load_tensor & tensor : model_loader->tensors_map.tensors) {
         read_data.resize(tensor.size);
         tensor.data = read_data.data();
@@ -3437,12 +3285,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
         total_size_org += tensor.size;
         total_size_new += new_size;
-        file_saver.write_tensor(tensor, new_type, new_data, new_size);
+
+        // TODO: temp fix until we have stream support in gguf
+        work_map[idx - 1] = std::vector<uint8_t>((char *) new_data, (char *) new_data + new_size);
+
+        gguf_add_tensor_ex(ctx_out, tensor.ggml_tensor, new_type, work_map[idx - 1].data(), new_size);
     }
 
+    gguf_write_to_file(ctx_out, fname_out.c_str());
+    gguf_free(ctx_out);
+
     LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
     LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
 
+    // print histogram for all tensors
     {
         int64_t sum_all = 0;
         for (size_t i = 0; i < hist_all.size(); i++) {
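Taken as a whole, the commit swaps the seek-based gguf_file_saver for an in-memory gguf_context that is serialized once at the end of quantization. Below is a minimal sketch of that new write path, not the literal code from the commit: it assumes the gguf writer API used in the diff (gguf_init_empty, gguf_set_kv, gguf_set_val_u32, gguf_add_tensor_ex, gguf_write_to_file, gguf_free), and quantize_one() is a hypothetical stand-in for the per-tensor quantization logic of llama_model_quantize_internal.

// Sketch only; quantize_one() is hypothetical, the gguf_* calls mirror the diff above,
// and gguf_load_tensor is llama.cpp's internal tensor-metadata struct (see the loader above).
#include <cstdint>
#include <vector>

#include "ggml.h" // assumed to declare the gguf_* writer API at this point in the tree

static void write_quantized_model(const char * fname_out,
                                  struct gguf_context * ctx_in,             // context of the loaded input model
                                  std::vector<gguf_load_tensor> & tensors) {
    struct gguf_context * ctx_out = gguf_init_empty();

    // carry over every KV pair from the source model, then record the quantization version
    gguf_set_kv     (ctx_out, ctx_in);
    gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);

    // the diff keeps one output buffer per tensor alive until the final write,
    // so the sketch does the same instead of reusing a single scratch buffer
    std::vector<std::vector<uint8_t>> bufs(tensors.size());

    for (size_t i = 0; i < tensors.size(); ++i) {
        ggml_type new_type = GGML_TYPE_F16;
        quantize_one(tensors[i], bufs[i], new_type); // hypothetical: fills bufs[i] and picks new_type

        gguf_add_tensor_ex(ctx_out, tensors[i].ggml_tensor, new_type, bufs[i].data(), bufs[i].size());
    }

    // single pass at the end: header, KV section, tensor infos and padded tensor data
    gguf_write_to_file(ctx_out, fname_out);
    gguf_free(ctx_out);
}

Because every quantized buffer stays resident until gguf_write_to_file(), peak memory grows with the size of the output model; that appears to be what the "temp fix until we have stream support in gguf" TODO refers to.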