 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
+#define LLAMA_FILE_VERSION_GGJT_V3 3
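+// these constants identify the legacy pre-gguf "ggjt" v3 container written by save_as_llama_model() below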
+
 //////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
 typedef struct {
     int dim; // transformer dimension
@@ -49,10 +52,10 @@ typedef struct {
     // float* freq_cis_real; // (seq_len, dim/2)
     // float* freq_cis_imag; // (seq_len, dim/2)
     // (optional) classifier weights for the logits, on the last layer
-    // float* wcls;
+    float* wcls;
 } TransformerWeights;
 
-void malloc_weights(TransformerWeights* w, Config* p) {
+void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
     // we calloc instead of malloc to keep valgrind happy
     w->token_embedding_table = new float[p->vocab_size * p->dim]();
     printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size, p->dim, p->vocab_size * p->dim);
@@ -86,9 +89,16 @@ void malloc_weights(TransformerWeights* w, Config* p) {
 
     w->rms_final_weight = new float[p->dim]();
     printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
+
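+    // wcls is allocated only when the checkpoint stores the classifier separately from the embedding table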
+    if (shared_weights) {
+        w->wcls = NULL;
+    } else {
+        w->wcls = new float[p->vocab_size * p->dim]();
+        printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size, p->dim, p->vocab_size * p->dim);
+    }
 }
 
-int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
+int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
     if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
     if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
     if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
@@ -100,6 +110,22 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
     if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
     if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
     if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;
+
+    // Skip freq_cis_real & freq_cis_imag
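+    // (each is seq_len x head_size/2 floats in the llama2.c layout, so seq_len * head_size floats in total)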
+    int head_size = p->dim / p->n_heads;
+    fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR);
+
+    if (!shared_weights && fread(w->wcls, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
+
+    // Check we didn't forget to read anything
+    auto curr = ftell(f);
+    fseek(f, 0, SEEK_END);
+    auto end = ftell(f);
+    if (curr != end) {
+        printf("Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", curr, end);
+        return 1;
+    }
+
     return 0;
 }
 
@@ -115,6 +141,7 @@ void free_weights(TransformerWeights* w) {
     delete w->w2;
     delete w->w3;
     delete w->rms_final_weight;
+    if (w->wcls) delete w->wcls;
 }
 
 void print_sample_weights(TransformerWeights *w){
@@ -131,6 +158,7 @@ void print_sample_weights(TransformerWeights *w){
     printf("%f\n", w->w2[0]);
     printf("%f\n", w->w3[0]);
     printf("%f\n", w->rms_att_weight[0]);
+    if (w->wcls) printf("%f\n", w->wcls[0]);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -509,26 +537,28 @@ bool is_ggml_file(const char *filename) {
 }
 
 void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
-    // heuristic to infer whether vocab is from ggml or from llama2.c vocabulary
-    if (is_ggml_file(filename)) {
-
-        struct llama_context_params llama_params = llama_context_default_params();
-        llama_params.vocab_only = true;
-
-        struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
-        struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
-
-        const int n_vocab = llama_n_vocab(lctx);
-        vocab->id_to_token.resize(n_vocab);
-        for (int i=0; i<n_vocab; ++i) {
-            vocab->id_to_token[i].text = llama_token_get_text(lctx, i);
-            vocab->id_to_token[i].score = llama_token_get_score(lctx, i);
-            vocab->id_to_token[i].type = llama_token_get_type(lctx, i);
-            vocab->token_to_id.emplace(vocab->id_to_token[i].text, i);
-        }
-        llama_free(lctx);
-        llama_free_model(lmodel);
-    } else { // assume llama2.c vocabulary
+#pragma message("TODO: implement reading vocabulary using gguf")
+//    // heuristic to infer whether vocab is from ggml or from llama2.c vocabulary
+//    if (is_ggml_file(filename)) {
+//
+//        struct llama_context_params llama_params = llama_context_default_params();
+//        llama_params.vocab_only = true;
+//
+//        struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
+//        struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
+//
+//        const int n_vocab = llama_n_vocab(lctx);
+//        vocab->id_to_token.resize(n_vocab);
+//        for (int i=0; i<n_vocab; ++i) {
+//            vocab->id_to_token[i].text = llama_token_get_text(lctx, i);
+//            vocab->id_to_token[i].score = llama_token_get_score(lctx, i);
+//            vocab->id_to_token[i].type = llama_token_get_type(lctx, i);
+//            vocab->token_to_id.emplace(vocab->id_to_token[i].text, i);
+//        }
+//        llama_free(lctx);
+//        llama_free_model(lmodel);
+//    } else
+    { // assume llama2.c vocabulary
         printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename);
         llama_file file(filename, "rb");
         const int n_vocab = config->vocab_size;
@@ -538,6 +568,12 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
             float_t score = file.read_f32();
             uint32_t len = file.read_u32();
             std::string text = file.read_string(len);
+            // Special-case handling of <0xXX> single byte tokens.
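+            // e.g. the token "<0x0A>" is replaced by the raw byte 0x0A (newline)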
+            char byte_val;
+            if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
+                char cstr[2] = { byte_val, 0 };
+                text = cstr;
+            }
             vocab->id_to_token[i].text = text;
             vocab->id_to_token[i].score = score;
             vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED;
@@ -589,83 +625,80 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
     }
 
 #pragma message("TODO: implement file saving using gguf")
-    (void) vocab;
-    (void) model;
-    (void) w;
-//    // write_magic
-//    file.write_u32(LLAMA_FILE_MAGIC);   // magic
-//    file.write_u32(LLAMA_FILE_VERSION); // version
-//    // write_hparams
-//    file.write_u32(model->hparams.n_vocab);
-//    file.write_u32(model->hparams.n_embd);
-//    file.write_u32(model->hparams.n_mult);
-//    file.write_u32(model->hparams.n_head);
-//    file.write_u32(model->hparams.n_layer);
-//    file.write_u32(model->hparams.n_rot);
-//    file.write_u32(LLAMA_FTYPE_ALL_F32);
-//
-//    // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
-//    uint32_t n_vocab = model->hparams.n_vocab;
-//    for (uint32_t i = 0; i < n_vocab; i++) {
-//        const auto & token_data = vocab->id_to_token.at(i);
-//        file.write_u32((uint32_t) token_data.tok.size());
-//        file.write_raw(token_data.tok.data(), token_data.tok.size());
-//        file.write_raw(&token_data.score, sizeof(token_data.score));
-//    }
-//
-//    // stuff AK weights into GG weights one by one.
-//    // w->token_embedding_table -> model->tok_embeddings
-//    // float*                   -> struct ggml_tensor
-//    stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
-//    stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);
-//
-//    stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
-//    //print_row(model->norm, 0);
-//
-//    // for rms-att-weight
-//    int row_length = model->hparams.n_embd;
-//    const auto & hparams = model->hparams;
-//    //int n_ff = model->hparams.n_embd;
-//    int n_ff = get_n_ff(&hparams);
-//
-//    for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
-//        auto & layer = model->layers[i];
-//        // 1d
-//        stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
-//        stuff_karpathy_weights_into_gg(layer.ffn_norm      , &w->rms_ffn_weight[i*row_length]);
-//
-//        // from 3d matrix layer x dim x dim to 2d matrix dim x dim
-//        stuff_karpathy_weights_into_gg(layer.wq            , &w->wq[i*row_length*row_length]);
-//        stuff_karpathy_weights_into_gg(layer.wk            , &w->wk[i*row_length*row_length]);
-//        stuff_karpathy_weights_into_gg(layer.wv            , &w->wv[i*row_length*row_length]);
-//        stuff_karpathy_weights_into_gg(layer.wo            , &w->wo[i*row_length*row_length]);
-//
-//        stuff_karpathy_weights_into_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
-//        stuff_karpathy_weights_into_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
-//        stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[i*row_length*n_ff]);
-//    }
-//    // write tensors
-//    write_tensor(&file, model->tok_embeddings);
-//    write_tensor(&file, model->norm);
-//    write_tensor(&file, model->output); // ?
-//    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
-//        auto & layer = model->layers[i];
-//
-//        write_tensor(&file, layer.attention_norm);
-//        write_tensor(&file, layer.wq);
-//        write_tensor(&file, layer.wk);
-//        write_tensor(&file, layer.wv);
-//        write_tensor(&file, layer.wo);
-//        write_tensor(&file, layer.ffn_norm);
-//        write_tensor(&file, layer.w1);
-//        write_tensor(&file, layer.w2);
-//        write_tensor(&file, layer.w3);
-//    }
+    // write_magic
+    file.write_u32(LLAMA_FILE_MAGIC_GGJT);      // magic
+    file.write_u32(LLAMA_FILE_VERSION_GGJT_V3); // version
+    // write_hparams
+    file.write_u32(model->hparams.n_vocab);
+    file.write_u32(model->hparams.n_embd);
+    file.write_u32(model->hparams.n_mult);
+    file.write_u32(model->hparams.n_head);
+    file.write_u32(model->hparams.n_layer);
+    file.write_u32(model->hparams.n_rot);
+    file.write_u32(LLAMA_FTYPE_ALL_F32);
+
+    // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
+    uint32_t n_vocab = model->hparams.n_vocab;
+    for (uint32_t i = 0; i < n_vocab; i++) {
+        const auto & token_data = vocab->id_to_token.at(i);
+        file.write_u32((uint32_t) token_data.text.size());
+        file.write_raw(token_data.text.data(), token_data.text.size());
+        file.write_raw(&token_data.score, sizeof(token_data.score));
+    }
+
+    // stuff AK weights into GG weights one by one.
+    // w->token_embedding_table -> model->tok_embeddings
+    // float*                   -> struct ggml_tensor
+    stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
+    stuff_karpathy_weights_into_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
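+    // when wcls is NULL the output classifier is tied to the token embedding table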
+
+    stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
+    //print_row(model->norm, 0);
+
+    // for rms-att-weight
+    int row_length = model->hparams.n_embd;
+    const auto & hparams = model->hparams;
+    //int n_ff = model->hparams.n_embd;
+    int n_ff = get_n_ff(&hparams);
+
+    for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
+        auto & layer = model->layers[i];
+        // 1d
+        stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
+        stuff_karpathy_weights_into_gg(layer.ffn_norm      , &w->rms_ffn_weight[i*row_length]);
+
+        // from 3d matrix layer x dim x dim to 2d matrix dim x dim
+        stuff_karpathy_weights_into_gg(layer.wq            , &w->wq[i*row_length*row_length]);
+        stuff_karpathy_weights_into_gg(layer.wk            , &w->wk[i*row_length*row_length]);
+        stuff_karpathy_weights_into_gg(layer.wv            , &w->wv[i*row_length*row_length]);
+        stuff_karpathy_weights_into_gg(layer.wo            , &w->wo[i*row_length*row_length]);
+
+        stuff_karpathy_weights_into_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
+        stuff_karpathy_weights_into_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
+        stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[i*row_length*n_ff]);
+    }
+    // write tensors
+    write_tensor(&file, model->tok_embeddings);
+    write_tensor(&file, model->norm);
+    write_tensor(&file, model->output); // ?
+    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
+        auto & layer = model->layers[i];
+
+        write_tensor(&file, layer.attention_norm);
+        write_tensor(&file, layer.wq);
+        write_tensor(&file, layer.wk);
+        write_tensor(&file, layer.wv);
+        write_tensor(&file, layer.wo);
+        write_tensor(&file, layer.ffn_norm);
+        write_tensor(&file, layer.w1);
+        write_tensor(&file, layer.w2);
+        write_tensor(&file, layer.w3);
+    }
 }
 
 struct train_params get_default_train_params() {
     struct train_params params;
-    params.fn_vocab_model          = "models/ggml-vocab.bin";
+    params.fn_vocab_model          = "tokenizer.bin";
     params.fn_llama2c_output_model = "ak_llama_model.bin";
     params.fn_train_data           = "shakespeare.txt";
     params.fn_checkpoint_in        = "checkpoint.bin";
@@ -718,7 +751,7 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params)
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
     fprintf(stderr, "  -h, --help                       show this help message and exit\n");
-    fprintf(stderr, "  --copy-vocab-from-model FNAME    llama2.c vocabulary or ggml model path from which to copy vocab (default '%s')\n", params->fn_vocab_model);
+    fprintf(stderr, "  --copy-vocab-from-model FNAME    llama2.c vocabulary or ggmlv3 model path from which to copy vocab (default '%s')\n", params->fn_vocab_model);
     fprintf(stderr, "  --llama2c-model FNAME            [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
     fprintf(stderr, "  --llama2c-output-model FNAME     model path to save the converted llama2.c model (default '%s')\n", params->fn_llama2c_output_model);
     fprintf(stderr, "\n");
@@ -791,9 +824,12 @@ int main(int argc, char ** argv) {
         if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
         // read in the config header
         if (fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
+        auto shared_weights = config.vocab_size > 0;
+        config.vocab_size = abs(config.vocab_size);
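+        // llama2.c convention: a negative vocab_size in the header marks a checkpoint whose classifier weights are stored separately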
+
         // read in the Transformer weights
-        malloc_weights(&weights, &config);
-        if (checkpoint_init_weights(&weights, &config, file)) { return 1; }
+        malloc_weights(&weights, &config, shared_weights);
+        if (checkpoint_init_weights(&weights, &config, file, shared_weights)) { return 1; }
         fclose(file);
     }
 