From f4519830edf7e7e8adef5cd1cbe9fdae09c9a6f3 Mon Sep 17 00:00:00 2001 From: Aniket Date: Mon, 24 Jul 2023 22:29:30 -0400 Subject: [PATCH 01/30] first crack at lamma2.c model conversion --- Makefile | 8 +- examples/convert-llama2c/convert-lamma-2c.cpp | 576 ++++++++++++++++++ 2 files changed, 582 insertions(+), 2 deletions(-) create mode 100644 examples/convert-llama2c/convert-lamma-2c.cpp diff --git a/Makefile b/Makefile index fb7c27cd972bb..10343a4f870e5 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Define the default target now so that it is always the first target -BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server embd-input-test +BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c simple server embd-input-test # Binaries only useful for tests TEST_TARGETS = tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0 @@ -330,7 +330,7 @@ libllama.so: llama.o ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) clean: - rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h $(TEST_TARGETS) + rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c embd-input-test build-info.h $(TEST_TARGETS) # # Examples @@ -373,6 +373,10 @@ embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-te train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +convert-llama2c: examples/convert-llama2c/convert-lamma-2c.cpp build-info.h ggml.o llama.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + + build-info.h: $(wildcard .git/index) scripts/build-info.sh @sh scripts/build-info.sh > $@.tmp @if ! 
cmp -s $@.tmp $@; then \
 diff --git a/examples/convert-llama2c/convert-lamma-2c.cpp b/examples/convert-llama2c/convert-lamma-2c.cpp
new file mode 100644
index 0000000000000..88b0619f0b89d
--- /dev/null
+++ b/examples/convert-llama2c/convert-lamma-2c.cpp
@@ -0,0 +1,576 @@
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cstdarg>
+#include <climits>
+#include <cerrno>
+#include <ctime>
+#include <string>
+#include <vector>
+#include <unordered_map>
+#include <stdexcept>
+
+#include "llama.h"
+#include "ggml.h"
+
+typedef struct {
+    int dim; // transformer dimension
+    int hidden_dim; // for ffn layers
+    int n_layers; // number of layers
+    int n_heads; // number of query heads
+    int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery)
+    int vocab_size; // vocabulary size, usually 256 (byte-level)
+    int seq_len; // max sequence length
+} Config;
+
+typedef struct {
+    // token embedding table
+    float* token_embedding_table; // (vocab_size, dim)
+    // weights for rmsnorms
+    float* rms_att_weight; // (layer, dim) rmsnorm weights
+    float* rms_ffn_weight; // (layer, dim)
+    // weights for matmuls
+    float* wq; // (layer, dim, dim)
+    float* wk; // (layer, dim, dim)
+    float* wv; // (layer, dim, dim)
+    float* wo; // (layer, dim, dim)
+    // weights for ffn
+    float* w1; // (layer, hidden_dim, dim)
+    float* w2; // (layer, dim, hidden_dim)
+    float* w3; // (layer, hidden_dim, dim)
+    // final rmsnorm
+    float* rms_final_weight; // (dim,)
+    // freq_cis for RoPE relative positional embeddings
+    float* freq_cis_real; // (seq_len, dim/2)
+    float* freq_cis_imag; // (seq_len, dim/2)
+} TransformerWeights;
+
+#ifdef __GNUC__
+#ifdef __MINGW32__
+__attribute__((format(gnu_printf, 1, 2)))
+#else
+__attribute__((format(printf, 1, 2)))
+#endif
+#endif
+static std::string format(const char * fmt, ...) {
+    va_list ap, ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    GGML_ASSERT(size >= 0 && size < INT_MAX);
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    GGML_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
+}
+struct llama_file {
+    // use FILE * so we don't have to re-open the file to mmap
+    FILE * fp;
+    size_t size;
+
+    llama_file(const char * fname, const char * mode) {
+        fp = std::fopen(fname, mode);
+        if (fp == NULL) {
+            size = 0;
+        } else {
+            seek(0, SEEK_END);
+            size = tell();
+            seek(0, SEEK_SET);
+        }
+    }
+
+    size_t tell() const {
+#ifdef _WIN32
+        __int64 ret = _ftelli64(fp);
+#else
+        long ret = std::ftell(fp);
+#endif
+        GGML_ASSERT(ret != -1); // this really shouldn't fail
+        return (size_t) ret;
+    }
+
+    void seek(size_t offset, int whence) {
+#ifdef _WIN32
+        int ret = _fseeki64(fp, (__int64) offset, whence);
+#else
+        int ret = std::fseek(fp, (long) offset, whence);
+#endif
+        GGML_ASSERT(ret == 0); // same
+    }
+
+    void read_raw(void * ptr, size_t size) {
+        if (size == 0) {
+            return;
+        }
+        errno = 0;
+        std::size_t ret = std::fread(ptr, size, 1, fp);
+        if (ferror(fp)) {
+            throw std::runtime_error(format("read error: %s", strerror(errno)));
+        }
+        if (ret != 1) {
+            throw std::runtime_error(std::string("unexpectedly reached end of file"));
+        }
+    }
+
+    std::uint32_t read_u32() {
+        std::uint32_t ret;
+        read_raw(&ret, sizeof(ret));
+        return ret;
+    }
+
+    std::string read_string(std::uint32_t len) {
+        std::vector<char> chars(len);
+        read_raw(chars.data(), len);
+        return std::string(chars.data(), len);
+    }
+
+    void write_raw(const void * ptr, size_t size) {
+        if (size == 0) {
+            return;
+        }
+        errno = 0;
+        size_t ret = std::fwrite(ptr, size, 1, fp);
+        if (ret != 1) {
throw std::runtime_error(format("write error: %s", strerror(errno))); + } + } + + void write_u32(std::uint32_t val) { + write_raw(&val, sizeof(val)); + } + + ~llama_file() { + if (fp) { + std::fclose(fp); + } + } +}; + +struct my_llama_hparams { + uint32_t n_vocab = 32000; + uint32_t n_ctx = 512; // this is provided as user input? + uint32_t n_embd = 4096; + uint32_t n_mult = 4; + uint32_t n_head = 32; + uint32_t n_layer = 32; + uint32_t n_rot = 64; + + bool operator!=(const my_llama_hparams& other) const { + return memcmp(this, &other, sizeof(my_llama_hparams)); + } +}; +struct my_llama_layer { + // normalization + struct ggml_tensor * attention_norm; + + // attention + struct ggml_tensor * wq; + struct ggml_tensor * wk; + struct ggml_tensor * wv; + struct ggml_tensor * wo; + + // normalization + struct ggml_tensor * ffn_norm; + + // ff + struct ggml_tensor * w1; + struct ggml_tensor * w2; + struct ggml_tensor * w3; +}; +struct my_llama_model { + struct ggml_context * ctx = NULL; + + my_llama_hparams hparams; + + struct ggml_tensor * tok_embeddings; + + struct ggml_tensor * norm; + struct ggml_tensor * output; + + std::vector layers; + + uint32_t train_its = 0; + uint32_t train_samples = 0; + uint32_t train_tokens = 0; +}; +struct llama_vocab { + using id = int32_t; + using token = std::string; + + struct token_score { + token tok; + float score; + }; + + std::unordered_map token_to_id; + std::vector id_to_token; +}; + +struct train_params { + const char * fn_vocab_model; + const char * fn_train_data; + const char * fn_checkpoint_in; + const char * fn_checkpoint_out; + const char * fn_model_out; + + uint32_t seed; + + int n_ctx; + int n_embd; + int n_mult; + int n_head; + int n_layer; + int n_rotmax; + + int n_threads; + int n_batch; + int n_examples; + int n_predict; + + int print_info_interval; + int print_details_interval; + + bool samples_start_after_nl; + bool use_adam; + bool use_flash; + bool use_scratch; + + // only adam + int warmup; + int cos_decay_steps; + float cos_decay_restart; + float cos_decay_alpha; + + int lbfgs_n_iter; + int adam_n_iter; + float adam_alpha; + float adam_decay; + + int mem_model_gb; + int mem_compute_gb; + int mem_compute0_gb; + int mem_compute1_gb; +}; + +struct train_params get_default_train_params() { + struct train_params params; + params.fn_vocab_model = "ggml-vic7b-uncensored-q4_0.bin"; + params.fn_train_data = "shakespeare.txt"; + params.fn_checkpoint_in = "checkpoint.bin"; + params.fn_checkpoint_out = "checkpoint.bin"; + params.fn_model_out = "ggml-checkpoint-f32.bin"; + + params.seed = -1; + + params.n_ctx = 128; + params.n_embd = 256; + params.n_mult = 256; + params.n_head = 8; + params.n_layer = 16; + params.n_rotmax = 64; + + params.n_threads = 6; + params.n_batch = 8; + params.n_examples = 8; + params.n_predict = 1024; + + params.print_info_interval = 1; + params.print_details_interval = 2; + + params.samples_start_after_nl = false; + params.use_adam = true; + params.use_flash = true; + params.use_scratch = true; + + // only adam + params.warmup = 100; + params.cos_decay_steps = 1000; + params.cos_decay_restart = 1.1f; + params.cos_decay_alpha = 0.0f; + + params.lbfgs_n_iter = 16; + params.adam_n_iter = 16; + params.adam_alpha = 1e-3f; + params.adam_decay = 1e-3f; + + params.mem_model_gb = 2; + params.mem_compute_gb = 24; + params.mem_compute0_gb = 8; + params.mem_compute1_gb = 2; + + return params; +} + +void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { + if (tensor == NULL) { + file->write_u32(0); + 
file->write_u32(0); + file->write_u32(GGML_TYPE_F32); + file->seek((0-file->tell()) & 31, SEEK_CUR); + return; + } + const char * name = ggml_get_name(tensor); + uint32_t name_len = strlen(name); + uint32_t nd = tensor->n_dims; + uint32_t ne[4] = { (uint32_t)tensor->ne[0], + (uint32_t)tensor->ne[1], + (uint32_t)tensor->ne[2], + (uint32_t)tensor->ne[3] }; + file->write_u32(nd); + file->write_u32(name_len); + file->write_u32(tensor->type); + file->write_raw(ne, sizeof(ne[0]) * nd); + file->write_raw(name, name_len); + file->seek((0-file->tell()) & 31, SEEK_CUR); + file->write_raw(tensor->data, ggml_nbytes(tensor)); +} + +void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, const char * filename) { + struct llama_file file(filename, "wb"); + if (file.fp == NULL) { + return; + } + + // write_magic + file.write_u32(LLAMA_FILE_MAGIC); // magic + file.write_u32(LLAMA_FILE_VERSION); // version + // write_hparams + file.write_u32(model->hparams.n_vocab); + file.write_u32(model->hparams.n_embd); + file.write_u32(model->hparams.n_mult); + file.write_u32(model->hparams.n_head); + file.write_u32(model->hparams.n_layer); + file.write_u32(model->hparams.n_rot); + file.write_u32(LLAMA_FTYPE_ALL_F32); + // write_vocab + uint32_t n_vocab = model->hparams.n_vocab; + for (uint32_t i = 0; i < n_vocab; i++) { + const auto & token_score = vocab->id_to_token.at(i); + file.write_u32((uint32_t) token_score.tok.size()); + file.write_raw(token_score.tok.data(), token_score.tok.size()); + file.write_raw(&token_score.score, sizeof(token_score.score)); + } + // write tensors + write_tensor(&file, model->tok_embeddings); + write_tensor(&file, model->norm); + write_tensor(&file, model->output); + for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { + auto & layer = model->layers[i]; + + write_tensor(&file, layer.attention_norm); + write_tensor(&file, layer.wq); + write_tensor(&file, layer.wk); + write_tensor(&file, layer.wv); + write_tensor(&file, layer.wo); + write_tensor(&file, layer.ffn_norm); + write_tensor(&file, layer.w1); + write_tensor(&file, layer.w2); + write_tensor(&file, layer.w3); + } +} + +void print_config(Config* p){ + printf("----- Configs extracted from the header -------\n"); + printf("config.dim %d\n", p->dim); + printf("config.hidden_dim %d\n", p->hidden_dim); + printf("config.n_layers %d\n", p->n_layers); + printf("config.n_heads %d\n", p->n_heads ); + printf("config.n_kv_heads %d\n", p->n_kv_heads); + printf("config.vocab_size %d\n", p->vocab_size); + printf("config.seq_len %d\n", p->seq_len); + printf("----------------------------------------------\n"); +} + +void print_sample_weights(TransformerWeights *w){ + printf("----- Quick print of first of the weight vales of all the variables\n"); + printf("%f\n", w->token_embedding_table[0]); + printf("%f\n", w->rms_att_weight[0]); + printf("%f\n", w->rms_ffn_weight[0]); + + printf("%f\n", w->wq[0]); + printf("%f\n", w->wk[0]); + printf("%f\n", w->wv[0]); + printf("%f\n", w->wo[0]); + printf("%f\n", w->w1[0]); + printf("%f\n", w->w2[0]); + printf("%f\n", w->w3[0]); + printf("%f\n", w->rms_att_weight[0]); + printf("%f\n", w->freq_cis_real[0]); + printf("%f\n", w->freq_cis_imag[0]); + printf("------------------------------------------------------------------\n"); + + +} +void malloc_weights(TransformerWeights* w, Config* p) { + // we calloc instead of malloc to keep valgrind happy + w->token_embedding_table = new float[p->vocab_size * p->dim]();//calloc(p->vocab_size * p->dim, sizeof(float)); + w->rms_att_weight = new 
float[p->n_layers * p->dim](); //calloc(p->n_layers * p->dim, sizeof(float));
+    w->rms_ffn_weight = new float[p->n_layers * p->dim](); //calloc(p->n_layers * p->dim, sizeof(float));
+    w->wq = new float[p->n_layers * p->dim * p->dim](); //calloc(p->n_layers * p->dim * p->dim, sizeof(float));
+    w->wk = new float[p->n_layers * p->dim * p->dim](); //calloc(p->n_layers * p->dim * p->dim, sizeof(float));
+    w->wv = new float[p->n_layers * p->dim * p->dim](); //calloc(p->n_layers * p->dim * p->dim, sizeof(float));
+    w->wo = new float[p->n_layers * p->dim * p->dim](); //calloc(p->n_layers * p->dim * p->dim, sizeof(float));
+    w->w1 = new float[p->n_layers * p->hidden_dim * p->dim](); //calloc(p->n_layers * p->hidden_dim * p->dim, sizeof(float));
+    w->w2 = new float[p->n_layers * p->hidden_dim * p->dim](); //calloc(p->n_layers * p->dim * p->hidden_dim, sizeof(float));
+    w->w3 = new float[p->n_layers * p->hidden_dim * p->dim](); //calloc(p->n_layers * p->hidden_dim * p->dim, sizeof(float));
+    w->rms_final_weight = new float[p->dim](); //calloc(p->dim, sizeof(float));
+    w->freq_cis_real = new float[p->seq_len * p->dim / 2](); //calloc(p->seq_len * p->dim / 2, sizeof(float));
+    w->freq_cis_imag = new float[p->seq_len * p->dim / 2](); //calloc(p->seq_len * p->dim / 2, sizeof(float));
+    // ensure all mallocs went fine
+    // if (!w->token_embedding_table || !w->rms_att_weight || !w->rms_ffn_weight
+    //     || !w->wq || !w->wk || !w->wv || !w->wo || !w->w1 || !w->w2 || !w->w3 ||
+    //     !w->rms_final_weight || !w->freq_cis_real || !w->freq_cis_imag) {
+    //     printf("malloc failed!\n");
+    //     exit(1);
+    // }
+}
+
+void free_weights(TransformerWeights* w) {
+    // the buffers above are allocated with new[], so release them with delete[]
+    delete[] w->token_embedding_table;
+    delete[] w->rms_att_weight;
+    delete[] w->rms_ffn_weight;
+    delete[] w->wq;
+    delete[] w->wk;
+    delete[] w->wv;
+    delete[] w->wo;
+    delete[] w->w1;
+    delete[] w->w2;
+    delete[] w->w3;
+    delete[] w->rms_final_weight;
+    delete[] w->freq_cis_real;
+    delete[] w->freq_cis_imag;
+}
+
+int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
+    if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
+    if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
+    if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
+    if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
+    if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
+    if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
+    if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
+    if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
+    if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
+    if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
+    if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;
+    int head_size = p->dim / p->n_heads;
+    if (fread(w->freq_cis_real, sizeof(float), p->seq_len * head_size / 2, f) != static_cast<size_t>(p->seq_len * head_size / 2)) return 1;
+    if (fread(w->freq_cis_imag, sizeof(float), p->seq_len * head_size / 2, f) != static_cast<size_t>(p->seq_len * head_size / 2)) return 1;
+    return 0;
+}
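+
+// layout of karpathy's model.bin, as consumed by checkpoint_init_weights above
+// (sizes in floats, head_size = dim / n_heads; note the file stores freq_cis_*
+// as seq_len x head_size/2, not seq_len x dim/2 as the struct comments say):
+//   Config header: 7 ints (dim, hidden_dim, n_layers, n_heads, n_kv_heads, vocab_size, seq_len)
+//   token_embedding_table: vocab_size * dim
+//   rms_att_weight:        n_layers * dim
+//   wq, wk, wv, wo:        n_layers * dim * dim each
+//   rms_ffn_weight:        n_layers * dim
+//   w1, w2, w3:            n_layers * hidden_dim * dim each
+//   rms_final_weight:      dim
+//   freq_cis_real:         seq_len * head_size / 2
+//   freq_cis_imag:         seq_len * head_size / 2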
+
+int main(int argc, char *argv[]) {
+
+    // poor man's C argparse
+    char *checkpoint = NULL;
+    char *tokenizer = NULL;
+    // float temperature = 0.9f;
+    // 'checkpoint' is a necessary arg
+    if (argc < 3) {
+        printf("Usage: %s <checkpoint_file> <tokenizer_file>\n", argv[0]);
+        return 1;
+    }
+    checkpoint = argv[1];
+    tokenizer = argv[2];
+    // if (argc < 3) {
+    //     printf("Usage: %s <checkpoint_file> <tokenizer_file>\n", argv[0]);
+    //     return 1;
+    // }
+    // temperature is optional
+    // if (argc >= 3) {
+    //     temperature = atof(argv[2]);
+    // }
+    // seed is optional
+    // if (argc >= 4) {
+    //     unsigned int seed = atoi(argv[3]);
+    //     srand(seed);
+    // } else {
+    //     time_t current_time;
+    //     time(&current_time);
+    //     srand((unsigned int)current_time);
+    // }
+
+    // read in the Karpathy model.bin file
+    Config config; // Configs are stashed in the bin file as header
+    TransformerWeights weights;
+
+    {
+        FILE *file = fopen(checkpoint, "rb");
+        if (!file) {
+            printf("Unable to open the checkpoint file %s!\n", checkpoint);
+            return 1;
+        }
+        else {
+            printf("model file opened for reading...\n");
+        }
+        // read in the config header
+        if (fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
+        printf("config file read..\n");
+        print_config(&config);
+        // read in the Transformer weights
+        malloc_weights(&weights, &config);
+        printf("reading the opened model file...\n");
+        if (checkpoint_init_weights(&weights, &config, file)) { return 1; }
+        print_sample_weights(&weights);
+        printf("Closing model file..bye...\n");
+        fclose(file);
+    }
+
+    // read in the tokenizer.bin file
+    char** vocab_ak = (char**)malloc(config.vocab_size * sizeof(char*));
+    {
+        FILE *file = fopen(tokenizer, "rb");
+        if (!file) {
+            printf("Unable to open the tokenizer file tokenizer.bin! 
Run " + "python tokenizer.py to convert tokenizer.model -> tokenizer.bin\n"); + return 1; + } + int len; + printf("karpathy vocab size = %d\n", config.vocab_size); + + for (int i = 0; i < config.vocab_size; i++) { + if(fread(&len, sizeof(int), 1, file) != 1) { return 1; } + vocab_ak[i] = (char *)malloc(len + 1); + if(fread(vocab_ak[i], len, 1, file) != 1) { return 1; } + vocab_ak[i][len] = '\0'; // add the string terminating token + printf("len = %d, %s\n", len, vocab_ak[i]); + + } + fclose(file); + } + + //TODO:------------------------------------------------------------------------------- + struct my_llama_model model; + struct train_params params = get_default_train_params(); + struct llama_context_params llama_params = llama_context_default_params(); + struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, llama_params); + struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params); + struct llama_vocab vocab; + { + std::vector strings; + std::vector scores; + int n_vocab = llama_n_vocab(lctx); + strings.resize(n_vocab, NULL); + scores.resize(n_vocab, 0); + n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab); + GGML_ASSERT(n_vocab == llama_n_vocab(lctx)); + vocab.id_to_token.resize(n_vocab); + for (int i=0; i Date: Tue, 25 Jul 2023 16:36:39 -0400 Subject: [PATCH 02/30] add the new example directory in gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index c1ab6bb6d08a3..aeca1dca75af8 100644 --- a/.gitignore +++ b/.gitignore @@ -39,6 +39,7 @@ models-mnt /perplexity /embedding /train-text-from-scratch +/convert-llama2c /simple /benchmark-matmult /vdot From a9019963a140969ad7834e051ab5ce77ec36a65b Mon Sep 17 00:00:00 2001 From: Aniket Date: Tue, 25 Jul 2023 16:37:38 -0400 Subject: [PATCH 03/30] WIP: super not working attempt atm. will update as I learn more ggml :D --- examples/convert-llama2c/convert-lamma-2c.cpp | 164 ++++++++++++++---- 1 file changed, 132 insertions(+), 32 deletions(-) diff --git a/examples/convert-llama2c/convert-lamma-2c.cpp b/examples/convert-llama2c/convert-lamma-2c.cpp index 88b0619f0b89d..bf282ab86ac4e 100644 --- a/examples/convert-llama2c/convert-lamma-2c.cpp +++ b/examples/convert-llama2c/convert-lamma-2c.cpp @@ -495,7 +495,7 @@ int main(int argc, char *argv[]) { // read in the Karpathy model.bin file Config config; // Configs are stashed in the bin file as header TransformerWeights weights; - + struct my_llama_model model; { FILE *file = fopen(checkpoint, "rb"); if (!file) { @@ -514,15 +514,110 @@ int main(int argc, char *argv[]) { printf("reading the opened model file...\n"); if(checkpoint_init_weights(&weights, &config, file)) { return 1; } print_sample_weights(&weights); + + // copy weights to ggml tensors. + //model.tok_embeddings <<< weights.token_embedding_table; + + printf("Closing model file..bye...\n"); fclose(file); } // read in the tokenizer.bin file + // char** vocab_ak = (char**)malloc(config.vocab_size * sizeof(char*)); + // { + // FILE *file = fopen(tokenizer, "rb"); + // if (!file) { + // printf("Unable to open the tokenizer file tokenizer.bin! 
Run " + // "python tokenizer.py to convert tokenizer.model -> tokenizer.bin\n"); + // return 1; + // } + // int len; + // printf("karpathy vocab size = %d\n", config.vocab_size); + + // for (int i = 0; i < config.vocab_size; i++) { + // if(fread(&len, sizeof(int), 1, file) != 1) { return 1; } + // vocab_ak[i] = (char *)malloc(len + 1); + // if(fread(vocab_ak[i], len, 1, file) != 1) { return 1; } + // vocab_ak[i][len] = '\0'; // add the string terminating token + // printf("len = %d, %s\n", len, vocab_ak[i]); + + // } + // fclose(file); + // } + + //TODO:------------------------------------------------------------------------------- + + // struct train_params params = get_default_train_params(); + // struct llama_context_params llama_params = llama_context_default_params(); + // struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, llama_params); + // struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params); + // struct llama_vocab vocab; + // { + // std::vector strings; + // std::vector scores; + // int n_vocab = llama_n_vocab(lctx); + // strings.resize(n_vocab, NULL); + // scores.resize(n_vocab, 0); + // n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab); + // GGML_ASSERT(n_vocab == llama_n_vocab(lctx)); + // vocab.id_to_token.resize(n_vocab); + // for (int i=0; idim); + // printf("config.hidden_dim %d\n", p->hidden_dim); + // printf("config.n_layers %d\n", p->n_layers); + // printf("config.n_heads %d\n", p->n_heads ); + // printf("config.n_kv_heads %d\n", p->n_kv_heads); + // printf("config.vocab_size %d\n", p->vocab_size); + // printf("config.seq_len %d\n", p->seq_len); + + // file.write_u32(model->hparams.n_vocab); + file.write_u32(config.vocab_size); // 32000 + + // file.write_u32(model->hparams.n_embd); + file.write_u32(config.dim); /// <<<<<<<<<<<<<< NEEDS CHECKING + + // file.write_u32(model->hparams.n_mult); + file.write_u32(config.dim); /// <<<<<<<<<<<<<< JUST PLACEHOLDER + + // file.write_u32(model->hparams.n_head); + file.write_u32(config.n_heads); + + // file.write_u32(model->hparams.n_layer); + file.write_u32(config.n_layers); + + // file.write_u32(model->hparams.n_rot); + file.write_u32(config.dim); /// <<<<<<<<<<<<<< JUST PLACEHOLDER + + file.write_u32(LLAMA_FTYPE_ALL_F32); + + // write_vocab ///////////////////////////////////////////////////////////////// char** vocab_ak = (char**)malloc(config.vocab_size * sizeof(char*)); { - FILE *file = fopen(tokenizer, "rb"); - if (!file) { + FILE *file_tok_ak = fopen(tokenizer, "rb"); + if (!file_tok_ak) { printf("Unable to open the tokenizer file tokenizer.bin! 
Run " "python tokenizer.py to convert tokenizer.model -> tokenizer.bin\n"); return 1; @@ -531,42 +626,47 @@ int main(int argc, char *argv[]) { printf("karpathy vocab size = %d\n", config.vocab_size); for (int i = 0; i < config.vocab_size; i++) { - if(fread(&len, sizeof(int), 1, file) != 1) { return 1; } + if(fread(&len, sizeof(int), 1, file_tok_ak) != 1) { return 1; } + file.write_u32((uint32_t) len); + vocab_ak[i] = (char *)malloc(len + 1); - if(fread(vocab_ak[i], len, 1, file) != 1) { return 1; } + if(fread(vocab_ak[i], len, 1, file_tok_ak) != 1) { return 1; } vocab_ak[i][len] = '\0'; // add the string terminating token - printf("len = %d, %s\n", len, vocab_ak[i]); + file.write_raw(vocab_ak[i], len+1); + float x = 0.0f; + file.write_raw(&x, sizeof(float)); + // printf("len = %d, %s\n", len, vocab_ak[i]); } - fclose(file); + fclose(file_tok_ak); } - //TODO:------------------------------------------------------------------------------- - struct my_llama_model model; - struct train_params params = get_default_train_params(); - struct llama_context_params llama_params = llama_context_default_params(); - struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, llama_params); - struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params); - struct llama_vocab vocab; - { - std::vector strings; - std::vector scores; - int n_vocab = llama_n_vocab(lctx); - strings.resize(n_vocab, NULL); - scores.resize(n_vocab, 0); - n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab); - GGML_ASSERT(n_vocab == llama_n_vocab(lctx)); - vocab.id_to_token.resize(n_vocab); - for (int i=0; ihparams.n_vocab; + // for (uint32_t i = 0; i < n_vocab; i++) { + // const auto & token_score = vocab->id_to_token.at(i); + // file.write_u32((uint32_t) token_score.tok.size()); + // file.write_raw(token_score.tok.data(), token_score.tok.size()); + // file.write_raw(&token_score.score, sizeof(token_score.score)); + // } + ///////////////////////////////////////////////////////////////// - save_as_llama_model(&vocab, &model, params.fn_model_out); + // write tensors + write_tensor(&file, model.tok_embeddings); + // write_tensor(&file, model.norm); + // write_tensor(&file, model.output); + // for (int i = 0; i < config.n_layers; ++i) { + // auto & layer = model.layers[i]; + + // write_tensor(&file, layer.attention_norm); + // write_tensor(&file, layer.wq); + // write_tensor(&file, layer.wk); + // write_tensor(&file, layer.wv); + // write_tensor(&file, layer.wo); + // write_tensor(&file, layer.ffn_norm); + // write_tensor(&file, layer.w1); + // write_tensor(&file, layer.w2); + // write_tensor(&file, layer.w3); + // } printf("\n"); free_weights(&weights); From 912fc590c4050877ccb59be4af8813abcc1fcd51 Mon Sep 17 00:00:00 2001 From: Aniket Date: Fri, 28 Jul 2023 12:25:21 -0400 Subject: [PATCH 04/30] Updated makefile to compile rough tests --- Makefile | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 10343a4f870e5..9f2923ae26464 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Define the default target now so that it is always the first target -BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c simple server embd-input-test +BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c my-tests simple server embd-input-test # Binaries only useful for tests TEST_TARGETS = tests/test-double-float tests/test-grad0 
tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
@@ -330,7 +330,7 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
 clean:
-	rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c embd-input-test build-info.h $(TEST_TARGETS)
+	rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c my-tests embd-input-test build-info.h $(TEST_TARGETS)
 
 #
 # Examples
@@ -376,6 +376,9 @@ train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratc
 convert-llama2c: examples/convert-llama2c/convert-lamma-2c.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
+my-tests: examples/my-tests/my-tests.cpp build-info.h ggml.o llama.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 build-info.h: $(wildcard .git/index) scripts/build-info.sh
 	@sh scripts/build-info.sh > $@.tmp

From 485e62b1e9090fa2d150b2eb8c3402341458a386 Mon Sep 17 00:00:00 2001
From: Aniket
Date: Fri, 28 Jul 2023 12:26:11 -0400
Subject: [PATCH 05/30] Adding a doc that shows mappings that are coded in between llama.c <-> gg

---
 examples/my-tests/mappings.md | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 examples/my-tests/mappings.md

diff --git a/examples/my-tests/mappings.md b/examples/my-tests/mappings.md
new file mode 100644
index 0000000000000..f0a561a88a147
--- /dev/null
+++ b/examples/my-tests/mappings.md
@@ -0,0 +1,31 @@
+Variable mapping from llama.c to ggml llama.cpp
+
+config variables in llama.c:
+`dim`, `vocab_size`, `num_layers`, `num_heads`, `num_kv_heads`, `seq_length`
+
+| llama.c (karpathy) | ggml (gg) | dim |
+| ------------- | ------------- | -- |
+| `dim` | `n_embd` | transformer dim |
+| `hidden_dim` | `n_ff` (calculated as `((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult`) | ff hidden dim |
+| `n_layers` | `n_layer` | number of decoder layers |
+| `n_heads` | `n_head` | number of heads |
+| `n_kv_heads` | `-` | |
+| `vocab_size` | `n_vocab` | |
+| `seq_len` | `-` | |
+| --- | --- | --- |
+| `rms_att_weight` | `attention_norm` | `num_layers` x `dim` |
+| `rms_ffn_weight` | `ffn_norm` | `num_layers` x `dim` |
+| `wq` | `wq` | `num_layers` x `dim` x `dim` |
+| `wk` | `wk` | `num_layers` x `dim` x `dim` |
+| `wv` | `wv` | `num_layers` x `dim` x `dim` |
+| `wo` | `wo` | `num_layers` x `dim` x `dim` |
+| `w1` | `w1` | `num_layers` x `hidden_dim` x `dim` |
+| `w2` | `w2` | `num_layers` x `dim` x `hidden_dim` |
+| `w3` | `w3` | `num_layers` x `hidden_dim` x `dim` |
+| `token_embedding_table` | `tok_embeddings` | `vocab_size` x `dim` |
+| `rms_final_weight` | `?` | `dim` |
+| `freq_cis_real` | `?` | `seq_len` x `dim/2` |
+| `freq_cis_img` | `?` | `seq_len` x `dim/2` |
+
+

From cc5c67be9b0ea757bdd0f7de25f54b7a02880933 Mon Sep 17 00:00:00 2001
From: Aniket
Date: Fri, 28 Jul 2023 12:26:44 -0400
Subject: [PATCH 06/30] adding the rough attempt to convert the model

---
 examples/my-tests/my-tests.cpp | 1820 ++++++++++++++++++++++++++++++++
 1 file changed, 1820 insertions(+)
 create mode 100644 examples/my-tests/my-tests.cpp

diff --git a/examples/my-tests/my-tests.cpp b/examples/my-tests/my-tests.cpp
new file mode 100644
index 0000000000000..0f3e5082703f7
--- /dev/null
+++ b/examples/my-tests/my-tests.cpp
@@ -0,0 +1,1820 @@
+#include "ggml.h"
+#include "llama.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +// static const float rms_norm_eps = 1e-6f; + +typedef struct { + // token embedding table + float* token_embedding_table; // (vocab_size, dim) + // weights for rmsnorms + float* rms_att_weight; // (layer, dim) rmsnorm weights + float* rms_ffn_weight; // (layer, dim) + // weights for matmuls + float* wq; // (layer, dim, dim) + float* wk; // (layer, dim, dim) + float* wv; // (layer, dim, dim) + float* wo; // (layer, dim, dim) + // weights for ffn + float* w1; // (layer, hidden_dim, dim) + float* w2; // (layer, dim, hidden_dim) + float* w3; // (layer, hidden_dim, dim) + // final rmsnorm + float* rms_final_weight; // (dim,) + // freq_cis for RoPE relatively positional embeddings + float* freq_cis_real; // (seq_len, dim/2) + float* freq_cis_imag; // (seq_len, dim/2) +} TransformerWeights; + +struct random_normal_distribution { + std::mt19937 gen; + std::normal_distribution rd; + float min; + float max; +}; + +struct random_uniform_distribution { + std::mt19937 gen; + std::uniform_real_distribution rd; +}; + +void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) { + rnd->gen = std::mt19937(seed); + rnd->rd = std::normal_distribution{mean, std}; + rnd->min = min; + rnd->max = max; +} + +void init_random_uniform_distribution(struct random_uniform_distribution * rnd, int seed, float min, float max) { + rnd->gen = std::mt19937(seed); + rnd->rd = std::uniform_real_distribution{min, max}; +} + +int clamp(const int v, const int min, const int max) { + return ((v < min) ? (min) : (v > max) ? (max) : v); +} + +float fclamp(const float v, const float min, const float max) { + return ((v < min) ? (min) : (v > max) ? 
(max) : v); +} + +float frand() { + return (float)rand()/(float)RAND_MAX; +} + +float frand_normal(struct random_normal_distribution * rnd) { + return fclamp(rnd->rd(rnd->gen), rnd->min, rnd->max); +} + +float frand_uniform(struct random_uniform_distribution * rnd) { + return rnd->rd(rnd->gen); +} + +void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + ggml_graph_compute(graph, &plan); +} + +struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { + float scale = 1.0f; // xavier + switch (tensor->n_dims) { + case 1: + scale /= sqrtf(tensor->ne[0]); + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); + *dst = scale * frand_normal(rnd); + } + break; + case 2: + scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *dst = scale * frand_normal(rnd); + } + } + break; + case 3: + scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); + *dst = scale * frand_normal(rnd); + } + } + } + break; + case 4: + scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); + for (int i3 = 0; i3 < tensor->ne[3]; i3++) { + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); + *dst = scale * frand_normal(rnd); + } + } + } + } + break; + default: + assert(false); + }; + return tensor; +} + +struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) { + switch (tensor->n_dims) { + case 1: + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); + *dst = frand_uniform(rnd); + } + break; + case 2: + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *dst = frand_uniform(rnd); + } + } + break; + case 3: + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); + *dst = frand_uniform(rnd); + } + } + } + break; + case 4: + for (int i3 = 0; i3 < tensor->ne[3]; i3++) { + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); + *dst = frand_uniform(rnd); + } + } + } + } + break; + default: + assert(false); + }; + return tensor; +} + +struct llama_vocab { + using id = int32_t; + using token = std::string; + + struct token_score { + token tok; + float score; + }; + + 
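// token_to_id maps token text to its id; id_to_token stores the
+    // token_score (text and score) for each id
+    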
std::unordered_map token_to_id; + std::vector id_to_token; +}; + +struct my_llama_hparams { + uint32_t n_vocab = 32000; + uint32_t n_ctx = 512; // this is provided as user input? + uint32_t n_embd = 4096; + uint32_t n_mult = 4; + uint32_t n_head = 32; + uint32_t n_layer = 32; + uint32_t n_rot = 64; + + bool operator!=(const my_llama_hparams& other) const { + return memcmp(this, &other, sizeof(my_llama_hparams)); + } +}; + +struct my_llama_layer { + // normalization + struct ggml_tensor * attention_norm; + + // attention + struct ggml_tensor * wq; + struct ggml_tensor * wk; + struct ggml_tensor * wv; + struct ggml_tensor * wo; + + // normalization + struct ggml_tensor * ffn_norm; + + // ff + struct ggml_tensor * w1; + struct ggml_tensor * w2; + struct ggml_tensor * w3; +}; + +struct my_llama_kv_cache { + struct ggml_context * ctx = NULL; + + struct ggml_tensor * k; + struct ggml_tensor * v; + + // llama_ctx_buffer buf; + + int n; // number of tokens currently in the cache +}; + +struct my_llama_model { + struct ggml_context * ctx = NULL; + + my_llama_hparams hparams; + + struct ggml_tensor * tok_embeddings; + + struct ggml_tensor * norm; + struct ggml_tensor * output; + + std::vector layers; + + uint32_t train_its = 0; + uint32_t train_samples = 0; + uint32_t train_tokens = 0; +}; + +uint32_t get_n_ff(const struct my_llama_hparams* hparams) { + const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult; + return n_ff; +} + +void print_params(struct my_llama_hparams * params) { + printf("%s: n_vocab: %d\n", __func__, params->n_vocab); + printf("%s: n_ctx: %d\n", __func__, params->n_ctx); + printf("%s: n_embd: %d\n", __func__, params->n_embd); + printf("%s: n_mult: %d\n", __func__, params->n_mult); + printf("%s: n_head: %d\n", __func__, params->n_head); + printf("%s: n_ff: %d\n", __func__, get_n_ff(params)); + printf("%s: n_layer: %d\n", __func__, params->n_layer); + printf("%s: n_rot: %d\n", __func__, params->n_rot); +} + +void init_model(struct my_llama_model * model) { + const auto & hparams = model->hparams; + + const uint32_t n_embd = hparams.n_embd; + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_vocab = hparams.n_vocab; + + const uint32_t n_ff = get_n_ff(&hparams); + + struct ggml_context * ctx = model->ctx; + + model->train_its = 0; + model->train_samples = 0; + model->train_tokens = 0; + // printf("FROM INIT_MODEL BHAI...\n\n\n"); + // print_params(&model->hparams); + model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); + printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab); + + model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + printf("[%s:GG] Allocating [%d] float space for model->norm\n",__func__,n_embd); + + model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); + printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab); + + // printing the per-layer allocations here so we dont print in the for loop. 
+ printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wq for [%d] layers\n",__func__,n_embd, n_embd, n_embd * n_embd, n_layer); + printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wk for [%d] layers\n",__func__,n_embd, n_embd, n_embd * n_embd, n_layer); + printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wv for [%d] layers\n",__func__,n_embd, n_embd, n_embd * n_embd, n_layer); + printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wo for [%d] layers\n",__func__,n_embd, n_embd, n_embd * n_embd, n_layer); + + printf("[%s:GG] Allocating [%d] float space for layer.ffn_norm for [%d] layers\n",__func__,n_embd, n_layer); + + printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w1 for [%d] layers\n",__func__,n_embd, n_ff, n_embd * n_ff, n_layer); + printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w2 for [%d] layers\n",__func__,n_ff, n_embd, n_ff * n_embd, n_layer); + printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w3 for [%d] layers\n",__func__,n_embd, n_ff, n_embd * n_ff, n_layer); + + + ggml_set_name(model->tok_embeddings, "tok_embeddings.weight"); + ggml_set_name(model->norm, "norm.weight"); + ggml_set_name(model->output, "output.weight"); + + model->layers.resize(n_layer); + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + + std::string layers_i = "layers." + std::to_string(i); + + layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); + layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); + layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); + layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); + + layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); + layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); + layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); + + ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str()); + + ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str()); + ggml_set_name(layer.wk, (layers_i + ".attention.wk.weight").c_str()); + ggml_set_name(layer.wv, (layers_i + ".attention.wv.weight").c_str()); + ggml_set_name(layer.wo, (layers_i + ".attention.wo.weight").c_str()); + + ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str()); + + ggml_format_name(layer.w1, "%s.feed_forward.w1.weight", layers_i.c_str()); + ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str()); + ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str()); + } +} + +void set_param_model(struct my_llama_model * model) { + const auto& hparams = model->hparams; + + const uint32_t n_layer = hparams.n_layer; + + struct ggml_context* ctx = model->ctx; + + ggml_set_param(ctx, model->tok_embeddings); + ggml_set_param(ctx, model->norm); + ggml_set_param(ctx, model->output); + + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + + ggml_set_param(ctx, layer.attention_norm); + ggml_set_param(ctx, layer.wq); + ggml_set_param(ctx, layer.wk); + ggml_set_param(ctx, layer.wv); + ggml_set_param(ctx, layer.wo); + ggml_set_param(ctx, layer.ffn_norm); + ggml_set_param(ctx, layer.w1); + ggml_set_param(ctx, layer.w2); + ggml_set_param(ctx, layer.w3); + } +} + + +bool init_kv_cache(struct my_llama_kv_cache* cache, struct my_llama_model * 
model, int n_batch) { + const auto & hparams = model->hparams; + + const uint32_t n_ctx = hparams.n_ctx; + const uint32_t n_embd = hparams.n_embd; + const uint32_t n_layer = hparams.n_layer; + + const int64_t n_mem = n_layer*n_ctx*n_batch; + const int64_t n_elements = n_embd*n_mem; + + // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); + + // struct ggml_init_params params; + // params.mem_size = cache.buf.size; + // params.mem_buffer = cache.buf.addr; + // params.no_alloc = false; + if (!cache->ctx) { + struct ggml_init_params params; + params.mem_size = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024; + params.mem_buffer = NULL; + params.no_alloc = false; + + cache->ctx = ggml_init(params); + + if (!cache->ctx) { + fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__); + return false; + } + } + + cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); + cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); + + return true; +} + + +void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { + GGML_ASSERT(tensor->n_dims == 1); + GGML_ASSERT(tensor->ne[0] == ne0); +} + +void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) { + GGML_ASSERT(tensor->n_dims == 2); + GGML_ASSERT(tensor->ne[0] == ne0); + GGML_ASSERT(tensor->ne[1] == ne1); +} + +void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) { + GGML_ASSERT(tensor->n_dims == 3); + GGML_ASSERT(tensor->ne[0] == ne0); + GGML_ASSERT(tensor->ne[1] == ne1); + GGML_ASSERT(tensor->ne[2] == ne2); +} + +void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { + GGML_ASSERT(tensor->n_dims == 4); + GGML_ASSERT(tensor->ne[0] == ne0); + GGML_ASSERT(tensor->ne[1] == ne1); + GGML_ASSERT(tensor->ne[2] == ne2); + GGML_ASSERT(tensor->ne[3] == ne3); +} + +void set_f32_3d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int64_t i2, float value) { + float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); + *ptr = value; +} + +void set_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, float value) { + float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *ptr = value; +} + +void set_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int32_t value) { + int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *ptr = value; +} + +float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { + float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + return *ptr; +} + +int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { + int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + return *ptr; +} + +void print_row(struct ggml_tensor * probs, int i) { + for (int k = 0; k < probs->ne[0]; ++k) { + float p = get_f32_2d(probs, k, i); + printf(" %.2f", p); + } + printf("\n"); +} + +void print_matrix(struct ggml_tensor * probs) { + assert(probs->n_dims == 2); + for (int i = 0; i < probs->ne[1]; ++i) { + for (int k = 0; k < probs->ne[0]; ++k) { + float p = get_f32_2d(probs, k, i); + printf(" %.2f", p); + } + printf("\n"); + } +} + + +void print_token(struct llama_context * ctx, llama_token token) { + printf("%s", llama_token_to_str(ctx, token)); +} + +void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) { + for (int i=0; ine[0]; ++i) { + 
int token = ggml_get_i32_1d(tokens, i); + print_token(ctx, token); + } +} + +void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens) { + for (int i1=0; i1ne[1]; ++i1) { + //int num_newline = 0; + for (int i0=0; i0ne[0]; ++i0) { + int token = get_i32_2d(tokens, i0, i1); + print_token(ctx, token); + // bool isnl = (token == llama_token_nl()); + // if (isnl) { + // ++num_newline; + // } + // if (isnl) { + // if (num_newline < 2) { + // print_token(ctx, token); + // } else { + // printf("\\n"); + // } + // } else { + // print_token(ctx, token); + // } + } + printf("\n--\n"); + } +} + +void get_example_targets(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { + int n_tokens = tokens_input->ne[0]; + int n_vocab = target_logits->ne[0]; + + size_t sample = train_samples[example_id % n_train_samples]; + GGML_ASSERT(sample+n_tokens-1 < n_train_data); + + ggml_set_f32(target_logits, -1.0f/n_vocab); + ggml_set_f32(target_probs, 0.0f); + ggml_set_i32_1d(tokens_input, 0, llama_token_bos()); + for (int i=1; in_dims == 2); + GGML_ASSERT(target_logits->n_dims == 3); + GGML_ASSERT(target_probs->n_dims == 3); + int n_vocab = target_logits->ne[0]; + int n_tokens = tokens_input->ne[0]; + int n_batch = tokens_input->ne[1]; + GGML_ASSERT(n_tokens == target_logits->ne[1]); + GGML_ASSERT(n_batch == target_logits->ne[2]); + GGML_ASSERT(n_vocab == target_probs->ne[0]); + GGML_ASSERT(n_tokens == target_probs->ne[1]); + GGML_ASSERT(n_batch == target_probs->ne[2]); + + ggml_set_f32(target_logits, -1.0f/n_vocab); + ggml_set_f32(target_probs, 0.0f); + for (int k=0; kne[0]; + int n_vocab = target_logits->ne[0]; + for (int i=0; i= 0 && size < INT_MAX); + std::vector buf(size + 1); + int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); + GGML_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + return std::string(buf.data(), size); +} + +struct llama_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + llama_file(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + size = 0; + } else { + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + } + + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + GGML_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + GGML_ASSERT(ret == 0); // same + } + + void read_raw(void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + std::size_t ret = std::fread(ptr, size, 1, fp); + if (ferror(fp)) { + throw std::runtime_error(format("read error: %s", strerror(errno))); + } + if (ret != 1) { + throw std::runtime_error(std::string("unexpectedly reached end of file")); + } + } + + std::uint32_t read_u32() { + std::uint32_t ret; + read_raw(&ret, sizeof(ret)); + return ret; + } + + std::string read_string(std::uint32_t len) { + std::vector chars(len); + read_raw(chars.data(), len); + return std::string(chars.data(), len); + } + + void write_raw(const void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + size_t ret = std::fwrite(ptr, size, 1, fp); + if (ret != 1) { + throw std::runtime_error(format("write 
error: %s", strerror(errno))); + } + } + + void write_u32(std::uint32_t val) { + write_raw(&val, sizeof(val)); + } + + ~llama_file() { + if (fp) { + std::fclose(fp); + } + } +}; + +int tokenize_file(struct llama_context * lctx, const char * filename, std::vector& out) { + struct llama_file f(filename, "rb"); + + std::vector buf; + buf.resize(f.size+1); + + f.read_raw(buf.data(), f.size); + buf[f.size] = '\0'; + + out.resize(buf.size()); + + int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false); + if (n_tokens >= 0) { + out.resize(n_tokens); + } + + bool verify = false; + if (verify) { + const char * in = buf.data(); + const char * end = buf.data() + buf.size(); + for (int i = 0; i < (int) out.size(); ++i) { + const char * s = llama_token_to_str(lctx, out[i]); + int len = strlen(s); + if (in >= end) { + printf("%s: unexpected end of original text.\n", __func__); + break; + } + const bool matches = (strncmp(in, s, len) == 0); + if (matches) { + in += len; + } else { + printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s); + } + } + } + + return n_tokens; +} + +void shuffle_ints(int * begin, int * end) { + if (end <= begin) return; + int max=begin[0]; + for (int i=1; i max) { + max = begin[i]; + } + } + std::vector vals; + vals.resize(max+1); + for (int i=0; i candidates; + llama_token_data_array candidates_p; + +}; + +void init_sampler(struct my_llama_sampler * sampler, struct llama_context * ctx) { + sampler->ctx = ctx; + sampler->n_vocab = llama_n_vocab(sampler->ctx); + sampler->n_ctx = llama_n_ctx(sampler->ctx); + sampler->mirostat_mu = 2.0f * sampler->params.mirostat_tau; +} + +llama_token sample(struct my_llama_sampler * sampler, float * logits, const llama_token * last_tokens, int n_last_tokens) { + GGML_ASSERT(sampler->ctx != NULL); + + struct llama_context * ctx = sampler->ctx; + + sampler->candidates.resize(sampler->n_vocab); + for (llama_token token_id = 0; token_id < sampler->n_vocab; ++token_id) { + sampler->candidates[token_id].id = token_id; + sampler->candidates[token_id].logit = logits[token_id]; + sampler->candidates[token_id].p = 0.0; + } + + llama_token_data_array * candidates_p = & sampler->candidates_p; + + candidates_p->data = sampler->candidates.data(); + candidates_p->size = sampler->candidates.size(); + candidates_p->sorted = false; + + const auto params = sampler->params; + + // Apply penalties + const float nl_logit = logits[llama_token_nl()]; + + const int n_last = std::min(std::min(n_last_tokens, params.repeat_last_n), sampler->n_ctx); + + llama_sample_repetition_penalty( + ctx, + candidates_p, + last_tokens + n_last_tokens - n_last, + n_last, + params.repeat_penalty); + llama_sample_frequency_and_presence_penalties( + ctx, + candidates_p, + last_tokens + n_last_tokens - n_last, + n_last, + params.alpha_frequency, + params.alpha_presence); + + if (!params.penalize_nl) { + logits[llama_token_nl()] = nl_logit; + } + + llama_token token = 0; + if (params.temp <= 0) { + // Greedy sampling + token = llama_sample_token_greedy(ctx, candidates_p); + } else { + if (params.mirostat == 1) { + int mirostat_m = 100; + llama_sample_temperature(ctx, candidates_p, params.temp); + token = llama_sample_token_mirostat(ctx, candidates_p, params.mirostat_tau, params.mirostat_eta, mirostat_m, &sampler->mirostat_mu); + } else if (params.mirostat == 2) { + llama_sample_temperature(ctx, candidates_p, params.temp); + token = llama_sample_token_mirostat_v2(ctx, candidates_p, params.mirostat_tau, params.mirostat_eta, 
&sampler->mirostat_mu); + } else { + // Temperature sampling + llama_sample_top_k (ctx, candidates_p, params.top_k, 1); + llama_sample_tail_free (ctx, candidates_p, params.tfs_z, 1); + llama_sample_typical (ctx, candidates_p, params.typical_p, 1); + + llama_sample_top_p (ctx, candidates_p, params.top_p, 1); + llama_sample_temperature (ctx, candidates_p, params.temp); + token = llama_sample_token(ctx, candidates_p); + } + } + return token; +} + +void set_logits_masked(struct ggml_tensor * logits, std::vector& mask, float value) { + GGML_ASSERT(logits->ne[0] == (int64_t) mask.size()); + for (int i2 = 0; i2 < logits->ne[2]; ++i2) { + for (int i1 = 0; i1 < logits->ne[1]; ++i1) { + for (int i0 = 0; i0 < logits->ne[0]; ++i0) { + if (!mask[i0]) continue; + float * ptr = (float *) ((char *) logits->data + i2*logits->nb[2] + i1*logits->nb[1] + i0*logits->nb[0]); + *ptr = value; + } + } + } +} + +void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { + if (tensor == NULL) { + file->write_u32(0); + file->write_u32(0); + file->write_u32(GGML_TYPE_F32); + file->seek((0-file->tell()) & 31, SEEK_CUR); + return; + } + const char * name = ggml_get_name(tensor); + uint32_t name_len = strlen(name); + uint32_t nd = tensor->n_dims; + uint32_t ne[4] = { (uint32_t)tensor->ne[0], + (uint32_t)tensor->ne[1], + (uint32_t)tensor->ne[2], + (uint32_t)tensor->ne[3] }; + file->write_u32(nd); + file->write_u32(name_len); + file->write_u32(tensor->type); + file->write_raw(ne, sizeof(ne[0]) * nd); + file->write_raw(name, name_len); + file->seek((0-file->tell()) & 31, SEEK_CUR); + file->write_raw(tensor->data, ggml_nbytes(tensor)); +} + +void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) { + int32_t nd = file->read_u32(); + GGML_ASSERT(nd == tensor->n_dims); + + uint32_t name_len = file->read_u32(); + enum ggml_type type = (enum ggml_type) file->read_u32(); + GGML_ASSERT(type == tensor->type); + + uint32_t ne[4]; + file->read_raw(ne, sizeof(ne[0]) * nd); + for (int i=0; ine[i]); + } + + std::string name = file->read_string(name_len); + GGML_ASSERT(strncmp(ggml_get_name(tensor), name.c_str(), sizeof(tensor->name)-1) == 0); + + file->seek((0-file->tell()) & 31, SEEK_CUR); + file->read_raw(tensor->data, ggml_nbytes(tensor)); +} + +void write_opt_context(struct llama_file * file, struct ggml_opt_context * opt) { + const uint32_t version = 0; + GGML_ASSERT(opt->nx >= 0); + GGML_ASSERT(opt->iter >= 0); + file->write_u32(version); + file->write_raw(&opt->params, sizeof(opt->params)); + file->write_raw(&opt->nx, sizeof(opt->nx)); + file->write_raw(&opt->iter, sizeof(opt->iter)); + file->write_u32((uint32_t) opt->just_initialized); + switch (opt->params.type) { + case GGML_OPT_ADAM: + { + GGML_ASSERT(opt->adam.x != NULL); + write_tensor(file, opt->adam.x); + write_tensor(file, opt->adam.g1); + write_tensor(file, opt->adam.g2); + write_tensor(file, opt->adam.m); + write_tensor(file, opt->adam.v); + write_tensor(file, opt->adam.mh); + write_tensor(file, opt->adam.vh); + write_tensor(file, opt->adam.pf); + file->write_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); + file->write_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); + file->write_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); + } break; + case GGML_OPT_LBFGS: + { + GGML_ASSERT(opt->adam.x != NULL); + write_tensor(file, opt->lbfgs.x); + write_tensor(file, opt->lbfgs.xp); + write_tensor(file, opt->lbfgs.g); + write_tensor(file, opt->lbfgs.gp); + write_tensor(file, opt->lbfgs.d); + write_tensor(file, 
opt->lbfgs.pf); + write_tensor(file, opt->lbfgs.lmal); + write_tensor(file, opt->lbfgs.lmys); + write_tensor(file, opt->lbfgs.lms); + write_tensor(file, opt->lbfgs.lmy); + file->write_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); + file->write_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); + file->write_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); + file->write_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); + file->write_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); + file->write_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); + } break; + } +} + +void read_opt_context(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { + uint32_t version = file->read_u32(); + GGML_ASSERT(version == 0); + + file->read_raw(&opt->params, sizeof(opt->params)); + file->read_raw(&opt->nx, sizeof(opt->nx)); + ggml_opt_init(ctx, opt, opt->params, opt->nx); + + file->read_raw(&opt->iter, sizeof(opt->iter)); + opt->just_initialized = (bool) file->read_u32(); + + switch (opt->params.type) { + case GGML_OPT_ADAM: + { + read_tensor(file, opt->adam.x); + read_tensor(file, opt->adam.g1); + read_tensor(file, opt->adam.g2); + read_tensor(file, opt->adam.m); + read_tensor(file, opt->adam.v); + read_tensor(file, opt->adam.mh); + read_tensor(file, opt->adam.vh); + if (opt->adam.pf) { read_tensor(file, opt->adam.pf); } + file->read_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); + file->read_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); + file->read_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); + } break; + case GGML_OPT_LBFGS: + { + GGML_ASSERT(opt->adam.x != NULL); + read_tensor(file, opt->lbfgs.x); + read_tensor(file, opt->lbfgs.xp); + read_tensor(file, opt->lbfgs.g); + read_tensor(file, opt->lbfgs.gp); + read_tensor(file, opt->lbfgs.d); + if (opt->lbfgs.pf) { read_tensor(file, opt->lbfgs.pf); } + read_tensor(file, opt->lbfgs.lmal); + read_tensor(file, opt->lbfgs.lmys); + read_tensor(file, opt->lbfgs.lms); + read_tensor(file, opt->lbfgs.lmy); + file->read_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); + file->read_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); + file->read_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); + file->read_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); + file->read_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); + file->read_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); + } break; + } +} + +bool load_checkpoint(struct my_llama_model * model, struct ggml_opt_context * opt, const char * filename, bool init) { + struct llama_file file(filename, "rb"); + + uint32_t magic; + uint32_t version; + + uint32_t train_its = 0; + uint32_t train_samples = 0; + uint32_t train_tokens = 0; + + if (file.fp) { + printf("%s: Loading model from '%s'.\n", __func__, filename); + magic = file.read_u32(); + GGML_ASSERT(magic == 'ggcp'); + version = file.read_u32(); + GGML_ASSERT(version == 0); + train_its = file.read_u32(); + train_samples = file.read_u32(); + train_tokens = file.read_u32(); + model->hparams.n_vocab = file.read_u32(); + model->hparams.n_embd = file.read_u32(); + model->hparams.n_mult = file.read_u32(); + model->hparams.n_head = file.read_u32(); + model->hparams.n_layer = file.read_u32(); + model->hparams.n_rot = file.read_u32(); + print_params(&model->hparams); + } + + if (init) { + init_model(model); + } + + if (file.fp) { + model->train_its = train_its; + model->train_samples = train_samples; + model->train_tokens = train_tokens; + } + + printf("%s: Training iterations: %u.\n", __func__, 
model->train_its);
+    printf("%s: Training samples:    %u.\n", __func__, model->train_samples);
+    printf("%s: Training tokens:     %u.\n", __func__, model->train_tokens);
+
+    if (file.fp) {
+        read_tensor(&file, model->tok_embeddings);
+        read_tensor(&file, model->norm);
+        read_tensor(&file, model->output);
+
+        for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
+            auto & layer = model->layers[i];
+
+            read_tensor(&file, layer.attention_norm);
+            read_tensor(&file, layer.wq);
+            read_tensor(&file, layer.wk);
+            read_tensor(&file, layer.wv);
+            read_tensor(&file, layer.wo);
+            read_tensor(&file, layer.ffn_norm);
+            read_tensor(&file, layer.w1);
+            read_tensor(&file, layer.w2);
+            read_tensor(&file, layer.w3);
+        }
+
+        read_opt_context(&file, model->ctx, opt);
+    }
+
+    return (file.fp != NULL);
+}
+
+void print_sample_weights(TransformerWeights *w){
+    printf("----- Quick print of the first weight value of each variable\n");
+    printf("%f\n", w->token_embedding_table[0]);
+    printf("%f\n", w->rms_att_weight[0]);
+    printf("%f\n", w->rms_ffn_weight[0]);
+
+    printf("%f\n", w->wq[0]);
+    printf("%f\n", w->wk[0]);
+    printf("%f\n", w->wv[0]);
+    printf("%f\n", w->wo[0]);
+    printf("%f\n", w->w1[0]);
+    printf("%f\n", w->w2[0]);
+    printf("%f\n", w->w3[0]);
+    printf("%f\n", w->rms_final_weight[0]);
+    printf("%f\n", w->freq_cis_real[0]);
+    printf("%f\n", w->freq_cis_imag[0]);
+    printf("------------------------------------------------------------------\n");
+}
+
+void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * karpathy_weights){
+    int ct;
+    switch (gg_weights->n_dims){
+        case 1:
+            ct = 0;
+            for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){
+                float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0]);
+                *ptr = karpathy_weights[ct];
+                ct++;
+            }
+            break; // without this break (and the ct++ above), 1d tensors fall through and get overwritten by the 2d case
+        case 2:
+            ct = 0;
+            for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
+                for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
+                    float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1]);
+                    *ptr = karpathy_weights[ct];
+                    ct++;
+                }
+            }
+            break;
+        case 3:
+            ct = 0;
+            for (int i2 = 0; i2 < gg_weights->ne[2]; i2++) {
+                for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
+                    for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
+                        float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1] + i2*gg_weights->nb[2]);
+                        *ptr = karpathy_weights[ct];
+                        ct++;
+                    }
+                }
+            }
+            break;
+    }
+}
+
+void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) {
+    struct llama_file file(filename, "wb");
+    if (file.fp == NULL) {
+        return;
+    }
+    // print_sample_weights(w);
+    // write_magic
+    file.write_u32(LLAMA_FILE_MAGIC);   // magic
+    file.write_u32(LLAMA_FILE_VERSION); // version
+    // write_hparams
+    file.write_u32(model->hparams.n_vocab);
+    file.write_u32(model->hparams.n_embd);
+    file.write_u32(model->hparams.n_mult);
+    file.write_u32(model->hparams.n_head);
+    file.write_u32(model->hparams.n_layer);
+    file.write_u32(model->hparams.n_rot);
+    file.write_u32(LLAMA_FTYPE_ALL_F32);
+    // write_vocab
+    uint32_t n_vocab = model->hparams.n_vocab;
+    for (uint32_t i = 0; i < n_vocab; i++) {
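+        // each vocab record: u32 token byte length, the raw token bytes, then an f32 score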
const auto & token_score = vocab->id_to_token.at(i); + file.write_u32((uint32_t) token_score.tok.size()); + file.write_raw(token_score.tok.data(), token_score.tok.size()); + file.write_raw(&token_score.score, sizeof(token_score.score)); + } + + // stuff AK weights into GG weights one by one. + // w->token_embedding_table -> model->tok_embeddings + // float* -> struct ggml_tensor + stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table); + print_row(model->tok_embeddings, 0); + + // stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight); + // stuff_karpathy_weights_into_gg(model->norm, w->freq_cis_real); // <<<<<<<<<< mostly wrong + // stuff_karpathy_weights_into_gg(model->norm, w->freq_cis_imag); // <<<<<<<<<< mostly wrong + + // for rms-att-weight + int row_length = model->hparams.n_embd; + for (uint32_t i = 0; i < model->hparams.n_layer; ++i){ + auto & layer = model->layers[i]; + // 2d + stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]); + stuff_karpathy_weights_into_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]); + stuff_karpathy_weights_into_gg(layer.wq , &w->wq[i*row_length]); + stuff_karpathy_weights_into_gg(layer.wk , &w->wk[i*row_length]); + stuff_karpathy_weights_into_gg(layer.wv , &w->wv[i*row_length]); + stuff_karpathy_weights_into_gg(layer.wo , &w->wo[i*row_length]); + stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length]); + stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*row_length]); + stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length]); + } + + // write tensors + write_tensor(&file, model->tok_embeddings); + write_tensor(&file, model->norm); + write_tensor(&file, model->output); + for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { + printf(" testing new here %d\n", i); + auto & layer = model->layers[i]; + + write_tensor(&file, layer.attention_norm); + write_tensor(&file, layer.wq); + write_tensor(&file, layer.wk); + write_tensor(&file, layer.wv); + write_tensor(&file, layer.wo); + write_tensor(&file, layer.ffn_norm); + write_tensor(&file, layer.w1); + write_tensor(&file, layer.w2); + write_tensor(&file, layer.w3); + } +} + + + +struct train_params { + const char * fn_vocab_model; + const char * fn_train_data; + const char * fn_checkpoint_in; + const char * fn_checkpoint_out; + const char * fn_model_out; + + uint32_t seed; + + int n_ctx; + int n_embd; + int n_mult; + int n_head; + int n_layer; + int n_rotmax; + + int n_threads; + int n_batch; + int n_examples; + int n_predict; + + int print_info_interval; + int print_details_interval; + + bool samples_start_after_nl; + bool use_adam; + bool use_flash; + bool use_scratch; + + // only adam + int warmup; + int cos_decay_steps; + float cos_decay_restart; + float cos_decay_alpha; + + int lbfgs_n_iter; + int adam_n_iter; + float adam_alpha; + float adam_decay; + + int mem_model_gb; + int mem_compute_gb; + int mem_compute0_gb; + int mem_compute1_gb; +}; + +struct train_params get_default_train_params() { + struct train_params params; + params.fn_vocab_model = "ggml-vic7b-uncensored-q4_0.bin"; + params.fn_train_data = "shakespeare.txt"; + params.fn_checkpoint_in = "checkpoint.bin"; + params.fn_checkpoint_out = "checkpoint.bin"; + params.fn_model_out = "ggml-checkpoint-f32.bin"; + + params.seed = -1; + + params.n_ctx = 128; + params.n_embd = 256; + params.n_mult = 256; + params.n_head = 8; + params.n_layer = 16; + params.n_rotmax = 64; + + params.n_threads = 6; + params.n_batch = 8; + params.n_examples = 8; + params.n_predict = 
1024; + + params.print_info_interval = 1; + params.print_details_interval = 2; + + params.samples_start_after_nl = false; + params.use_adam = true; + params.use_flash = true; + params.use_scratch = true; + + // only adam + params.warmup = 100; + params.cos_decay_steps = 1000; + params.cos_decay_restart = 1.1f; + params.cos_decay_alpha = 0.0f; + + params.lbfgs_n_iter = 16; + params.adam_n_iter = 16; + params.adam_alpha = 1e-3f; + params.adam_decay = 1e-3f; + + params.mem_model_gb = 2; + params.mem_compute_gb = 24; + params.mem_compute0_gb = 8; + params.mem_compute1_gb = 2; + + return params; +} + +void train_print_usage(int /*argc*/, char ** argv, const struct train_params * params) { + fprintf(stderr, "usage: %s [options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " --vocab-model FNAME model path from which to load vocab (default '%s')\n", params->fn_vocab_model); + fprintf(stderr, " --train-data FNAME path from which to load training data (default '%s')\n", params->fn_train_data); + fprintf(stderr, " --checkpoint-in FNAME path from which to load training checkpoint (default '%s')\n", params->fn_checkpoint_in); + fprintf(stderr, " --checkpoint-out FNAME path to save training checkpoint (default '%s')\n", params->fn_checkpoint_out); + fprintf(stderr, " --model-out FNAME path to save ggml model (default '%s')\n", params->fn_model_out); + fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for -1)\n"); + fprintf(stderr, " -c N, --ctx N Context size used during training (default %d)\n", params->n_ctx); + fprintf(stderr, " --embd N Embedding size used for new models (default %d)\n", params->n_embd); + fprintf(stderr, " --mult N Mult size used for new models, influences feedforward size. (default %d)\n", params->n_mult); + fprintf(stderr, " --head N Number of heads for new models (default %d)\n", params->n_head); + fprintf(stderr, " --layer N Number of layers for new models (default %d)\n", params->n_layer); + fprintf(stderr, " --rotmax N Maximal number Rope dimensions for new models (default %d)\n", params->n_rotmax); + fprintf(stderr, " -t N, --threads N Number of threads (default %d)\n", params->n_threads); + fprintf(stderr, " -b N, --batch N Parallel batch size (default %d)\n", params->n_batch); + fprintf(stderr, " -n N, --examples N Number of examples to train (default %d)\n", params->n_examples); + fprintf(stderr, " --predict N Number of tokens to generate after training (default %d)\n", params->n_predict); + fprintf(stderr, " --print-info-interval N Print infos during training each N examples (default %d)\n", params->print_info_interval); + fprintf(stderr, " --print-details-interval N Print details during training each N examples (default %d)\n", params->print_details_interval); + fprintf(stderr, " --samples-after-nl Training samples start after newlines. (default %s)\n", params->samples_start_after_nl ? 
"on" : "off"); + fprintf(stderr, " --use-lbfgs Use LBFGS optimizer instead of default Adam\n"); + fprintf(stderr, " --use-adam Use Adam optimizer (default)\n"); + fprintf(stderr, " --no-flash Don't use flash attention.\n"); + fprintf(stderr, " --use-flash Use flash attention (default)\n"); + fprintf(stderr, " --no-scratch Don't use scratch buffers\n"); + fprintf(stderr, " --use-scratch Use scratch buffers (default)\n"); + fprintf(stderr, " --warmup N Number of warmup steps (default %d)\n", params->warmup); + fprintf(stderr, " --cos-decay-steps N Number of cosine decay steps (default %d)\n", params->cos_decay_steps); + fprintf(stderr, " --cos-decay-restart N Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart); + fprintf(stderr, " --cos-decay-alpha N Cosine decay alpha (default %f)\n", params->cos_decay_alpha); + fprintf(stderr, " --lbfgs-iter N Maximum number of LBFGS optimization iterations for each batch (default %d)\n", params->lbfgs_n_iter); + fprintf(stderr, " --adam-iter N Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter); + fprintf(stderr, " --adam-alpha N Adam learning rate alpha (default %f)\n", params->adam_alpha); + fprintf(stderr, " --adam-decay N AdamW weight decay. Values greater zero enable AdamW instead of regular Adam. (default %f)\n", params->adam_decay); + fprintf(stderr, " --mem-model N Memory to allocate for model and cache in gigabytes. (default %d)\n", params->mem_model_gb); + fprintf(stderr, " --mem-compute N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb); + fprintf(stderr, " --mem-compute0 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute0_gb); + fprintf(stderr, " --mem-compute1 N Memory to allocate for compute in gigabytes. 
(default %d)\n", params->mem_compute1_gb); + fprintf(stderr, "\n"); +} + +bool train_params_parse(int argc, char ** argv, struct train_params * params) { + bool invalid_param = false; + std::string arg; + struct train_params default_params = get_default_train_params(); + const std::string arg_prefix = "--"; + + for (int i = 1; i < argc; i++) { + arg = argv[i]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + + if (arg == "--vocab-model") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_vocab_model = argv[i]; + } else if (arg == "--train-data") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_train_data = argv[i]; + } else if (arg == "--checkpoint-in") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_checkpoint_in = argv[i]; + } else if (arg == "--checkpoint-out") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_checkpoint_out = argv[i]; + } else if (arg == "--model-out") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_model_out = argv[i]; + } else if (arg == "-s" || arg == "--seed") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->seed = std::stoi(argv[i]); + } else if (arg == "-c" || arg == "--ctx") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_ctx = std::stoi(argv[i]); + } else if (arg == "--embd") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_embd = std::stoi(argv[i]); + } else if (arg == "--mult") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_mult = std::stoi(argv[i]); + } else if (arg == "--head") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_head = std::stoi(argv[i]); + } else if (arg == "--layer") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_layer = std::stoi(argv[i]); + } else if (arg == "--rotmax") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rotmax = std::stoi(argv[i]); + } else if (arg == "-t" || arg == "--threads") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_threads = std::stoi(argv[i]); + } else if (arg == "-b" || arg == "--batch") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_batch = std::stoi(argv[i]); + } else if (arg == "-n" || arg == "--examples") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_examples = std::stoi(argv[i]); + } else if (arg == "--predict") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_predict = std::stoi(argv[i]); + } else if (arg == "--print-info-interval") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->print_info_interval = std::stoi(argv[i]); + } else if (arg == "--print-details-interval") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->print_details_interval = std::stoi(argv[i]); + } else if (arg == "--samples-after-nl") { + params->samples_start_after_nl = true; + } else if (arg == "--use-lbfgs") { + params->use_adam = false; + } else if (arg == "--use-adam") { + params->use_adam = true; + } else if (arg == "--no-flash") { + params->use_flash = false; + } else if (arg == "--use-flash") { + params->use_flash = true; + } else if (arg == "--no-scratch") { + params->use_scratch = false; + } else if (arg == "--use-scratch") { + params->use_scratch = true; + } else if (arg == "--warmup") { + if (++i >= argc) { + invalid_param = true; + break; + } + 
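+                // std::stoi throws std::invalid_argument / std::out_of_range on a
+                // malformed number instead of setting invalid_param; a hardened
+                // variant would wrap each conversion (sketch):
+                //   try { params->warmup = std::stoi(argv[i]); }
+                //   catch (const std::exception &) { invalid_param = true; break; }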
params->warmup = std::stoi(argv[i]); + } else if (arg == "--cos-decay-steps") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->cos_decay_steps = std::stof(argv[i]); + } else if (arg == "--cos-decay-restart") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->cos_decay_restart = std::stof(argv[i]); + } else if (arg == "--cos-decay-alpha") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->cos_decay_alpha = std::stof(argv[i]); + } else if (arg == "--lbfgs-iter") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->lbfgs_n_iter = std::stoi(argv[i]); + } else if (arg == "--adam-iter") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_n_iter = std::stoi(argv[i]); + } else if (arg == "--adam-alpha") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_alpha = std::stof(argv[i]); + } else if (arg == "--adam-decay") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_decay = std::stof(argv[i]); + } else if (arg == "--mem-model") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->mem_model_gb = std::stoi(argv[i]); + } else if (arg == "--mem-compute") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->mem_compute_gb = std::stoi(argv[i]); + } else if (arg == "--mem-compute0") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->mem_compute0_gb = std::stoi(argv[i]); + } else if (arg == "--mem-compute1") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->mem_compute1_gb = std::stoi(argv[i]); + } else if (arg == "-h" || arg == "--help") { + train_print_usage(argc, argv, &default_params); + exit(0); + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + train_print_usage(argc, argv, &default_params); + exit(1); + } + } + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + train_print_usage(argc, argv, &default_params); + exit(1); + } + + return true; +} + +typedef struct { + int dim; // transformer dimension + int hidden_dim; // for ffn layers + int n_layers; // number of layers + int n_heads; // number of query heads + int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery) + int vocab_size; // vocabulary size, usually 256 (byte-level) + int seq_len; // max sequence length +} Config; + + + +void malloc_weights(TransformerWeights* w, Config* p) { + // we calloc instead of malloc to keep valgrind happy + w->token_embedding_table = new float[p->vocab_size * p->dim]();//calloc(p->vocab_size * p->dim, sizeof(float)); + printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); + + w->rms_att_weight = new float[p->n_layers * p->dim](); //calloc(p->n_layers * p->dim, sizeof(float)); + printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim); + + w->rms_ffn_weight = new float[p->n_layers * p->dim](); //calloc(p->n_layers * p->dim, sizeof(float)); + printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim); + + w->wq = new float[p->n_layers * p->dim * p->dim](); //calloc(p->n_layers * p->dim * p->dim, sizeof(float)); + printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * 
p->dim * p->dim); + + w->wk = new float[p->n_layers * p->dim * p->dim](); //calloc(p->n_layers * p->dim * p->dim, sizeof(float)); + printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); + + w->wv = new float[p->n_layers * p->dim * p->dim](); //calloc(p->n_layers * p->dim * p->dim, sizeof(float)); + printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); + + w->wo = new float[p->n_layers * p->dim * p->dim](); //calloc(p->n_layers * p->dim * p->dim, sizeof(float)); + printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); + + w->w1 = new float[p->n_layers * p->hidden_dim * p->dim](); //calloc(p->n_layers * p->hidden_dim * p->dim, sizeof(float)); + printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); + + w->w2 = new float[p->n_layers * p->hidden_dim * p->dim](); //calloc(p->n_layers * p->dim * p->hidden_dim, sizeof(float)); + printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); + + w->w3 = new float[p->n_layers * p->hidden_dim * p->dim](); //calloc(p->n_layers * p->hidden_dim * p->dim, sizeof(float)); + printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); + + w->rms_final_weight = new float[p->dim](); //calloc(p->dim, sizeof(float)); + printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim); + + w->freq_cis_real = new float[p->seq_len * p->dim / 2](); //calloc(p->seq_len * p->dim / 2, sizeof(float)); + printf("[%s:AK] Allocating [%d] float space for w->freq_cis_real\n",__func__,p->seq_len * p->dim / 2); + + w->freq_cis_imag = new float[p->seq_len * p->dim / 2](); //calloc(p->seq_len * p->dim / 2, sizeof(float)); + printf("[%s:AK] Allocating [%d] float space for w->freq_cis_imag\n\n",__func__,p->seq_len * p->dim / 2); + + // ensure all mallocs went fine + // if (!w->token_embedding_table || !w->rms_att_weight || !w->rms_ffn_weight + // || !w->wq || !w->wk || !w->wv || !w->wo || !w->w1 || !w->w2 || !w->w3 || + // !w->rms_final_weight || !w->freq_cis_real || !w->freq_cis_imag) { + // printf("malloc failed!\n"); + // exit(1); + // } +} + +int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) { + if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast(p->vocab_size * p->dim)) return 1; + if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast(p->n_layers * p->dim)) return 1; + if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast(p->n_layers * p->dim * p->dim)) return 1; + if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast(p->n_layers * p->dim * p->dim)) return 1; + if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast(p->n_layers * p->dim * p->dim)) return 1; + if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast(p->n_layers * p->dim * p->dim)) return 1; + if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast(p->n_layers * p->dim)) return 1; + if 
(fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast(p->n_layers * p->dim * p->hidden_dim)) return 1; + if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast(p->n_layers * p->hidden_dim * p->dim)) return 1; + if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast(p->n_layers * p->dim * p->hidden_dim)) return 1; + if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast(p->dim)) return 1; + int head_size = p->dim / p->n_heads; + if (fread(w->freq_cis_real, sizeof(float), p->seq_len * head_size / 2, f) != static_cast(p->seq_len * head_size / 2)) return 1; + if (fread(w->freq_cis_imag, sizeof(float), p->seq_len * head_size / 2, f) != static_cast(p->seq_len * head_size / 2)) return 1; + return 0; +} + +void free_weights(TransformerWeights* w) { + free(w->token_embedding_table); + free(w->rms_att_weight); + free(w->rms_ffn_weight); + free(w->wq); + free(w->wk); + free(w->wv); + free(w->wo); + free(w->w1); + free(w->w2); + free(w->w3); + free(w->rms_final_weight); + free(w->freq_cis_real); + free(w->freq_cis_imag); +} + + +void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) { + const auto & hparams = model->hparams; + + const uint32_t n_layer = hparams.n_layer; + + struct random_normal_distribution rnd; + init_random_normal_distribution(&rnd, seed, mean, std, min, max); + + randomize_tensor_normal(model->tok_embeddings, &rnd); + randomize_tensor_normal(model->norm, &rnd); + randomize_tensor_normal(model->output, &rnd); + + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + randomize_tensor_normal(layer.attention_norm, &rnd); + + randomize_tensor_normal(layer.wq, &rnd); + randomize_tensor_normal(layer.wk, &rnd); + randomize_tensor_normal(layer.wv, &rnd); + randomize_tensor_normal(layer.wo, &rnd); + + randomize_tensor_normal(layer.ffn_norm, &rnd); + + randomize_tensor_normal(layer.w1, &rnd); + randomize_tensor_normal(layer.w2, &rnd); + randomize_tensor_normal(layer.w3, &rnd); + } +} + +int main(int argc, char ** argv) { + Config config; + TransformerWeights weights; + { + FILE *file = fopen("/Users/aniket/Projects/karpathy/llama2.c/out/model.bin", "rb"); + if (!file) { + printf("Unable to open the checkpoint file %s!\n", "/Users/aniket/Projects/karpathy/llama2.c/out/model.bin"); + return 1; + } + else{ + printf("model file opened for reading...\n"); + } + // read in the config header + if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; } + printf("config file read..\n"); + + // read in the Transformer weights + malloc_weights(&weights, &config); + printf("reading the opened model file...\n"); + if(checkpoint_init_weights(&weights, &config, file)) { return 1; } + + fclose(file); + + } + ////////////// Loads default train parameters /////////////////////////// + struct train_params params = get_default_train_params(); + printf("params.n_ctx %d\n", params.n_ctx); + printf("params.n_embd %d\n", params.n_embd); + printf("params.fn_vocab_model %s\n", params.fn_vocab_model); + + if (!train_params_parse(argc, argv, ¶ms)) { + return 1; + } + + // Seed not needed here. 
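+    // (Conversion copies the llama2.c weights verbatim and uses no RNG unless
+    // randomize_model() is enabled, so the result is deterministic without a seed.)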
+ // if (params.seed == LLAMA_DEFAULT_SEED) { + // params.seed = time(NULL); + // } + // printf("[%s]: seed: %u\n", __func__, params.seed); + // srand(params.seed); + //////////////////////////////////////////////////////////////////////////////////// + + struct llama_context_params llama_params = llama_context_default_params(); + llama_params.vocab_only = true; + + struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, llama_params); + struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params); + + struct llama_vocab vocab; + { + std::vector strings; + std::vector scores; + int n_vocab = llama_n_vocab(lctx); + printf("nvocab = %d\n", n_vocab); + strings.resize(n_vocab, NULL); + scores.resize(n_vocab, 0); + n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab); + GGML_ASSERT(n_vocab == llama_n_vocab(lctx)); + vocab.id_to_token.resize(n_vocab); + for (int i=0; i train_tokens; + if (tokenize_file(lctx, params.fn_train_data, train_tokens) < 0) { + fprintf(stderr, "%s: failed to tokenize file '%s'\n", __func__, params.fn_train_data); + } + printf("%s: number of training tokens: %d\n", __func__, (int) train_tokens.size()); + + struct my_llama_model model; + + model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx); + model.hparams.n_ctx = params.n_ctx; + model.hparams.n_embd = config.dim; //params.n_embd; + model.hparams.n_mult = params.n_mult; + model.hparams.n_head = config.n_heads; //params.n_head; + model.hparams.n_layer = config.n_layers; //params.n_layer; + model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head); + + print_params(&model.hparams); + struct ggml_init_params lcparams; + lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb); + lcparams.mem_buffer = NULL; + lcparams.no_alloc = false; + + model.ctx = ggml_init(lcparams); + + init_model(&model); + // randomize_model(&model, params.seed, 0.0f, 1.0f, -1.0f, +1.0f); + save_as_llama_model(&vocab, &model, &weights, "ak_model.bin"); + + // llama_free(lctx); + llama_free_model(lmodel); + ggml_free(model.ctx); + // free(&weights); + return 0; +} From b3aa1073ab4bbdccd3b6d93c8e5a153ed2a7be89 Mon Sep 17 00:00:00 2001 From: Aniket Date: Fri, 28 Jul 2023 16:08:09 -0400 Subject: [PATCH 07/30] saving the file with all the variables found in llama.c model --- .../convert-llama2c-to-ggml.cpp | 928 ++++++++++++++++++ 1 file changed, 928 insertions(+) create mode 100644 examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp new file mode 100644 index 0000000000000..c81c773177631 --- /dev/null +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -0,0 +1,928 @@ +#include "ggml.h" +#include "llama.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +//////////////////////////////////////// llama.c model structs and functions to load models, alloc memory etc. 
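+// The llama2.c "model.bin" checkpoint, as assumed by the reads in
+// checkpoint_init_weights() below, is a raw little-endian dump: a Config header
+// of seven ints, immediately followed by float32 arrays in the order of the
+// TransformerWeights fields. A minimal standalone reader built on that
+// assumption would look like:
+//
+//     Config cfg;
+//     FILE * f = fopen(path, "rb");   // 'path' supplied by the caller
+//     if (f && fread(&cfg, sizeof(Config), 1, f) == 1) {
+//         std::vector<float> tok_emb((size_t) cfg.vocab_size * cfg.dim);
+//         if (fread(tok_emb.data(), sizeof(float), tok_emb.size(), f) != tok_emb.size()) {
+//             // handle a short read here
+//         }
+//         // ...the remaining arrays follow back to back in the same order...
+//     }
+//     if (f) fclose(f);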
+typedef struct { + int dim; // transformer dimension + int hidden_dim; // for ffn layers + int n_layers; // number of layers + int n_heads; // number of query heads + int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery) + int vocab_size; // vocabulary size, usually 256 (byte-level) + int seq_len; // max sequence length +} Config; + +typedef struct { + // token embedding table + float* token_embedding_table; // (vocab_size, dim) + // weights for rmsnorms + float* rms_att_weight; // (layer, dim) rmsnorm weights + float* rms_ffn_weight; // (layer, dim) + // weights for matmuls + float* wq; // (layer, dim, dim) + float* wk; // (layer, dim, dim) + float* wv; // (layer, dim, dim) + float* wo; // (layer, dim, dim) + // weights for ffn + float* w1; // (layer, hidden_dim, dim) + float* w2; // (layer, dim, hidden_dim) + float* w3; // (layer, hidden_dim, dim) + // final rmsnorm + float* rms_final_weight; // (dim,) + // freq_cis for RoPE relatively positional embeddings + float* freq_cis_real; // (seq_len, dim/2) + float* freq_cis_imag; // (seq_len, dim/2) + // (optional) classifier weights for the logits, on the last layer + float* wcls; +} TransformerWeights; + +int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) { + if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast(p->vocab_size * p->dim)) return 1; + if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast(p->n_layers * p->dim)) return 1; + if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast(p->n_layers * p->dim * p->dim)) return 1; + if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast(p->n_layers * p->dim * p->dim)) return 1; + if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast(p->n_layers * p->dim * p->dim)) return 1; + if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast(p->n_layers * p->dim * p->dim)) return 1; + if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast(p->n_layers * p->dim)) return 1; + if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast(p->n_layers * p->dim * p->hidden_dim)) return 1; + if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast(p->n_layers * p->hidden_dim * p->dim)) return 1; + if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast(p->n_layers * p->dim * p->hidden_dim)) return 1; + if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast(p->dim)) return 1; + int head_size = p->dim / p->n_heads; + if (fread(w->freq_cis_real, sizeof(float), p->seq_len * head_size / 2, f) != static_cast(p->seq_len * head_size / 2)) return 1; + if (fread(w->freq_cis_imag, sizeof(float), p->seq_len * head_size / 2, f) != static_cast(p->seq_len * head_size / 2)) return 1; + return 0; +} + +void malloc_weights(TransformerWeights* w, Config* p) { + // we calloc instead of malloc to keep valgrind happy + w->token_embedding_table = new float[p->vocab_size * p->dim]();//calloc(p->vocab_size * p->dim, sizeof(float)); + printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); + + w->rms_att_weight = new float[p->n_layers * p->dim](); //calloc(p->n_layers * p->dim, sizeof(float)); + printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for 
w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim); + + w->rms_ffn_weight = new float[p->n_layers * p->dim](); //calloc(p->n_layers * p->dim, sizeof(float)); + printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim); + + w->wq = new float[p->n_layers * p->dim * p->dim](); //calloc(p->n_layers * p->dim * p->dim, sizeof(float)); + printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); + + w->wk = new float[p->n_layers * p->dim * p->dim](); //calloc(p->n_layers * p->dim * p->dim, sizeof(float)); + printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); + + w->wv = new float[p->n_layers * p->dim * p->dim](); //calloc(p->n_layers * p->dim * p->dim, sizeof(float)); + printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); + + w->wo = new float[p->n_layers * p->dim * p->dim](); //calloc(p->n_layers * p->dim * p->dim, sizeof(float)); + printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); + + w->w1 = new float[p->n_layers * p->hidden_dim * p->dim](); //calloc(p->n_layers * p->hidden_dim * p->dim, sizeof(float)); + printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); + + w->w2 = new float[p->n_layers * p->hidden_dim * p->dim](); //calloc(p->n_layers * p->dim * p->hidden_dim, sizeof(float)); + printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); + + w->w3 = new float[p->n_layers * p->hidden_dim * p->dim](); //calloc(p->n_layers * p->hidden_dim * p->dim, sizeof(float)); + printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); + + w->rms_final_weight = new float[p->dim](); //calloc(p->dim, sizeof(float)); + printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim); + + w->freq_cis_real = new float[p->seq_len * p->dim / 2](); //calloc(p->seq_len * p->dim / 2, sizeof(float)); + printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->freq_cis_real\n",__func__,p->seq_len, p->dim / 2, p->seq_len * p->dim / 2); + + w->freq_cis_imag = new float[p->seq_len * p->dim / 2](); //calloc(p->seq_len * p->dim / 2, sizeof(float)); + printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->freq_cis_imag\n\n",__func__,p->seq_len, p->dim / 2, p->seq_len * p->dim / 2); + + // ensure all mallocs went fine + // if (!w->token_embedding_table || !w->rms_att_weight || !w->rms_ffn_weight + // || !w->wq || !w->wk || !w->wv || !w->wo || !w->w1 || !w->w2 || !w->w3 || + // !w->rms_final_weight || !w->freq_cis_real || !w->freq_cis_imag) { + // printf("malloc failed!\n"); + // exit(1); + // } +} + +void free_weights(TransformerWeights* w) { + free(w->token_embedding_table); + free(w->rms_att_weight); + free(w->rms_ffn_weight); + free(w->wq); + free(w->wk); + free(w->wv); + free(w->wo); + free(w->w1); + free(w->w2); + free(w->w3); + free(w->rms_final_weight); + free(w->freq_cis_real); + 
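+    // note on the free() calls in this function: these arrays were allocated with
+    // new float[...](), so the strictly matching release would be delete[]
+    // (e.g. delete[] w->freq_cis_imag;); calling free() on new[] storage is
+    // undefined behavior.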
free(w->freq_cis_imag); +} + +void print_sample_weights(TransformerWeights *w){ + printf("----- Quick print of first of the weight vales of all the variables\n"); + printf("%f\n", w->token_embedding_table[0]); + printf("%f\n", w->rms_att_weight[0]); + printf("%f\n", w->rms_ffn_weight[0]); + + printf("%f\n", w->wq[0]); + printf("%f\n", w->wk[0]); + printf("%f\n", w->wv[0]); + printf("%f\n", w->wo[0]); + printf("%f\n", w->w1[0]); + printf("%f\n", w->w2[0]); + printf("%f\n", w->w3[0]); + printf("%f\n", w->rms_att_weight[0]); + printf("%f\n", w->freq_cis_real[0]); + printf("%f\n", w->freq_cis_imag[0]); + printf("------------------------------------------------------------------\n"); + + +} +//////////////////////////////////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////// ggml structs and functions required to load models, configs and save the model. + +struct llama_vocab { + using id = int32_t; + using token = std::string; + + struct token_score { + token tok; + float score; + }; + + std::unordered_map token_to_id; + std::vector id_to_token; +}; + +struct my_llama_hparams { + uint32_t n_vocab = 32000; + uint32_t n_ctx = 512; // this is provided as user input? + uint32_t n_embd = 4096; + uint32_t n_mult = 4; + uint32_t n_head = 32; + uint32_t n_layer = 32; + uint32_t n_rot = 64; + + bool operator!=(const my_llama_hparams& other) const { + return memcmp(this, &other, sizeof(my_llama_hparams)); + } +}; + +struct my_llama_layer { + // normalization + struct ggml_tensor * attention_norm; + + // attention + struct ggml_tensor * wq; + struct ggml_tensor * wk; + struct ggml_tensor * wv; + struct ggml_tensor * wo; + + // normalization + struct ggml_tensor * ffn_norm; + + // ff + struct ggml_tensor * w1; + struct ggml_tensor * w2; + struct ggml_tensor * w3; +}; + +struct my_llama_model { + struct ggml_context * ctx = NULL; + + my_llama_hparams hparams; + + struct ggml_tensor * tok_embeddings; + + struct ggml_tensor * norm; + struct ggml_tensor * output; + + struct ggml_tensor * freq_cis_real; + struct ggml_tensor * freq_cis_imag; + + std::vector layers; + + uint32_t train_its = 0; + uint32_t train_samples = 0; + uint32_t train_tokens = 0; +}; + +uint32_t get_n_ff(const struct my_llama_hparams* hparams) { + const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult; + return n_ff; +} + +void print_params(struct my_llama_hparams * params) { + printf("%s: n_vocab: %d\n", __func__, params->n_vocab); + printf("%s: n_ctx: %d\n", __func__, params->n_ctx); + printf("%s: n_embd: %d\n", __func__, params->n_embd); + printf("%s: n_mult: %d\n", __func__, params->n_mult); + printf("%s: n_head: %d\n", __func__, params->n_head); + printf("%s: n_ff: %d\n", __func__, get_n_ff(params)); + printf("%s: n_layer: %d\n", __func__, params->n_layer); + printf("%s: n_rot: %d\n", __func__, params->n_rot); +} + +void init_model(struct my_llama_model * model) { + const auto & hparams = model->hparams; + + const uint32_t n_embd = hparams.n_embd; + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_vocab = hparams.n_vocab; + + const uint32_t n_ff = get_n_ff(&hparams); + + struct ggml_context * ctx = model->ctx; + + model->train_its = 0; + model->train_samples = 0; + model->train_tokens = 0; + + model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); + printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab); + + 
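+    // ggml orders dimensions fastest-first: ne[0] is the row length. The llama2.c
+    // (vocab_size, dim) embedding table therefore maps onto a ggml 2d tensor
+    // created with ne[0] = n_embd and ne[1] = n_vocab, as above.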
model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + printf("[%s:GG] Allocating [%d] float space for model->norm\n",__func__,n_embd); + + model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); + printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab); + + model->freq_cis_real = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd/2); + printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->freq_cis_real\n",__func__,n_embd, n_embd / 2, n_embd * n_embd / 2); + + model->freq_cis_imag = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd/2); + printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->freq_cis_imag\n",__func__,n_embd, n_embd / 2, n_embd * n_embd / 2); + + // printing the per-layer allocations here so we dont print in the for loop. + printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wq for [%d] layers\n",__func__,n_embd, n_embd, n_embd * n_embd, n_layer); + printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wk for [%d] layers\n",__func__,n_embd, n_embd, n_embd * n_embd, n_layer); + printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wv for [%d] layers\n",__func__,n_embd, n_embd, n_embd * n_embd, n_layer); + printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wo for [%d] layers\n",__func__,n_embd, n_embd, n_embd * n_embd, n_layer); + + printf("[%s:GG] Allocating [%d] float space for layer.ffn_norm for [%d] layers\n",__func__,n_embd, n_layer); + + printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w1 for [%d] layers\n",__func__,n_embd, n_ff, n_embd * n_ff, n_layer); + printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w2 for [%d] layers\n",__func__,n_ff, n_embd, n_ff * n_embd, n_layer); + printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w3 for [%d] layers\n",__func__,n_embd, n_ff, n_embd * n_ff, n_layer); + + + ggml_set_name(model->tok_embeddings, "tok_embeddings.weight"); + ggml_set_name(model->norm, "norm.weight"); + ggml_set_name(model->output, "output.weight"); + ggml_set_name(model->freq_cis_real, "output.freq_cis_real"); + ggml_set_name(model->freq_cis_imag, "output.freq_cis_imag"); + + model->layers.resize(n_layer); + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + + std::string layers_i = "layers." 
+ std::to_string(i); + + layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); + layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); + layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); + layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); + + layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); + layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); + layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); + + ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str()); + + ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str()); + ggml_set_name(layer.wk, (layers_i + ".attention.wk.weight").c_str()); + ggml_set_name(layer.wv, (layers_i + ".attention.wv.weight").c_str()); + ggml_set_name(layer.wo, (layers_i + ".attention.wo.weight").c_str()); + + ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str()); + + ggml_format_name(layer.w1, "%s.feed_forward.w1.weight", layers_i.c_str()); + ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str()); + ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str()); + } +} + +void set_f32_3d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int64_t i2, float value) { + float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); + *ptr = value; +} + +void set_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, float value) { + float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *ptr = value; +} + +void set_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int32_t value) { + int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *ptr = value; +} + +float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { + float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + return *ptr; +} + +int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { + int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + return *ptr; +} + +void print_row(struct ggml_tensor * probs, int i) { + for (int k = 0; k < probs->ne[0]; ++k) { + float p = get_f32_2d(probs, k, i); + printf(" %.2f", p); + } + printf("\n"); +} + +void print_matrix(struct ggml_tensor * probs) { + assert(probs->n_dims == 2); + for (int i = 0; i < probs->ne[1]; ++i) { + for (int k = 0; k < probs->ne[0]; ++k) { + float p = get_f32_2d(probs, k, i); + printf(" %.2f", p); + } + printf("\n"); + } +} + +void print_token(struct llama_context * ctx, llama_token token) { + printf("%s", llama_token_to_str(ctx, token)); +} + +void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) { + for (int i=0; ine[0]; ++i) { + int token = ggml_get_i32_1d(tokens, i); + print_token(ctx, token); + } +} + +void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens) { + for (int i1=0; i1ne[1]; ++i1) { + //int num_newline = 0; + for (int i0=0; i0ne[0]; ++i0) { + int token = get_i32_2d(tokens, i0, i1); + print_token(ctx, token); + // bool isnl = (token == llama_token_nl()); + // if (isnl) { + // ++num_newline; + // } + // if (isnl) { + // if (num_newline < 2) { + // print_token(ctx, token); + // } else { + // printf("\\n"); + // } + // } else { + // 
print_token(ctx, token); + // } + } + printf("\n--\n"); + } +} + +#ifdef __GNUC__ +#ifdef __MINGW32__ +__attribute__((format(gnu_printf, 1, 2))) +#else +__attribute__((format(printf, 1, 2))) +#endif +#endif +static std::string format(const char * fmt, ...) { + va_list ap, ap2; + va_start(ap, fmt); + va_copy(ap2, ap); + int size = vsnprintf(NULL, 0, fmt, ap); + GGML_ASSERT(size >= 0 && size < INT_MAX); + std::vector buf(size + 1); + int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); + GGML_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + return std::string(buf.data(), size); +} + +struct llama_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + llama_file(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + size = 0; + } else { + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + } + + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + GGML_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + GGML_ASSERT(ret == 0); // same + } + + void read_raw(void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + std::size_t ret = std::fread(ptr, size, 1, fp); + if (ferror(fp)) { + throw std::runtime_error(format("read error: %s", strerror(errno))); + } + if (ret != 1) { + throw std::runtime_error(std::string("unexpectedly reached end of file")); + } + } + + std::uint32_t read_u32() { + std::uint32_t ret; + read_raw(&ret, sizeof(ret)); + return ret; + } + + std::string read_string(std::uint32_t len) { + std::vector chars(len); + read_raw(chars.data(), len); + return std::string(chars.data(), len); + } + + void write_raw(const void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + size_t ret = std::fwrite(ptr, size, 1, fp); + if (ret != 1) { + throw std::runtime_error(format("write error: %s", strerror(errno))); + } + } + + void write_u32(std::uint32_t val) { + write_raw(&val, sizeof(val)); + } + + ~llama_file() { + if (fp) { + std::fclose(fp); + } + } +}; + +int tokenize_file(struct llama_context * lctx, const char * filename, std::vector& out) { + struct llama_file f(filename, "rb"); + + std::vector buf; + buf.resize(f.size+1); + + f.read_raw(buf.data(), f.size); + buf[f.size] = '\0'; + + out.resize(buf.size()); + + int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false); + if (n_tokens >= 0) { + out.resize(n_tokens); + } + + bool verify = false; + if (verify) { + const char * in = buf.data(); + const char * end = buf.data() + buf.size(); + for (int i = 0; i < (int) out.size(); ++i) { + const char * s = llama_token_to_str(lctx, out[i]); + int len = strlen(s); + if (in >= end) { + printf("%s: unexpected end of original text.\n", __func__); + break; + } + const bool matches = (strncmp(in, s, len) == 0); + if (matches) { + in += len; + } else { + printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s); + } + } + } + + return n_tokens; +} + +void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { + if (tensor == NULL) { + file->write_u32(0); + file->write_u32(0); + file->write_u32(GGML_TYPE_F32); + file->seek((0-file->tell()) & 31, SEEK_CUR); + return; + } + const char * name = ggml_get_name(tensor); + uint32_t 
name_len = strlen(name);
+    uint32_t nd = tensor->n_dims;
+    uint32_t ne[4] = { (uint32_t)tensor->ne[0],
+                       (uint32_t)tensor->ne[1],
+                       (uint32_t)tensor->ne[2],
+                       (uint32_t)tensor->ne[3] };
+    file->write_u32(nd);
+    file->write_u32(name_len);
+    file->write_u32(tensor->type);
+    file->write_raw(ne, sizeof(ne[0]) * nd);
+    file->write_raw(name, name_len);
+    // advance to the next 32-byte boundary; (0 - tell()) & 31 is the padding needed
+    file->seek((0-file->tell()) & 31, SEEK_CUR);
+    file->write_raw(tensor->data, ggml_nbytes(tensor));
+}
+
+void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
+    int32_t nd = file->read_u32();
+    GGML_ASSERT(nd == tensor->n_dims);
+
+    uint32_t name_len = file->read_u32();
+    enum ggml_type type = (enum ggml_type) file->read_u32();
+    GGML_ASSERT(type == tensor->type);
+
+    uint32_t ne[4];
+    file->read_raw(ne, sizeof(ne[0]) * nd);
+    for (int i=0; i<nd; ++i) {
+        GGML_ASSERT(ne[i] == tensor->ne[i]);
+    }
+
+    std::string name = file->read_string(name_len);
+    GGML_ASSERT(strncmp(ggml_get_name(tensor), name.c_str(), sizeof(tensor->name)-1) == 0);
+
+    // skip the same 32-byte alignment padding that write_tensor emitted
+    file->seek((0-file->tell()) & 31, SEEK_CUR);
+    file->read_raw(tensor->data, ggml_nbytes(tensor));
+}
+
+void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * karpathy_weights){
+    int ct;
+    switch (gg_weights->n_dims){
+        case 1:
+            ct = 0;
+            for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){
+                float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0]);
+                *ptr = karpathy_weights[ct];
+                ct++;
+            }
+            break; // keep 1d copies from falling through into the 2d case
+        case 2:
+            ct = 0;
+            for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
+                for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
+                    float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1]);
+                    *ptr = karpathy_weights[ct];
+                    ct++;
+                }
+            }
+            break;
+        case 3:
+            ct = 0;
+            for (int i2 = 0; i2 < gg_weights->ne[2]; i2++) {
+                for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
+                    for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
+                        float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1] + i2*gg_weights->nb[2]);
+                        *ptr = karpathy_weights[ct];
+                        ct++;
+                    }
+                }
+            }
+            break;
+    }
+}
+
+void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) {
+    struct llama_file file(filename, "wb");
+    if (file.fp == NULL) {
+        return;
+    }
+    // write_magic
+    file.write_u32(LLAMA_FILE_MAGIC);   // magic
+    file.write_u32(LLAMA_FILE_VERSION); // version
+    // write_hparams
+    file.write_u32(model->hparams.n_vocab);
+    file.write_u32(model->hparams.n_embd);
+    file.write_u32(model->hparams.n_mult);
+    file.write_u32(model->hparams.n_head);
+    file.write_u32(model->hparams.n_layer);
+    file.write_u32(model->hparams.n_rot);
+    file.write_u32(LLAMA_FTYPE_ALL_F32);
+
+    // write_vocab - for now we just write the existing BPE vocab, assuming karpathy's vocabulary is the same (unverified).
+    uint32_t n_vocab = model->hparams.n_vocab;
+    for (uint32_t i = 0; i < n_vocab; i++) {
+        const auto & token_score = vocab->id_to_token.at(i);
+        file.write_u32((uint32_t) token_score.tok.size());
+        file.write_raw(token_score.tok.data(), token_score.tok.size());
+        file.write_raw(&token_score.score, sizeof(token_score.score));
+    }
+
+    // stuff AK weights into GG weights one by one.
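+    // layout note: in the flat llama2.c arrays, the per-layer stride is dim
+    // floats for the norm vectors, dim*dim for the attention matrices and
+    // dim*hidden_dim for the feed-forward matrices; row_length below is n_embd.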
+ // w->token_embedding_table -> model->tok_embeddings + // float* -> struct ggml_tensor + stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table); + // print_row(model->tok_embeddings, 0); + + stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight); + stuff_karpathy_weights_into_gg(model->freq_cis_real, w->freq_cis_real); + stuff_karpathy_weights_into_gg(model->freq_cis_imag, w->freq_cis_imag); + + // for rms-att-weight + int row_length = model->hparams.n_embd; + for (uint32_t i = 0; i < model->hparams.n_layer; ++i){ + auto & layer = model->layers[i]; + // 2d + stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]); + stuff_karpathy_weights_into_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]); + stuff_karpathy_weights_into_gg(layer.wq , &w->wq[i*row_length]); + stuff_karpathy_weights_into_gg(layer.wk , &w->wk[i*row_length]); + stuff_karpathy_weights_into_gg(layer.wv , &w->wv[i*row_length]); + stuff_karpathy_weights_into_gg(layer.wo , &w->wo[i*row_length]); + stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length]); + stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*row_length]); + stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length]); + } + + // write tensors + write_tensor(&file, model->tok_embeddings); + write_tensor(&file, model->norm); + write_tensor(&file, model->output); // ? + write_tensor(&file, model->freq_cis_real); + write_tensor(&file, model->freq_cis_imag); + for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { + auto & layer = model->layers[i]; + + write_tensor(&file, layer.attention_norm); + write_tensor(&file, layer.wq); + write_tensor(&file, layer.wk); + write_tensor(&file, layer.wv); + write_tensor(&file, layer.wo); + write_tensor(&file, layer.ffn_norm); + write_tensor(&file, layer.w1); + write_tensor(&file, layer.w2); + write_tensor(&file, layer.w3); + } +} + +struct train_params { + const char * fn_vocab_model; + const char * fn_llama2c_model; + const char * fn_llama2c_output_model; + const char * fn_train_data; + const char * fn_checkpoint_in; + const char * fn_checkpoint_out; + const char * fn_model_out; + + uint32_t seed; + + int n_ctx; + int n_embd; + int n_mult; + int n_head; + int n_layer; + int n_rotmax; + + int n_threads; + int n_batch; + int n_examples; + int n_predict; + + int print_info_interval; + int print_details_interval; + + bool samples_start_after_nl; + bool use_adam; + bool use_flash; + bool use_scratch; + + // only adam + int warmup; + int cos_decay_steps; + float cos_decay_restart; + float cos_decay_alpha; + + int lbfgs_n_iter; + int adam_n_iter; + float adam_alpha; + float adam_decay; + + int mem_model_gb; + int mem_compute_gb; + int mem_compute0_gb; + int mem_compute1_gb; +}; + +struct train_params get_default_train_params() { + struct train_params params; + params.fn_vocab_model = "ggml-vic7b-uncensored-q4_0.bin"; + params.fn_llama2c_output_model = "ak_llama_model.bin"; + params.fn_train_data = "shakespeare.txt"; + params.fn_checkpoint_in = "checkpoint.bin"; + params.fn_checkpoint_out = "checkpoint.bin"; + params.fn_model_out = "ggml-checkpoint-f32.bin"; + + params.seed = -1; + + params.n_ctx = 128; + params.n_embd = 256; + params.n_mult = 256; + params.n_head = 8; + params.n_layer = 16; + params.n_rotmax = 64; + + params.n_threads = 6; + params.n_batch = 8; + params.n_examples = 8; + params.n_predict = 1024; + + params.print_info_interval = 1; + params.print_details_interval = 2; + + params.samples_start_after_nl = false; + params.use_adam = true; + 
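+    // fn_llama2c_model gets no default here, so --llama2c-model is effectively a
+    // required argument; most of the remaining training defaults are unused by
+    // this converter, whose main() only reads the vocab, llama2c and output paths.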
+    params.use_flash              = true;
+    params.use_scratch            = true;
+
+    // only adam
+    params.warmup            = 100;
+    params.cos_decay_steps   = 1000;
+    params.cos_decay_restart = 1.1f;
+    params.cos_decay_alpha   = 0.0f;
+
+    params.lbfgs_n_iter = 16;
+    params.adam_n_iter  = 16;
+    params.adam_alpha   = 1e-3f;
+    params.adam_decay   = 1e-3f;
+
+    params.mem_model_gb    = 2;
+    params.mem_compute_gb  = 24;
+    params.mem_compute0_gb = 8;
+    params.mem_compute1_gb = 2;
+
+    return params;
+}
+
+void train_print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help                    show this help message and exit\n");
+    fprintf(stderr, "  --vocab-model FNAME           model path from which to load vocab (default '%s')\n", params->fn_vocab_model);
+    fprintf(stderr, "  --llama2c-model FNAME         model path from which to load Karpathy's llama2.c model\n");
+    fprintf(stderr, "  --llama2c-output-model FNAME  model path to save the converted llama2.c model (default '%s')\n", params->fn_llama2c_output_model);
+    fprintf(stderr, "\n");
+}
+
+bool train_params_parse(int argc, char ** argv, struct train_params * params) {
+    bool invalid_param = false;
+    std::string arg;
+    struct train_params default_params = get_default_train_params();
+    const std::string arg_prefix = "--";
+
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+            std::replace(arg.begin(), arg.end(), '_', '-');
+        }
+
+        if (arg == "--vocab-model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params->fn_vocab_model = argv[i];
+        } else if (arg == "--llama2c-model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params->fn_llama2c_model = argv[i];
+        } else if (arg == "--llama2c-output-model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params->fn_llama2c_output_model = argv[i];
+        } else if (arg == "-h" || arg == "--help") {
+            train_print_usage(argc, argv, &default_params);
+            exit(0);
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            train_print_usage(argc, argv, &default_params);
+            exit(1);
+        }
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        train_print_usage(argc, argv, &default_params);
+        exit(1);
+    }
+
+    return true;
+}
+
+int main(int argc, char ** argv) {
+    struct train_params params = get_default_train_params();
+    if (!train_params_parse(argc, argv, &params)) {
+        return 1;
+    }
+    Config config;
+    TransformerWeights weights;
+    {
+        FILE *file = fopen(params.fn_llama2c_model, "rb");
+        if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
+        // read in the config header
+        if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
+        // read in the Transformer weights
+        malloc_weights(&weights, &config);
+        if(checkpoint_init_weights(&weights, &config, file)) { return 1; }
+        fclose(file);
+    }
+
+    struct llama_context_params llama_params = llama_context_default_params();
+    llama_params.vocab_only = true;
+
+    struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, llama_params);
+    struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
+
+    struct llama_vocab vocab;
+    {
+        std::vector<const char *> strings;
+        std::vector<float> scores;
+        int n_vocab = llama_n_vocab(lctx);
+        strings.resize(n_vocab, NULL);
+        scores.resize(n_vocab, 0);
+        n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
+        GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
+        vocab.id_to_token.resize(n_vocab);
+        for (int i=0; i<n_vocab; ++i) {
+            vocab.id_to_token[i].tok = strings[i];
+            vocab.id_to_token[i].score = scores[i];
+            vocab.token_to_id.emplace(vocab.id_to_token[i].tok, i);
+        }
+    }

From: Aniket
Date: Fri, 28 Jul 2023 16:08:51 -0400
Subject: [PATCH 08/30] updating makefile to compile finalized version

---
 Makefile | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 9f2923ae26464..6fad72ab7f87f 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c my-tests simple server embd-input-test
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c my-tests convert-llama2c-to-ggml simple server embd-input-test

 # Binaries only useful for tests
 TEST_TARGETS = tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
@@ -330,7 +330,7 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

 clean:
-	rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c my-tests embd-input-test build-info.h $(TEST_TARGETS)
+	rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c my-tests convert-llama2c-to-ggml embd-input-test build-info.h $(TEST_TARGETS)

 #
 # Examples
@@ -379,6 +379,9 @@ convert-llama2c: examples/convert-llama2c/convert-lamma-2c.cpp build-info.h g
 my-tests: examples/my-tests/my-tests.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

+convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp build-info.h ggml.o llama.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 build-info.h: $(wildcard .git/index) scripts/build-info.sh
 	@sh scripts/build-info.sh > $@.tmp

From 817cc20f4c90e2dd6628b4cd39e26acb0d27125b Mon Sep 17 00:00:00 2001
From: Aniket
Date: Fri, 28 Jul 2023 16:09:33 -0400
Subject: [PATCH 09/30] updating gitignore to ignore additional binaries

---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index aeca1dca75af8..ed7fd2cac45ad 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 *.o
 *.a
 *.so
+*.bin
 .DS_Store
 .build/
 .cache/
@@ -40,6 +41,8 @@ models-mnt
 /embedding
 /train-text-from-scratch
 /convert-llama2c
+/my-tests
+/convert-llama2c-to-ggml
 /simple
 /benchmark-matmult
 /vdot
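An aside before the next patch, on a detail of the tensor serialization helpers added in the first patch: both `write_tensor()` and `read_tensor()` call `file->seek((0-file->tell()) & 31, SEEK_CUR)` before touching tensor data, which advances the file offset to the next 32-byte boundary. Because `size_t` is unsigned, `0 - tell` wraps around, so the AND with 31 yields exactly `(-tell) mod 32`. A minimal, standalone sketch of the arithmetic (not part of the patch itself):

```cpp
// Demonstrates the (0 - tell) & 31 padding trick: the number of bytes needed
// to round a file offset up to the next multiple of 32, 0 if already aligned.
#include <cassert>
#include <cstddef>

static size_t pad_to_32(size_t tell) {
    return (0 - tell) & 31; // unsigned wraparound: (-tell) mod 32
}

int main() {
    assert(pad_to_32(0)  == 0);  // already aligned
    assert(pad_to_32(1)  == 31); // 1 + 31 = 32
    assert(pad_to_32(32) == 0);
    assert(pad_to_32(45) == 19); // 45 + 19 = 64
    return 0;
}
```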
From 5a87675db47129f9a4e0823f52f3eea17fe07477 Mon Sep 17 00:00:00 2001
From: Aniket
Date: Fri, 28 Jul 2023 16:17:44 -0400
Subject: [PATCH 10/30] output vector is not part of llama2.c model file

---
 .../convert-llama2c-to-ggml.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
index c81c773177631..28f74066e5197 100644
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -213,7 +213,7 @@ struct my_llama_model {
     struct ggml_tensor * tok_embeddings;

     struct ggml_tensor * norm;
-    struct ggml_tensor * output;
+    // struct ggml_tensor * output;

     struct ggml_tensor * freq_cis_real;
     struct ggml_tensor * freq_cis_imag;
@@ -262,8 +262,8 @@ void init_model(struct my_llama_model * model) {
     model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
     printf("[%s:GG] Allocating [%d] float space for model->norm\n",__func__,n_embd);

-    model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
+    // model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
+    // printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);

     model->freq_cis_real = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd/2);
     printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->freq_cis_real\n",__func__,n_embd, n_embd / 2, n_embd * n_embd / 2);
@@ -286,7 +286,7 @@ void init_model(struct my_llama_model * model) {
     ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
     ggml_set_name(model->norm, "norm.weight");
-    ggml_set_name(model->output, "output.weight");
+    // ggml_set_name(model->output, "output.weight");
     ggml_set_name(model->freq_cis_real, "output.freq_cis_real");
     ggml_set_name(model->freq_cis_imag, "output.freq_cis_imag");
@@ -681,7 +681,7 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
     // write tensors
     write_tensor(&file, model->tok_embeddings);
     write_tensor(&file, model->norm);
-    write_tensor(&file, model->output); // ?
+    // write_tensor(&file, model->output); // ?
     write_tensor(&file, model->freq_cis_real);
     write_tensor(&file, model->freq_cis_imag);
     for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
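The rationale behind this patch: in the llama2.c checkpoints this converter targets, there is no separate classifier ("output") matrix stored in the file, because the token embedding table is shared with the output head. That is why the series first drops `model->output` here, and later (in PATCH 13) rebuilds it by stuffing `w->token_embedding_table` into it. A toy illustration of tied weights, with made-up sizes rather than real hyperparameters:

```cpp
// With tied weights, the same table E (vocab_size x dim) is used for both
// the embedding lookup and the classifier: logits[v] = dot(E[v], h).
#include <cstdio>

int main() {
    const int vocab = 3, dim = 2;                   // toy sizes
    float E[vocab][dim] = {{1, 0}, {0, 1}, {1, 1}}; // shared embedding table
    float h[dim] = {0.5f, 2.0f};                    // final hidden state
    for (int v = 0; v < vocab; ++v) {
        float logit = 0.0f;
        for (int d = 0; d < dim; ++d) logit += E[v][d] * h[d];
        printf("logit[%d] = %f\n", v, logit);
    }
    return 0;
}
```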
From aebccdbf00843277b6017178710d78b9973370fc Mon Sep 17 00:00:00 2001
From: Aniket
Date: Mon, 31 Jul 2023 09:33:57 -0400
Subject: [PATCH 11/30] fixing bug that didn't unroll the 1d karpathy arrays

---
 .../convert-llama2c-to-ggml.cpp | 120 ++++++++++--------
 1 file changed, 70 insertions(+), 50 deletions(-)

diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
index 28f74066e5197..a2a12de658be0 100644
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -45,8 +45,8 @@ typedef struct {
     // final rmsnorm
     float* rms_final_weight; // (dim,)
     // freq_cis for RoPE relatively positional embeddings
-    float* freq_cis_real; // (seq_len, dim/2)
-    float* freq_cis_imag; // (seq_len, dim/2)
+    // float* freq_cis_real; // (seq_len, dim/2)
+    // float* freq_cis_imag; // (seq_len, dim/2)
     // (optional) classifier weights for the logits, on the last layer
     float* wcls;
 } TransformerWeights;
@@ -63,9 +63,9 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
     if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
     if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
     if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;
-    int head_size = p->dim / p->n_heads;
-    if (fread(w->freq_cis_real, sizeof(float), p->seq_len * head_size / 2, f) != static_cast<size_t>(p->seq_len * head_size / 2)) return 1;
-    if (fread(w->freq_cis_imag, sizeof(float), p->seq_len * head_size / 2, f) != static_cast<size_t>(p->seq_len * head_size / 2)) return 1;
+    //int head_size = p->dim / p->n_heads;
+    // if (fread(w->freq_cis_real, sizeof(float), p->seq_len * head_size / 2, f) != static_cast<size_t>(p->seq_len * head_size / 2)) return 1;
+    // if (fread(w->freq_cis_imag, sizeof(float), p->seq_len * head_size / 2, f) != static_cast<size_t>(p->seq_len * head_size / 2)) return 1;
     return 0;
 }
@@ -96,7 +96,7 @@ void malloc_weights(TransformerWeights* w, Config* p) {
     printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);

     w->w2 = new float[p->n_layers * p->hidden_dim * p->dim](); //calloc(p->n_layers * p->dim * p->hidden_dim, sizeof(float));
-    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);

     w->w3 = new float[p->n_layers * p->hidden_dim * p->dim](); //calloc(p->n_layers * p->hidden_dim * p->dim, sizeof(float));
     printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
@@ -104,11 +104,11 @@ void malloc_weights(TransformerWeights* w, Config* p) {
     w->rms_final_weight = new float[p->dim](); //calloc(p->dim, sizeof(float));
     printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);

-    w->freq_cis_real = new float[p->seq_len * p->dim / 2](); //calloc(p->seq_len * p->dim / 2, sizeof(float));
-    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->freq_cis_real\n",__func__,p->seq_len, p->dim / 2, p->seq_len * p->dim / 2);
+    // w->freq_cis_real = new float[p->seq_len * p->dim / 2](); //calloc(p->seq_len * p->dim / 2, sizeof(float));
+    // printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->freq_cis_real\n",__func__,p->seq_len, p->dim / 2, p->seq_len * p->dim / 2);

-    w->freq_cis_imag = new float[p->seq_len * p->dim / 2](); //calloc(p->seq_len * p->dim / 2, sizeof(float));
-    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->freq_cis_imag\n\n",__func__,p->seq_len, p->dim / 2, p->seq_len * p->dim / 2);
+    // w->freq_cis_imag = new float[p->seq_len * p->dim / 2](); //calloc(p->seq_len * p->dim / 2, sizeof(float));
+    // printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->freq_cis_imag\n\n",__func__,p->seq_len, p->dim / 2, p->seq_len * p->dim / 2);

     // ensure all mallocs went fine
     // if (!w->token_embedding_table || !w->rms_att_weight || !w->rms_ffn_weight
@@ -131,8 +131,8 @@ void free_weights(TransformerWeights* w) {
     free(w->w2);
     free(w->w3);
     free(w->rms_final_weight);
-    free(w->freq_cis_real);
-    free(w->freq_cis_imag);
+    // free(w->freq_cis_real);
+    // free(w->freq_cis_imag);
 }
@@ -149,8 +149,8 @@ void print_sample_weights(TransformerWeights *w){
     printf("%f\n", w->w2[0]);
     printf("%f\n", w->w3[0]);
     printf("%f\n", w->rms_att_weight[0]);
-    printf("%f\n", w->freq_cis_real[0]);
-    printf("%f\n", w->freq_cis_imag[0]);
+    // printf("%f\n", w->freq_cis_real[0]);
+    // printf("%f\n", w->freq_cis_imag[0]);
     printf("------------------------------------------------------------------\n");
@@ -213,10 +213,10 @@ struct my_llama_model {
     struct ggml_tensor * tok_embeddings;

     struct ggml_tensor * norm;
-    // struct ggml_tensor * output;
+    struct ggml_tensor * output;

-    struct ggml_tensor * freq_cis_real;
-    struct ggml_tensor * freq_cis_imag;
+    // struct ggml_tensor * freq_cis_real;
+    // struct ggml_tensor * freq_cis_imag;
     std::vector<my_llama_layer> layers;
@@ -262,33 +262,33 @@ void init_model(struct my_llama_model * model) {
     model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
     printf("[%s:GG] Allocating [%d] float space for model->norm\n",__func__,n_embd);

-    // model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
-    // printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
+    model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);

-    model->freq_cis_real = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd/2);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->freq_cis_real\n",__func__,n_embd, n_embd / 2, n_embd * n_embd / 2);
+    // model->freq_cis_real = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd/2);
+    // printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->freq_cis_real\n",__func__,n_embd, n_embd / 2, n_embd * n_embd / 2);

-    model->freq_cis_imag = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd/2);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->freq_cis_imag\n",__func__,n_embd, n_embd / 2, n_embd * n_embd / 2);
+    // model->freq_cis_imag = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd/2);
+    // printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->freq_cis_imag\n",__func__,n_embd, n_embd / 2, n_embd * n_embd / 2);

     // printing the per-layer allocations here so we dont print in the for loop.
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wq for [%d] layers\n",__func__,n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wk for [%d] layers\n",__func__,n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wv for [%d] layers\n",__func__,n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wo for [%d] layers\n",__func__,n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wq for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wk for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wv for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wo for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
     printf("[%s:GG] Allocating [%d] float space for layer.ffn_norm for [%d] layers\n",__func__,n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w1 for [%d] layers\n",__func__,n_embd, n_ff, n_embd * n_ff, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w2 for [%d] layers\n",__func__,n_ff, n_embd, n_ff * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w3 for [%d] layers\n",__func__,n_embd, n_ff, n_embd * n_ff, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w1 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w2 for [%d] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w3 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);

     ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
     ggml_set_name(model->norm, "norm.weight");
-    // ggml_set_name(model->output, "output.weight");
+    ggml_set_name(model->output, "output.weight");
-    ggml_set_name(model->freq_cis_real, "output.freq_cis_real");
-    ggml_set_name(model->freq_cis_imag, "output.freq_cis_imag");
+    // ggml_set_name(model->freq_cis_real, "output.freq_cis_real");
+    // ggml_set_name(model->freq_cis_imag, "output.freq_cis_imag");

     model->layers.resize(n_layer);
     for (uint32_t i = 0; i < n_layer; ++i) {
@@ -305,10 +305,14 @@ void init_model(struct my_llama_model * model) {
         layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

-        layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
-        layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
-        layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
+        layer.w1   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
+        layer.w2   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
+        layer.w3   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
+        // layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
+        // layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
+        // layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
+
         ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str());

         ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str());
@@ -352,7 +356,7 @@ int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
 void print_row(struct ggml_tensor * probs, int i) {
     for (int k = 0; k < probs->ne[0]; ++k) {
         float p = get_f32_2d(probs, k, i);
-        printf(" %.2f", p);
+        printf(" %f", p);
     }
     printf("\n");
 }
@@ -656,34 +660,50 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
     // w->token_embedding_table -> model->tok_embeddings
     // float*                   -> struct ggml_tensor
     stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
+    stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);
     // print_row(model->tok_embeddings, 0);

     stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
-    stuff_karpathy_weights_into_gg(model->freq_cis_real, w->freq_cis_real);
-    stuff_karpathy_weights_into_gg(model->freq_cis_imag, w->freq_cis_imag);
+    print_row(model->norm, 0);
+    //stuff_karpathy_weights_into_gg(model->freq_cis_real, w->freq_cis_real);
+    //stuff_karpathy_weights_into_gg(model->freq_cis_imag, w->freq_cis_imag);

     // for rms-att-weight
     int row_length = model->hparams.n_embd;
+    const auto & hparams = model->hparams;
+    int n_ff = get_n_ff(&hparams);
+    //int n_ff = model->hparams.n_embd;
+    //const auto & hparams = model->hparams;
+    //int row_length = get_n_ff(&hparams);
+
     for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
         auto & layer = model->layers[i];
-        // 2d
+        // 1d
         stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
         stuff_karpathy_weights_into_gg(layer.ffn_norm      , &w->rms_ffn_weight[i*row_length]);
-        stuff_karpathy_weights_into_gg(layer.wq            , &w->wq[i*row_length]);
-        stuff_karpathy_weights_into_gg(layer.wk            , &w->wk[i*row_length]);
-        stuff_karpathy_weights_into_gg(layer.wv            , &w->wv[i*row_length]);
-        stuff_karpathy_weights_into_gg(layer.wo            , &w->wo[i*row_length]);
-        stuff_karpathy_weights_into_gg(layer.w1            , &w->w1[i*row_length]);
-        stuff_karpathy_weights_into_gg(layer.w2            , &w->w2[i*row_length]);
-        stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[i*row_length]);
+
+        // from 3d matrix layer x dim x dim to 2d matrix dim x dim
+        stuff_karpathy_weights_into_gg(layer.wq            , &w->wq[i*row_length*row_length]);
+        stuff_karpathy_weights_into_gg(layer.wk            , &w->wk[i*row_length*row_length]);
+        stuff_karpathy_weights_into_gg(layer.wv            , &w->wv[i*row_length*row_length]);
+        stuff_karpathy_weights_into_gg(layer.wo            , &w->wo[i*row_length*row_length]);
+
+        //stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length]);
+        stuff_karpathy_weights_into_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
+
+        stuff_karpathy_weights_into_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
+        //stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff]);
+
+        //stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length]);
+        stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[i*n_ff*row_length]);
     }

     // write tensors
     write_tensor(&file, model->tok_embeddings);
     write_tensor(&file, model->norm);
-    // write_tensor(&file, model->output); // ?
+    write_tensor(&file, model->output); // ?
-    write_tensor(&file, model->freq_cis_real);
-    write_tensor(&file, model->freq_cis_imag);
+    // write_tensor(&file, model->freq_cis_real);
+    // write_tensor(&file, model->freq_cis_imag);
     for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
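The substance of this patch is the offset arithmetic: llama2.c stores each weight class as one flat array over all layers, e.g. `wq` as `(n_layers, dim, dim)`, so layer `i`'s matrix begins at `i*dim*dim` floats, not at `i*dim` as the earlier code assumed; likewise `w1`/`w2`/`w3` begin at `i*dim*n_ff`. A standalone sketch with toy sizes (hypothetical values, not real hyperparameters):

```cpp
// Per-layer offsets into llama2.c's flat weight arrays: a (n_layers, dim, dim)
// array holds layer i's dim x dim matrix starting at i*dim*dim floats, and a
// (n_layers, n_ff, dim) array holds its slice starting at i*dim*n_ff floats.
#include <cstdio>

int main() {
    const int n_layers = 4, dim = 8, n_ff = 32; // toy sizes
    for (int i = 0; i < n_layers; ++i) {
        printf("layer %d: wq offset = %d floats, w1 offset = %d floats\n",
               i, i*dim*dim, i*dim*n_ff);
    }
    return 0;
}
```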
From f1c03f4b16ab5405f09339524467226001723af9 Mon Sep 17 00:00:00 2001
From: Aniket
Date: Mon, 31 Jul 2023 13:20:32 -0400
Subject: [PATCH 12/30] more bug fixing

---
 .../convert-llama2c-to-ggml.cpp | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
index a2a12de658be0..ab6d177fa3c7b 100644
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -604,7 +604,9 @@ void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * kar
             for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){
                 float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0]);
                 *ptr = karpathy_weights[ct];
+                ct++;
             }
+            break;
         case 2:
             ct = 0;
             for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
@@ -661,10 +663,9 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
     // float* -> struct ggml_tensor
     stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
     stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);
-    // print_row(model->tok_embeddings, 0);

     stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
-    print_row(model->norm, 0);
+    //print_row(model->norm, 0);
     //stuff_karpathy_weights_into_gg(model->freq_cis_real, w->freq_cis_real);
     //stuff_karpathy_weights_into_gg(model->freq_cis_imag, w->freq_cis_imag);
@@ -678,8 +679,18 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
     for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
         auto & layer = model->layers[i];
-        // 1d
+        // 1d
+        //if (i == 0){
+        //    printf("%f %f\n", w->rms_att_weight[0], w->rms_att_weight[1]);
+        //}
+        //printf("layer.attention_norm->n_dims = %d\n", layer.attention_norm->n_dims);
         stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
+        //if (i == 0){
+        //    print_row(layer.attention_norm, 0);
+        //    printf("%f\n", layer.attention_norm[0]);
+        //    }
+        //printf("AFTER---\n");
+        //print_row(layer.attention_norm, 0);
         stuff_karpathy_weights_into_gg(layer.ffn_norm      , &w->rms_ffn_weight[i*row_length]);

         // from 3d matrix layer x dim x dim to 2d matrix dim x dim
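This patch fixes two defects in the 1-D path of `stuff_karpathy_weights_into_gg()`: the missing `ct++` made every element receive `karpathy_weights[0]`, and the missing `break` let execution fall through into the 2-D case, which then re-filled the tensor. A minimal, hypothetical demonstration of the fall-through behaviour (not the converter code itself):

```cpp
// Without a break at the end of case 1, control falls straight into case 2.
#include <cstdio>

int main() {
    int n_dims = 1;
    switch (n_dims) {
        case 1:
            printf("filling a 1-D tensor\n");
            // no break here: execution continues into case 2
        case 2:
            printf("...and then, unintentionally, the 2-D path runs too\n");
            break;
    }
    return 0;
}
```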
From df659f6befc38c9a165ef265b25a97ce43ec9eb6 Mon Sep 17 00:00:00 2001
From: Aniket
Date: Wed, 2 Aug 2023 09:16:00 -0400
Subject: [PATCH 13/30] cleaning up the code a little by removing extra printfs
 that were only needed during debugging

---
 .../convert-llama2c-to-ggml.cpp | 365 +++++-------------
 1 file changed, 96 insertions(+), 269 deletions(-)

diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
index ab6d177fa3c7b..5a208a4f27ad6 100644
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -48,91 +48,72 @@ typedef struct {
     // float* freq_cis_real; // (seq_len, dim/2)
     // float* freq_cis_imag; // (seq_len, dim/2)
     // (optional) classifier weights for the logits, on the last layer
-    float* wcls;
+    //float* wcls;
 } TransformerWeights;

-int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
-    if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
-    if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
-    if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
-    if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
-    if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
-    if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
-    if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
-    if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
-    if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
-    if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
-    if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;
-    //int head_size = p->dim / p->n_heads;
-    // if (fread(w->freq_cis_real, sizeof(float), p->seq_len * head_size / 2, f) != static_cast<size_t>(p->seq_len * head_size / 2)) return 1;
-    // if (fread(w->freq_cis_imag, sizeof(float), p->seq_len * head_size / 2, f) != static_cast<size_t>(p->seq_len * head_size / 2)) return 1;
-    return 0;
-}
-
 void malloc_weights(TransformerWeights* w, Config* p) {
     // we calloc instead of malloc to keep valgrind happy
-    w->token_embedding_table = new float[p->vocab_size * p->dim]();//calloc(p->vocab_size * p->dim, sizeof(float));
+    w->token_embedding_table = new float[p->vocab_size * p->dim]();
     printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);

-    w->rms_att_weight = new float[p->n_layers * p->dim](); //calloc(p->n_layers * p->dim, sizeof(float));
+    w->rms_att_weight = new float[p->n_layers * p->dim]();
     printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);

-    w->rms_ffn_weight = new float[p->n_layers * p->dim](); //calloc(p->n_layers * p->dim, sizeof(float));
+    w->rms_ffn_weight = new float[p->n_layers * p->dim]();
     printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);

-    w->wq = new float[p->n_layers * p->dim * p->dim](); //calloc(p->n_layers * p->dim * p->dim, sizeof(float));
+    w->wq = new float[p->n_layers * p->dim * p->dim]();
     printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

-    w->wk = new float[p->n_layers * p->dim * p->dim](); //calloc(p->n_layers * p->dim * p->dim, sizeof(float));
+    w->wk = new float[p->n_layers * p->dim * p->dim]();
     printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

-    w->wv = new float[p->n_layers * p->dim * p->dim](); //calloc(p->n_layers * p->dim * p->dim, sizeof(float));
+    w->wv = new float[p->n_layers * p->dim * p->dim]();
     printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

-    w->wo = new float[p->n_layers * p->dim * p->dim](); //calloc(p->n_layers * p->dim * p->dim, sizeof(float));
+    w->wo = new float[p->n_layers * p->dim * p->dim]();
     printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

-    w->w1 = new float[p->n_layers * p->hidden_dim * p->dim](); //calloc(p->n_layers * p->hidden_dim * p->dim, sizeof(float));
+    w->w1 = new float[p->n_layers * p->hidden_dim * p->dim]();
     printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);

-    w->w2 = new float[p->n_layers * p->hidden_dim * p->dim](); //calloc(p->n_layers * p->dim * p->hidden_dim, sizeof(float));
+    w->w2 = new float[p->n_layers * p->hidden_dim * p->dim]();
     printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);

-    w->w3 = new float[p->n_layers * p->hidden_dim * p->dim](); //calloc(p->n_layers * p->hidden_dim * p->dim, sizeof(float));
+    w->w3 = new float[p->n_layers * p->hidden_dim * p->dim]();
     printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);

-    w->rms_final_weight = new float[p->dim](); //calloc(p->dim, sizeof(float));
+    w->rms_final_weight = new float[p->dim]();
     printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
+}

-    // w->freq_cis_real = new float[p->seq_len * p->dim / 2](); //calloc(p->seq_len * p->dim / 2, sizeof(float));
-    // printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->freq_cis_real\n",__func__,p->seq_len, p->dim / 2, p->seq_len * p->dim / 2);
-
-    // w->freq_cis_imag = new float[p->seq_len * p->dim / 2](); //calloc(p->seq_len * p->dim / 2, sizeof(float));
-    // printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->freq_cis_imag\n\n",__func__,p->seq_len, p->dim / 2, p->seq_len * p->dim / 2);
-
-    // ensure all mallocs went fine
-    // if (!w->token_embedding_table || !w->rms_att_weight || !w->rms_ffn_weight
-    //     || !w->wq || !w->wk || !w->wv || !w->wo || !w->w1 || !w->w2 || !w->w3 ||
-    //     !w->rms_final_weight || !w->freq_cis_real || !w->freq_cis_imag) {
-    //     printf("malloc failed!\n");
-    //     exit(1);
-    // }
+int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
+    if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
+    if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
+    if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
+    if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
+    if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
+    if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
+    if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
+    if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
+    if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
+    if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
+    if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;
+    return 0;
 }

 void free_weights(TransformerWeights* w) {
-    free(w->token_embedding_table);
-    free(w->rms_att_weight);
-    free(w->rms_ffn_weight);
-    free(w->wq);
-    free(w->wk);
-    free(w->wv);
-    free(w->wo);
-    free(w->w1);
-    free(w->w2);
-    free(w->w3);
-    free(w->rms_final_weight);
-    // free(w->freq_cis_real);
-    // free(w->freq_cis_imag);
+    delete w->token_embedding_table;
+    delete w->rms_att_weight;
+    delete w->rms_ffn_weight;
+    delete w->wq;
+    delete w->wk;
+    delete w->wv;
+    delete w->wo;
+    delete w->w1;
+    delete w->w2;
+    delete w->w3;
+    delete w->rms_final_weight;
 }

 void print_sample_weights(TransformerWeights *w){
@@ -149,11 +130,6 @@ void print_sample_weights(TransformerWeights *w){
     printf("%f\n", w->w2[0]);
     printf("%f\n", w->w3[0]);
     printf("%f\n", w->rms_att_weight[0]);
-    printf("%f\n", w->freq_cis_real[0]);
-    printf("%f\n", w->freq_cis_imag[0]);
-    printf("------------------------------------------------------------------\n");
-
-
 }

 ////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -180,7 +156,6 @@ struct my_llama_hparams {
     uint32_t n_head = 32;
     uint32_t n_layer = 32;
     uint32_t n_rot = 64;
-
     bool operator!=(const my_llama_hparams& other) const {
         return memcmp(this, &other, sizeof(my_llama_hparams));
     }
@@ -215,9 +190,6 @@ struct my_llama_model {
     struct ggml_tensor * norm;
     struct ggml_tensor * output;

-    // struct ggml_tensor * freq_cis_real;
-    // struct ggml_tensor * freq_cis_imag;
-
     std::vector<my_llama_layer> layers;
@@ -225,6 +197,54 @@ struct my_llama_model {
     uint32_t train_its = 0;
     uint32_t train_samples = 0;
     uint32_t train_tokens = 0;
 };

+struct train_params {
+    const char * fn_vocab_model;
+    const char * fn_llama2c_model;
+    const char * fn_llama2c_output_model;
+    const char * fn_train_data;
+    const char * fn_checkpoint_in;
+    const char * fn_checkpoint_out;
+    const char * fn_model_out;
+
+    uint32_t seed;
+
+    int n_ctx;
+    int n_embd;
+    int n_mult;
+    int n_head;
+    int n_layer;
+    int n_rotmax;
+
+    int n_threads;
+    int n_batch;
+    int n_examples;
+    int n_predict;
+
+    int print_info_interval;
+    int print_details_interval;
+
+    bool samples_start_after_nl;
+    bool use_adam;
+    bool use_flash;
+    bool use_scratch;
+
+    // only adam
+    int warmup;
+    int cos_decay_steps;
+    float cos_decay_restart;
+    float cos_decay_alpha;
+
+    int lbfgs_n_iter;
+    int adam_n_iter;
+    float adam_alpha;
+    float adam_decay;
+
+    int mem_model_gb;
+    int mem_compute_gb;
+    int mem_compute0_gb;
+    int mem_compute1_gb;
+};
+
 uint32_t get_n_ff(const struct my_llama_hparams* hparams) {
     const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
     return n_ff;
@@ -249,7 +269,6 @@ void init_model(struct my_llama_model * model) {
     const uint32_t n_vocab = hparams.n_vocab;

     const uint32_t n_ff = get_n_ff(&hparams);
-
     struct ggml_context * ctx = model->ctx;

     model->train_its = 0;
@@ -265,12 +284,6 @@ void init_model(struct my_llama_model * model) {
     model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
     printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);

-    // model->freq_cis_real = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd/2);
-    // printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->freq_cis_real\n",__func__,n_embd, n_embd / 2, n_embd * n_embd / 2);
-
-    // model->freq_cis_imag = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd/2);
-    // printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->freq_cis_imag\n",__func__,n_embd, n_embd / 2, n_embd * n_embd / 2);
-
     // printing the per-layer allocations here so we dont print in the for loop.
     printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wq for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
     printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wk for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
@@ -287,8 +300,6 @@ void init_model(struct my_llama_model * model) {
     ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
     ggml_set_name(model->norm, "norm.weight");
     ggml_set_name(model->output, "output.weight");
-    // ggml_set_name(model->freq_cis_real, "output.freq_cis_real");
-    // ggml_set_name(model->freq_cis_imag, "output.freq_cis_imag");

     model->layers.resize(n_layer);
     for (uint32_t i = 0; i < n_layer; ++i) {
@@ -309,10 +320,6 @@ void init_model(struct my_llama_model * model) {
         layer.w2   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
         layer.w3   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);

-        // layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
-        // layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
-        // layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
-
         ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str());

         ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str());
@@ -328,21 +335,6 @@ void init_model(struct my_llama_model * model) {
     }
 }

-void set_f32_3d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int64_t i2, float value) {
-    float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]);
-    *ptr = value;
-}
-
-void set_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, float value) {
-    float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
-    *ptr = value;
-}
-
-void set_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int32_t value) {
-    int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
-    *ptr = value;
-}
-
 float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
     float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
     return *ptr;
@@ -372,41 +364,6 @@ void print_matrix(struct ggml_tensor * probs) {
     }
 }

-void print_token(struct llama_context * ctx, llama_token token) {
-    printf("%s", llama_token_to_str(ctx, token));
-}
-
-void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) {
-    for (int i=0; i<tokens->ne[0]; ++i) {
-        int token = ggml_get_i32_1d(tokens, i);
-        print_token(ctx, token);
-    }
-}
-
-void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens) {
-    for (int i1=0; i1<tokens->ne[1]; ++i1) {
-        //int num_newline = 0;
-        for (int i0=0; i0<tokens->ne[0]; ++i0) {
-            int token = get_i32_2d(tokens, i0, i1);
-            print_token(ctx, token);
-            // bool isnl = (token == llama_token_nl());
-            // if (isnl) {
-            //     ++num_newline;
-            // }
-            // if (isnl) {
-            //     if (num_newline < 2) {
-            //         print_token(ctx, token);
-            //     } else {
-            //         printf("\\n");
-            //     }
-            // } else {
-            //     print_token(ctx, token);
-            // }
-        }
-        printf("\n--\n");
-    }
-}
-
 #ifdef __GNUC__
 #ifdef __MINGW32__
 __attribute__((format(gnu_printf, 1, 2)))
@@ -511,45 +468,6 @@ struct llama_file {
     }
 };

-int tokenize_file(struct llama_context * lctx, const char * filename, std::vector<llama_token>& out) {
-    struct llama_file f(filename, "rb");
-
-    std::vector<char> buf;
-    buf.resize(f.size+1);
-
-    f.read_raw(buf.data(), f.size);
-    buf[f.size] = '\0';
-
-    out.resize(buf.size());
-
-    int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false);
-    if (n_tokens >= 0) {
-        out.resize(n_tokens);
-    }
-
-    bool verify = false;
-    if (verify) {
-        const char * in = buf.data();
-        const char * end = buf.data() + buf.size();
-        for (int i = 0; i < (int) out.size(); ++i) {
-            const char * s = llama_token_to_str(lctx, out[i]);
-            int len = strlen(s);
-            if (in >= end) {
-                printf("%s: unexpected end of original text.\n", __func__);
-                break;
-            }
-            const bool matches = (strncmp(in, s, len) == 0);
-            if (matches) {
-                in += len;
-            } else {
-                printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s);
-            }
-        }
-    }
-
-    return n_tokens;
-}
-
 void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
     if (tensor == NULL) {
         file->write_u32(0);
@@ -574,29 +492,7 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
     file->write_raw(tensor->data, ggml_nbytes(tensor));
 }

-void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
-    int32_t nd = file->read_u32();
-    GGML_ASSERT(nd == tensor->n_dims);
-
-    uint32_t name_len = file->read_u32();
-    enum ggml_type type = (enum ggml_type) file->read_u32();
-    GGML_ASSERT(type == tensor->type);
-
-    uint32_t ne[4];
-    file->read_raw(ne, sizeof(ne[0]) * nd);
-    for (int i=0; i<nd; ++i) {
-        GGML_ASSERT(ne[i] == tensor->ne[i]);
-    }
-
-    std::string name = file->read_string(name_len);
-    GGML_ASSERT(strncmp(ggml_get_name(tensor), name.c_str(), sizeof(tensor->name)-1) == 0);
-
-    file->seek((0-file->tell()) & 31, SEEK_CUR);
-    file->read_raw(tensor->data, ggml_nbytes(tensor));
-}
-
 void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * karpathy_weights){
-
     int ct;
     switch (gg_weights->n_dims){
         case 1:
@@ -663,34 +559,20 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
     // float* -> struct ggml_tensor
     stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
     stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);
-
+
     stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
     //print_row(model->norm, 0);
-    //stuff_karpathy_weights_into_gg(model->freq_cis_real, w->freq_cis_real);
-    //stuff_karpathy_weights_into_gg(model->freq_cis_imag, w->freq_cis_imag);

     // for rms-att-weight
     int row_length = model->hparams.n_embd;
     const auto & hparams = model->hparams;
-    int n_ff = get_n_ff(&hparams);
     //int n_ff = model->hparams.n_embd;
-    //const auto & hparams = model->hparams;
-    //int row_length = get_n_ff(&hparams);
-
+    int n_ff = get_n_ff(&hparams);
+
     for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
         auto & layer = model->layers[i];
         // 1d
-        //if (i == 0){
-        //    printf("%f %f\n", w->rms_att_weight[0], w->rms_att_weight[1]);
-        //}
-        //printf("layer.attention_norm->n_dims = %d\n", layer.attention_norm->n_dims);
         stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
-        //if (i == 0){
-        //    print_row(layer.attention_norm, 0);
-        //    printf("%f\n", layer.attention_norm[0]);
-        //    }
-        //printf("AFTER---\n");
-        //print_row(layer.attention_norm, 0);
         stuff_karpathy_weights_into_gg(layer.ffn_norm      , &w->rms_ffn_weight[i*row_length]);

         // from 3d matrix layer x dim x dim to 2d matrix dim x dim
@@ -699,22 +581,16 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
         stuff_karpathy_weights_into_gg(layer.wv            , &w->wv[i*row_length*row_length]);
         stuff_karpathy_weights_into_gg(layer.wo            , &w->wo[i*row_length*row_length]);

-        //stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length]);
         stuff_karpathy_weights_into_gg(layer.w1            , &w->w1[i*row_length*n_ff]);

         stuff_karpathy_weights_into_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
-        //stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff]);

-        //stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length]);
-        stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[i*n_ff*row_length]);
+        stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[i*row_length*n_ff]);
     }
-
     // write tensors
     write_tensor(&file, model->tok_embeddings);
     write_tensor(&file, model->norm);
     write_tensor(&file, model->output); // ?
-    // write_tensor(&file, model->freq_cis_real);
-    // write_tensor(&file, model->freq_cis_imag);
     for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
         auto & layer = model->layers[i];
@@ -730,54 +606,6 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
     }
 }

-struct train_params {
-    const char * fn_vocab_model;
-    const char * fn_llama2c_model;
-    const char * fn_llama2c_output_model;
-    const char * fn_train_data;
-    const char * fn_checkpoint_in;
-    const char * fn_checkpoint_out;
-    const char * fn_model_out;
-
-    uint32_t seed;
-
-    int n_ctx;
-    int n_embd;
-    int n_mult;
-    int n_head;
-    int n_layer;
-    int n_rotmax;
-
-    int n_threads;
-    int n_batch;
-    int n_examples;
-    int n_predict;
-
-    int print_info_interval;
-    int print_details_interval;
-
-    bool samples_start_after_nl;
-    bool use_adam;
-    bool use_flash;
-    bool use_scratch;
-
-    // only adam
-    int warmup;
-    int cos_decay_steps;
-    float cos_decay_restart;
-    float cos_decay_alpha;
-
-    int lbfgs_n_iter;
-    int adam_n_iter;
-    float adam_alpha;
-    float adam_decay;
-
-    int mem_model_gb;
-    int mem_compute_gb;
-    int mem_compute0_gb;
-    int mem_compute1_gb;
-};
-
 struct train_params get_default_train_params() {
     struct train_params params;
     params.fn_vocab_model = "ggml-vic7b-uncensored-q4_0.bin";
@@ -828,7 +656,7 @@ struct train_params get_default_train_params() {
     return params;
 }

-void train_print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
+void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
     fprintf(stderr, "usage: %s [options]\n", argv[0]);
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
@@ -839,7 +667,7 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p
     fprintf(stderr, "\n");
 }

-bool train_params_parse(int argc, char ** argv, struct train_params * params) {
+bool params_parse(int argc, char ** argv, struct train_params * params) {
     bool invalid_param = false;
     std::string arg;
     struct train_params default_params = get_default_train_params();
@@ -870,17 +698,17 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) {
             }
             params->fn_llama2c_output_model = argv[i];
         } else if (arg == "-h" || arg == "--help") {
-            train_print_usage(argc, argv, &default_params);
+            print_usage(argc, argv, &default_params);
             exit(0);
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            train_print_usage(argc, argv, &default_params);
+            print_usage(argc, argv, &default_params);
             exit(1);
         }
     }
     if (invalid_param) {
         fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        train_print_usage(argc, argv, &default_params);
+        print_usage(argc, argv, &default_params);
         exit(1);
     }
@@ -889,7 +717,7 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) {
 int main(int argc, char ** argv) {
     struct train_params params = get_default_train_params();
-    if (!train_params_parse(argc, argv, &params)) {
+    if (!params_parse(argc, argv, &params)) {
         return 1;
     }
     Config config;
@@ -933,11 +761,10 @@ int main(int argc, char ** argv) {
     model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
     model.hparams.n_ctx   = params.n_ctx;
     model.hparams.n_embd  = config.dim; //params.n_embd;
-    model.hparams.n_mult  = params.n_mult;
+    model.hparams.n_mult  = 32;//params.n_mult;
     model.hparams.n_head  = config.n_heads; //params.n_head;
     model.hparams.n_layer = config.n_layers; //params.n_layer;
     model.hparams.n_rot   = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);
-
     print_params(&model.hparams);
     struct ggml_init_params lcparams;
     lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb);
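Worth noting why this patch hard-codes `n_mult = 32`: `get_n_ff()` derives the feed-forward width by rounding `2/3 * 4 * n_embd` up to a multiple of `n_mult`. A quick check of the arithmetic in a standalone sketch (the `n_embd = 288`, `hidden_dim = 768` pairing is assumed here to match llama2.c's small story checkpoints, so treat the concrete numbers as illustrative):

```cpp
// Reproduces the feed-forward sizing formula from get_n_ff():
// round 2/3 * 4 * n_embd up to a multiple of n_mult.
#include <cstdint>
#include <cstdio>

static uint32_t get_n_ff(uint32_t n_embd, uint32_t n_mult) {
    return ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult;
}

int main() {
    // 4*288 = 1152; 2*1152/3 = 768; already a multiple of 32 -> 768
    printf("n_ff(288, 32) = %u\n", get_n_ff(288, 32));
    return 0;
}
```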
From ff9fae57d1f71c0fc3c141fb7c53a93cd44f8812 Mon Sep 17 00:00:00 2001
From: Aniket
Date: Tue, 8 Aug 2023 13:45:00 -0400
Subject: [PATCH 14/30] updating makefile so test scripts are not compiled

---
 Makefile | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/Makefile b/Makefile
index 6fad72ab7f87f..5f8a0ae217773 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c my-tests convert-llama2c-to-ggml simple server embd-input-test
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch my-tests convert-llama2c-to-ggml simple server embd-input-test

 # Binaries only useful for tests
 TEST_TARGETS = tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
@@ -330,7 +330,7 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

 clean:
-	rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c my-tests convert-llama2c-to-ggml embd-input-test build-info.h $(TEST_TARGETS)
+	rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch my-tests convert-llama2c-to-ggml embd-input-test build-info.h $(TEST_TARGETS)

 #
 # Examples
@@ -373,9 +373,6 @@ embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-te
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-convert-llama2c: examples/convert-llama2c/convert-lamma-2c.cpp build-info.h ggml.o llama.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
 my-tests: examples/my-tests/my-tests.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

From 2a0138e5eaf678fa4156ccd60979aac5d5042373 Mon Sep 17 00:00:00 2001
From: Aniket
Date: Tue, 8 Aug 2023 13:52:20 -0400
Subject: [PATCH 15/30] updating readme with instructions for compilation and use

---
 examples/convert-llama2c-to-ggml/README.md | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 examples/convert-llama2c-to-ggml/README.md

diff --git a/examples/convert-llama2c-to-ggml/README.md b/examples/convert-llama2c-to-ggml/README.md
new file mode 100644
index 0000000000000..18293947a498c
--- /dev/null
+++ b/examples/convert-llama2c-to-ggml/README.md
@@ -0,0 +1,13 @@
+## Convert llama2.c model to ggml
+
+This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml-compatible format.
+
+To convert the model, first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository:
+
+`$ make -j`
+
+`$ ./convert-llama2c-to-ggml --vocab-model <ggml-vocab.bin> --llama2c-model <llama2.c model path> --llama2c-output-model <ggml output model path>`
+
+Now you can use the converted model with the command:
+
+`$ ./main -m <ggml output model path> -p "One day, Lily met a Shoggoth" -n 500 -c 256 -eps 1e-5`
\ No newline at end of file

From 9a09e6418f6f60c38d98bf09b82bce22e15c2f1a Mon Sep 17 00:00:00 2001
From: Aniket
Date: Tue, 8 Aug 2023 14:00:05 -0400
Subject: [PATCH 16/30] minor spacing update

---
 examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
index 5a208a4f27ad6..f7b144eed5f45 100644
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -16,7 +16,7 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-//////////////////////////////////////// llama.c model structs and functions to load models, alloc memory etc.
+//////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
 typedef struct {
     int dim; // transformer dimension
     int hidden_dim; // for ffn layers
@@ -582,9 +582,7 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
         stuff_karpathy_weights_into_gg(layer.wo            , &w->wo[i*row_length*row_length]);

         stuff_karpathy_weights_into_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
-
         stuff_karpathy_weights_into_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
-
         stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[i*row_length*n_ff]);
     }

From 223ddb77b395ebfccda1b9ab78390da58b5a0ab4 Mon Sep 17 00:00:00 2001
From: Aniket
Date: Tue, 8 Aug 2023 14:19:30 -0400
Subject: [PATCH 17/30] updating makefile so my initial tests are not compiled

---
 Makefile | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/Makefile b/Makefile
index 54a8382234456..d06843d00f3b1 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch my-tests convert-llama2c-to-ggml simple server embd-input-test
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test

 # Binaries only useful for tests
 TEST_TARGETS = tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
@@ -350,7 +350,7 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

 clean:
-	rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch my-tests convert-llama2c-to-ggml embd-input-test build-info.h $(TEST_TARGETS)
+	rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c-to-ggml embd-input-test build-info.h $(TEST_TARGETS)

 #
 # Examples
@@ -393,9 +393,6 @@ embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-te
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-my-tests: examples/my-tests/my-tests.cpp build-info.h ggml.o llama.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
 convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

From 088eb86fbe7195d448bc3d26bae6ebe35393b283 Mon Sep 17 00:00:00 2001
From: Aniket
Date: Tue, 8 Aug 2023 14:21:14 -0400
Subject: [PATCH 18/30] updating gitignore

---
 .gitignore | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index ed7fd2cac45ad..e345e64ed91e4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -40,8 +40,6 @@ models-mnt
 /perplexity
 /embedding
 /train-text-from-scratch
-/convert-llama2c
-/my-tests
 /convert-llama2c-to-ggml
 /simple
 /benchmark-matmult

From 08e94332fc5533ab779fc4735ef97d8ad880e1fe Mon Sep 17 00:00:00 2001
From: Aniket
Date: Tue, 8 Aug 2023 14:27:01 -0400
Subject: [PATCH 19/30] cleaning up some earlier files used for experiments

---
 examples/convert-llama2c/convert-lamma-2c.cpp |  676 ------
 examples/my-tests/mappings.md                 |   31 -
 examples/my-tests/my-tests.cpp                | 1820 -----------------
 3 files changed, 2527 deletions(-)
 delete mode 100644 examples/convert-llama2c/convert-lamma-2c.cpp
 delete mode 100644 examples/my-tests/mappings.md
 delete mode 100644 examples/my-tests/my-tests.cpp

diff --git a/examples/convert-llama2c/convert-lamma-2c.cpp b/examples/convert-llama2c/convert-lamma-2c.cpp
deleted file mode 100644
index bf282ab86ac4e..0000000000000
--- a/examples/convert-llama2c/convert-lamma-2c.cpp
+++ /dev/null
@@ -1,676 +0,0 @@
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include "llama.h"
-#include "ggml.h"
-
-typedef struct {
-    int dim; // transformer dimension
-    int hidden_dim; // for ffn layers
-    int n_layers; // number of layers
-    int n_heads; // number of query heads
-    int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery)
-    int vocab_size; // vocabulary size, usually 256 (byte-level)
-    int seq_len; // max sequence length
-} Config;
-
-typedef struct {
-    // token embedding table
-    float* token_embedding_table; // (vocab_size, dim)
-    // weights for rmsnorms
-    float* rms_att_weight; // (layer, dim) rmsnorm weights
-    float* rms_ffn_weight; // (layer, dim)
-    // weights for matmuls
-    float* wq; // (layer, dim, dim)
-    float* wk; // (layer, dim, dim)
-    float* wv; // (layer, dim, dim)
-    float* wo; // (layer, dim, dim)
-    // weights for ffn
-    float* w1; // (layer, hidden_dim, dim)
-    float* w2; // (layer, dim, hidden_dim)
-    float* w3; // (layer, hidden_dim, dim)
-    // final rmsnorm
-    float* rms_final_weight; // (dim,)
-    // freq_cis for RoPE relatively positional embeddings
-    float* freq_cis_real; // (seq_len, dim/2)
-    float* freq_cis_imag; // (seq_len, dim/2)
-} TransformerWeights;
-
-#ifdef __GNUC__
-#ifdef __MINGW32__
-__attribute__((format(gnu_printf, 1, 2)))
-#else
-__attribute__((format(printf, 1, 2)))
-#endif
-#endif
-static std::string format(const char * fmt, ...) {
{ - va_list ap, ap2; - va_start(ap, fmt); - va_copy(ap2, ap); - int size = vsnprintf(NULL, 0, fmt, ap); - GGML_ASSERT(size >= 0 && size < INT_MAX); - std::vector buf(size + 1); - int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); - GGML_ASSERT(size2 == size); - va_end(ap2); - va_end(ap); - return std::string(buf.data(), size); -} -struct llama_file { - // use FILE * so we don't have to re-open the file to mmap - FILE * fp; - size_t size; - - llama_file(const char * fname, const char * mode) { - fp = std::fopen(fname, mode); - if (fp == NULL) { - size = 0; - } else { - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); - } - } - - size_t tell() const { -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - GGML_ASSERT(ret != -1); // this really shouldn't fail - return (size_t) ret; - } - - void seek(size_t offset, int whence) { -#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - GGML_ASSERT(ret == 0); // same - } - - void read_raw(void * ptr, size_t size) { - if (size == 0) { - return; - } - errno = 0; - std::size_t ret = std::fread(ptr, size, 1, fp); - if (ferror(fp)) { - throw std::runtime_error(format("read error: %s", strerror(errno))); - } - if (ret != 1) { - throw std::runtime_error(std::string("unexpectedly reached end of file")); - } - } - - std::uint32_t read_u32() { - std::uint32_t ret; - read_raw(&ret, sizeof(ret)); - return ret; - } - - std::string read_string(std::uint32_t len) { - std::vector chars(len); - read_raw(chars.data(), len); - return std::string(chars.data(), len); - } - - void write_raw(const void * ptr, size_t size) { - if (size == 0) { - return; - } - errno = 0; - size_t ret = std::fwrite(ptr, size, 1, fp); - if (ret != 1) { - throw std::runtime_error(format("write error: %s", strerror(errno))); - } - } - - void write_u32(std::uint32_t val) { - write_raw(&val, sizeof(val)); - } - - ~llama_file() { - if (fp) { - std::fclose(fp); - } - } -}; - -struct my_llama_hparams { - uint32_t n_vocab = 32000; - uint32_t n_ctx = 512; // this is provided as user input? 
- uint32_t n_embd = 4096; - uint32_t n_mult = 4; - uint32_t n_head = 32; - uint32_t n_layer = 32; - uint32_t n_rot = 64; - - bool operator!=(const my_llama_hparams& other) const { - return memcmp(this, &other, sizeof(my_llama_hparams)); - } -}; -struct my_llama_layer { - // normalization - struct ggml_tensor * attention_norm; - - // attention - struct ggml_tensor * wq; - struct ggml_tensor * wk; - struct ggml_tensor * wv; - struct ggml_tensor * wo; - - // normalization - struct ggml_tensor * ffn_norm; - - // ff - struct ggml_tensor * w1; - struct ggml_tensor * w2; - struct ggml_tensor * w3; -}; -struct my_llama_model { - struct ggml_context * ctx = NULL; - - my_llama_hparams hparams; - - struct ggml_tensor * tok_embeddings; - - struct ggml_tensor * norm; - struct ggml_tensor * output; - - std::vector layers; - - uint32_t train_its = 0; - uint32_t train_samples = 0; - uint32_t train_tokens = 0; -}; -struct llama_vocab { - using id = int32_t; - using token = std::string; - - struct token_score { - token tok; - float score; - }; - - std::unordered_map token_to_id; - std::vector id_to_token; -}; - -struct train_params { - const char * fn_vocab_model; - const char * fn_train_data; - const char * fn_checkpoint_in; - const char * fn_checkpoint_out; - const char * fn_model_out; - - uint32_t seed; - - int n_ctx; - int n_embd; - int n_mult; - int n_head; - int n_layer; - int n_rotmax; - - int n_threads; - int n_batch; - int n_examples; - int n_predict; - - int print_info_interval; - int print_details_interval; - - bool samples_start_after_nl; - bool use_adam; - bool use_flash; - bool use_scratch; - - // only adam - int warmup; - int cos_decay_steps; - float cos_decay_restart; - float cos_decay_alpha; - - int lbfgs_n_iter; - int adam_n_iter; - float adam_alpha; - float adam_decay; - - int mem_model_gb; - int mem_compute_gb; - int mem_compute0_gb; - int mem_compute1_gb; -}; - -struct train_params get_default_train_params() { - struct train_params params; - params.fn_vocab_model = "ggml-vic7b-uncensored-q4_0.bin"; - params.fn_train_data = "shakespeare.txt"; - params.fn_checkpoint_in = "checkpoint.bin"; - params.fn_checkpoint_out = "checkpoint.bin"; - params.fn_model_out = "ggml-checkpoint-f32.bin"; - - params.seed = -1; - - params.n_ctx = 128; - params.n_embd = 256; - params.n_mult = 256; - params.n_head = 8; - params.n_layer = 16; - params.n_rotmax = 64; - - params.n_threads = 6; - params.n_batch = 8; - params.n_examples = 8; - params.n_predict = 1024; - - params.print_info_interval = 1; - params.print_details_interval = 2; - - params.samples_start_after_nl = false; - params.use_adam = true; - params.use_flash = true; - params.use_scratch = true; - - // only adam - params.warmup = 100; - params.cos_decay_steps = 1000; - params.cos_decay_restart = 1.1f; - params.cos_decay_alpha = 0.0f; - - params.lbfgs_n_iter = 16; - params.adam_n_iter = 16; - params.adam_alpha = 1e-3f; - params.adam_decay = 1e-3f; - - params.mem_model_gb = 2; - params.mem_compute_gb = 24; - params.mem_compute0_gb = 8; - params.mem_compute1_gb = 2; - - return params; -} - -void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { - if (tensor == NULL) { - file->write_u32(0); - file->write_u32(0); - file->write_u32(GGML_TYPE_F32); - file->seek((0-file->tell()) & 31, SEEK_CUR); - return; - } - const char * name = ggml_get_name(tensor); - uint32_t name_len = strlen(name); - uint32_t nd = tensor->n_dims; - uint32_t ne[4] = { (uint32_t)tensor->ne[0], - (uint32_t)tensor->ne[1], - (uint32_t)tensor->ne[2], - 
(uint32_t)tensor->ne[3] }; - file->write_u32(nd); - file->write_u32(name_len); - file->write_u32(tensor->type); - file->write_raw(ne, sizeof(ne[0]) * nd); - file->write_raw(name, name_len); - file->seek((0-file->tell()) & 31, SEEK_CUR); - file->write_raw(tensor->data, ggml_nbytes(tensor)); -} - -void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, const char * filename) { - struct llama_file file(filename, "wb"); - if (file.fp == NULL) { - return; - } - - // write_magic - file.write_u32(LLAMA_FILE_MAGIC); // magic - file.write_u32(LLAMA_FILE_VERSION); // version - // write_hparams - file.write_u32(model->hparams.n_vocab); - file.write_u32(model->hparams.n_embd); - file.write_u32(model->hparams.n_mult); - file.write_u32(model->hparams.n_head); - file.write_u32(model->hparams.n_layer); - file.write_u32(model->hparams.n_rot); - file.write_u32(LLAMA_FTYPE_ALL_F32); - // write_vocab - uint32_t n_vocab = model->hparams.n_vocab; - for (uint32_t i = 0; i < n_vocab; i++) { - const auto & token_score = vocab->id_to_token.at(i); - file.write_u32((uint32_t) token_score.tok.size()); - file.write_raw(token_score.tok.data(), token_score.tok.size()); - file.write_raw(&token_score.score, sizeof(token_score.score)); - } - // write tensors - write_tensor(&file, model->tok_embeddings); - write_tensor(&file, model->norm); - write_tensor(&file, model->output); - for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { - auto & layer = model->layers[i]; - - write_tensor(&file, layer.attention_norm); - write_tensor(&file, layer.wq); - write_tensor(&file, layer.wk); - write_tensor(&file, layer.wv); - write_tensor(&file, layer.wo); - write_tensor(&file, layer.ffn_norm); - write_tensor(&file, layer.w1); - write_tensor(&file, layer.w2); - write_tensor(&file, layer.w3); - } -} - -void print_config(Config* p){ - printf("----- Configs extracted from the header -------\n"); - printf("config.dim %d\n", p->dim); - printf("config.hidden_dim %d\n", p->hidden_dim); - printf("config.n_layers %d\n", p->n_layers); - printf("config.n_heads %d\n", p->n_heads ); - printf("config.n_kv_heads %d\n", p->n_kv_heads); - printf("config.vocab_size %d\n", p->vocab_size); - printf("config.seq_len %d\n", p->seq_len); - printf("----------------------------------------------\n"); -} - -void print_sample_weights(TransformerWeights *w){ - printf("----- Quick print of first of the weight vales of all the variables\n"); - printf("%f\n", w->token_embedding_table[0]); - printf("%f\n", w->rms_att_weight[0]); - printf("%f\n", w->rms_ffn_weight[0]); - - printf("%f\n", w->wq[0]); - printf("%f\n", w->wk[0]); - printf("%f\n", w->wv[0]); - printf("%f\n", w->wo[0]); - printf("%f\n", w->w1[0]); - printf("%f\n", w->w2[0]); - printf("%f\n", w->w3[0]); - printf("%f\n", w->rms_att_weight[0]); - printf("%f\n", w->freq_cis_real[0]); - printf("%f\n", w->freq_cis_imag[0]); - printf("------------------------------------------------------------------\n"); - - -} -void malloc_weights(TransformerWeights* w, Config* p) { - // we calloc instead of malloc to keep valgrind happy - w->token_embedding_table = new float[p->vocab_size * p->dim]();//calloc(p->vocab_size * p->dim, sizeof(float)); - w->rms_att_weight = new float[p->n_layers * p->dim](); //calloc(p->n_layers * p->dim, sizeof(float)); - w->rms_ffn_weight = new float[p->n_layers * p->dim](); //calloc(p->n_layers * p->dim, sizeof(float)); - w->wq = new float[p->n_layers * p->dim * p->dim](); //calloc(p->n_layers * p->dim * p->dim, sizeof(float)); - w->wk = new float[p->n_layers * p->dim * 
p->dim](); //calloc(p->n_layers * p->dim * p->dim, sizeof(float)); - w->wv = new float[p->n_layers * p->dim * p->dim](); //calloc(p->n_layers * p->dim * p->dim, sizeof(float)); - w->wo = new float[p->n_layers * p->dim * p->dim](); //calloc(p->n_layers * p->dim * p->dim, sizeof(float)); - w->w1 = new float[p->n_layers * p->hidden_dim * p->dim](); //calloc(p->n_layers * p->hidden_dim * p->dim, sizeof(float)); - w->w2 = new float[p->n_layers * p->hidden_dim * p->dim](); //calloc(p->n_layers * p->dim * p->hidden_dim, sizeof(float)); - w->w3 = new float[p->n_layers * p->hidden_dim * p->dim](); //calloc(p->n_layers * p->hidden_dim * p->dim, sizeof(float)); - w->rms_final_weight = new float[p->dim](); //calloc(p->dim, sizeof(float)); - w->freq_cis_real = new float[p->seq_len * p->dim / 2](); //calloc(p->seq_len * p->dim / 2, sizeof(float)); - w->freq_cis_imag = new float[p->seq_len * p->dim / 2](); //calloc(p->seq_len * p->dim / 2, sizeof(float)); - // ensure all mallocs went fine - // if (!w->token_embedding_table || !w->rms_att_weight || !w->rms_ffn_weight - // || !w->wq || !w->wk || !w->wv || !w->wo || !w->w1 || !w->w2 || !w->w3 || - // !w->rms_final_weight || !w->freq_cis_real || !w->freq_cis_imag) { - // printf("malloc failed!\n"); - // exit(1); - // } -} - -void free_weights(TransformerWeights* w) { - free(w->token_embedding_table); - free(w->rms_att_weight); - free(w->rms_ffn_weight); - free(w->wq); - free(w->wk); - free(w->wv); - free(w->wo); - free(w->w1); - free(w->w2); - free(w->w3); - free(w->rms_final_weight); - free(w->freq_cis_real); - free(w->freq_cis_imag); -} - -int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) { - if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast(p->vocab_size * p->dim)) return 1; - if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast(p->n_layers * p->dim)) return 1; - if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast(p->n_layers * p->dim * p->dim)) return 1; - if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast(p->n_layers * p->dim * p->dim)) return 1; - if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast(p->n_layers * p->dim * p->dim)) return 1; - if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast(p->n_layers * p->dim * p->dim)) return 1; - if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast(p->n_layers * p->dim)) return 1; - if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast(p->n_layers * p->dim * p->hidden_dim)) return 1; - if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast(p->n_layers * p->hidden_dim * p->dim)) return 1; - if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast(p->n_layers * p->dim * p->hidden_dim)) return 1; - if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast(p->dim)) return 1; - int head_size = p->dim / p->n_heads; - if (fread(w->freq_cis_real, sizeof(float), p->seq_len * head_size / 2, f) != static_cast(p->seq_len * head_size / 2)) return 1; - if (fread(w->freq_cis_imag, sizeof(float), p->seq_len * head_size / 2, f) != static_cast(p->seq_len * head_size / 2)) return 1; - return 0; -} - -int main(int argc, char *argv[]) { - - // poor man's C argparse - char *checkpoint = NULL; - char *tokenizer = NULL; - // float temperature = 0.9f; - // 'checkpoint' is necessary arg - 
if (argc < 3) { - printf("Usage: %s \n", argv[0]); - return 1; - } - checkpoint = argv[1]; - tokenizer = argv[2]; - // if (argc < 3) { - // printf("Usage: %s \n", argv[0]); - // return 1; - // } - // temperature is optional - // if (argc >= 3) { - // temperature = atof(argv[2]); - // } - // seed is optional - // if (argc >= 4) { - // unsigned int seed = atoi(argv[3]); - // srand(seed); - // } else { - // time_t current_time; - // time(&current_time); - // srand((unsigned int)current_time); - // } - - // read in the Karpathy model.bin file - Config config; // Configs are stashed in the bin file as header - TransformerWeights weights; - struct my_llama_model model; - { - FILE *file = fopen(checkpoint, "rb"); - if (!file) { - printf("Unable to open the checkpoint file %s!\n", checkpoint); - return 1; - } - else{ - printf("model file opened for reading...\n"); - } - // read in the config header - if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; } - printf("config file read..\n"); - print_config(&config); - // read in the Transformer weights - malloc_weights(&weights, &config); - printf("reading the opened model file...\n"); - if(checkpoint_init_weights(&weights, &config, file)) { return 1; } - print_sample_weights(&weights); - - // copy weights to ggml tensors. - //model.tok_embeddings <<< weights.token_embedding_table; - - - printf("Closing model file..bye...\n"); - fclose(file); - } - - // read in the tokenizer.bin file - // char** vocab_ak = (char**)malloc(config.vocab_size * sizeof(char*)); - // { - // FILE *file = fopen(tokenizer, "rb"); - // if (!file) { - // printf("Unable to open the tokenizer file tokenizer.bin! Run " - // "python tokenizer.py to convert tokenizer.model -> tokenizer.bin\n"); - // return 1; - // } - // int len; - // printf("karpathy vocab size = %d\n", config.vocab_size); - - // for (int i = 0; i < config.vocab_size; i++) { - // if(fread(&len, sizeof(int), 1, file) != 1) { return 1; } - // vocab_ak[i] = (char *)malloc(len + 1); - // if(fread(vocab_ak[i], len, 1, file) != 1) { return 1; } - // vocab_ak[i][len] = '\0'; // add the string terminating token - // printf("len = %d, %s\n", len, vocab_ak[i]); - - // } - // fclose(file); - // } - - //TODO:------------------------------------------------------------------------------- - - // struct train_params params = get_default_train_params(); - // struct llama_context_params llama_params = llama_context_default_params(); - // struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, llama_params); - // struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params); - // struct llama_vocab vocab; - // { - // std::vector strings; - // std::vector scores; - // int n_vocab = llama_n_vocab(lctx); - // strings.resize(n_vocab, NULL); - // scores.resize(n_vocab, 0); - // n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab); - // GGML_ASSERT(n_vocab == llama_n_vocab(lctx)); - // vocab.id_to_token.resize(n_vocab); - // for (int i=0; idim); - // printf("config.hidden_dim %d\n", p->hidden_dim); - // printf("config.n_layers %d\n", p->n_layers); - // printf("config.n_heads %d\n", p->n_heads ); - // printf("config.n_kv_heads %d\n", p->n_kv_heads); - // printf("config.vocab_size %d\n", p->vocab_size); - // printf("config.seq_len %d\n", p->seq_len); - - // file.write_u32(model->hparams.n_vocab); - file.write_u32(config.vocab_size); // 32000 - - // file.write_u32(model->hparams.n_embd); - file.write_u32(config.dim); /// <<<<<<<<<<<<<< NEEDS CHECKING - - //
file.write_u32(model->hparams.n_mult); - file.write_u32(config.dim); /// <<<<<<<<<<<<<< JUST PLACEHOLDER - - // file.write_u32(model->hparams.n_head); - file.write_u32(config.n_heads); - - // file.write_u32(model->hparams.n_layer); - file.write_u32(config.n_layers); - - // file.write_u32(model->hparams.n_rot); - file.write_u32(config.dim); /// <<<<<<<<<<<<<< JUST PLACEHOLDER - - file.write_u32(LLAMA_FTYPE_ALL_F32); - - // write_vocab ///////////////////////////////////////////////////////////////// - char** vocab_ak = (char**)malloc(config.vocab_size * sizeof(char*)); - { - FILE *file_tok_ak = fopen(tokenizer, "rb"); - if (!file_tok_ak) { - printf("Unable to open the tokenizer file tokenizer.bin! Run " - "python tokenizer.py to convert tokenizer.model -> tokenizer.bin\n"); - return 1; - } - int len; - printf("karpathy vocab size = %d\n", config.vocab_size); - - for (int i = 0; i < config.vocab_size; i++) { - if(fread(&len, sizeof(int), 1, file_tok_ak) != 1) { return 1; } - file.write_u32((uint32_t) len); - - vocab_ak[i] = (char *)malloc(len + 1); - if(fread(vocab_ak[i], len, 1, file_tok_ak) != 1) { return 1; } - vocab_ak[i][len] = '\0'; // add the string terminating token - file.write_raw(vocab_ak[i], len+1); - float x = 0.0f; - file.write_raw(&x, sizeof(float)); - // printf("len = %d, %s\n", len, vocab_ak[i]); - - } - fclose(file_tok_ak); - } - - // uint32_t n_vocab = config.vocab_size;//model->hparams.n_vocab; - // for (uint32_t i = 0; i < n_vocab; i++) { - // const auto & token_score = vocab->id_to_token.at(i); - // file.write_u32((uint32_t) token_score.tok.size()); - // file.write_raw(token_score.tok.data(), token_score.tok.size()); - // file.write_raw(&token_score.score, sizeof(token_score.score)); - // } - ///////////////////////////////////////////////////////////////// - - // write tensors - write_tensor(&file, model.tok_embeddings); - // write_tensor(&file, model.norm); - // write_tensor(&file, model.output); - // for (int i = 0; i < config.n_layers; ++i) { - // auto & layer = model.layers[i]; - - // write_tensor(&file, layer.attention_norm); - // write_tensor(&file, layer.wq); - // write_tensor(&file, layer.wk); - // write_tensor(&file, layer.wv); - // write_tensor(&file, layer.wo); - // write_tensor(&file, layer.ffn_norm); - // write_tensor(&file, layer.w1); - // write_tensor(&file, layer.w2); - // write_tensor(&file, layer.w3); - // } - - printf("\n"); - free_weights(&weights); - free(vocab_ak); - return 0; - -} \ No newline at end of file diff --git a/examples/my-tests/mappings.md b/examples/my-tests/mappings.md deleted file mode 100644 index f0a561a88a147..0000000000000 --- a/examples/my-tests/mappings.md +++ /dev/null @@ -1,31 +0,0 @@ -Variable mapping from llama.c to ggml llama.cpp - -config variables in llama.c -`dim`, `vocab_size`, `num_layers`, `num_heads`, `num_kv_heads`, `seq_length` - -| llama.c (karpathy) | ggml (gg) |dim| -| ------------- | ------------- |-- | -| `dim` | `n_embed` | Transformer dim | -| `hidden_dim` | `n_ff` (calculated) | ff hidden dim | -| `n_layers` | `n_layers` | number of decoder layers | -| `n_heads` | `n_head` | number of heads | -| `n_kv_heads` | `-` | | -| `vocab_size` | `n_vocab` | | -| `seq_len` | `-` | | -| --- | --- | --- | -| `rms_att_weight` | `attention_norm` | `num_layers` x `dim` | -| `rms_ffn_weight` | `ffn_norm` | `num_layers` x `dim` | -| `wq` | `ffn_norm` | `num_layers` x `dim` x `dim` | -| `qk` | `ffn_norm` | `num_layers` x `dim` x `dim` | -| `wv` | `ffn_norm` | `num_layers` x `dim` x `dim` | -| `wo` | `wo` | `num_layers` x 
`dim` x `dim` | -| `w1` | `w1` | `num_layers` x `hidden_dim` x `dim` | -| `w2` | `w2` | `num_layers` x `dim` x `hidden_dim` | -| `w3` | `w3` | `num_layers` x `hidden_dim` x `dim` | -| `token_embedding_table` | `tok_embeddings` | `vocab_size` x `dim` | -| `rms_final_weight` | `?` | `dim` | -| `freq_cis_real` | `?` | `seq_len` x `dim/2` | -| `freq_cis_img ` | `?` | `seq_len` x `dim/2` | - - - diff --git a/examples/my-tests/my-tests.cpp b/examples/my-tests/my-tests.cpp deleted file mode 100644 index 0f3e5082703f7..0000000000000 --- a/examples/my-tests/my-tests.cpp +++ /dev/null @@ -1,1820 +0,0 @@ -#include "ggml.h" -#include "llama.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -// static const float rms_norm_eps = 1e-6f; - -typedef struct { - // token embedding table - float* token_embedding_table; // (vocab_size, dim) - // weights for rmsnorms - float* rms_att_weight; // (layer, dim) rmsnorm weights - float* rms_ffn_weight; // (layer, dim) - // weights for matmuls - float* wq; // (layer, dim, dim) - float* wk; // (layer, dim, dim) - float* wv; // (layer, dim, dim) - float* wo; // (layer, dim, dim) - // weights for ffn - float* w1; // (layer, hidden_dim, dim) - float* w2; // (layer, dim, hidden_dim) - float* w3; // (layer, hidden_dim, dim) - // final rmsnorm - float* rms_final_weight; // (dim,) - // freq_cis for RoPE relatively positional embeddings - float* freq_cis_real; // (seq_len, dim/2) - float* freq_cis_imag; // (seq_len, dim/2) -} TransformerWeights; - -struct random_normal_distribution { - std::mt19937 gen; - std::normal_distribution rd; - float min; - float max; -}; - -struct random_uniform_distribution { - std::mt19937 gen; - std::uniform_real_distribution rd; -}; - -void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) { - rnd->gen = std::mt19937(seed); - rnd->rd = std::normal_distribution{mean, std}; - rnd->min = min; - rnd->max = max; -} - -void init_random_uniform_distribution(struct random_uniform_distribution * rnd, int seed, float min, float max) { - rnd->gen = std::mt19937(seed); - rnd->rd = std::uniform_real_distribution{min, max}; -} - -int clamp(const int v, const int min, const int max) { - return ((v < min) ? (min) : (v > max) ? (max) : v); -} - -float fclamp(const float v, const float min, const float max) { - return ((v < min) ? (min) : (v > max) ? 
(max) : v); -} - -float frand() { - return (float)rand()/(float)RAND_MAX; -} - -float frand_normal(struct random_normal_distribution * rnd) { - return fclamp(rnd->rd(rnd->gen), rnd->min, rnd->max); -} - -float frand_uniform(struct random_uniform_distribution * rnd) { - return rnd->rd(rnd->gen); -} - -void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); - - if (plan.work_size > 0) { - buf.resize(plan.work_size); - plan.work_data = buf.data(); - } - - ggml_graph_compute(graph, &plan); -} - -struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { - float scale = 1.0f; // xavier - switch (tensor->n_dims) { - case 1: - scale /= sqrtf(tensor->ne[0]); - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); - *dst = scale * frand_normal(rnd); - } - break; - case 2: - scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - *dst = scale * frand_normal(rnd); - } - } - break; - case 3: - scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); - *dst = scale * frand_normal(rnd); - } - } - } - break; - case 4: - scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); - for (int i3 = 0; i3 < tensor->ne[3]; i3++) { - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); - *dst = scale * frand_normal(rnd); - } - } - } - } - break; - default: - assert(false); - }; - return tensor; -} - -struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) { - switch (tensor->n_dims) { - case 1: - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); - *dst = frand_uniform(rnd); - } - break; - case 2: - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - *dst = frand_uniform(rnd); - } - } - break; - case 3: - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); - *dst = frand_uniform(rnd); - } - } - } - break; - case 4: - for (int i3 = 0; i3 < tensor->ne[3]; i3++) { - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); - *dst = frand_uniform(rnd); - } - } - } - } - break; - default: - assert(false); - }; - return tensor; -} - -struct llama_vocab { - using id = int32_t; - using token = std::string; - - struct token_score { - token tok; - float score; - }; - - 
std::unordered_map token_to_id; - std::vector id_to_token; -}; - -struct my_llama_hparams { - uint32_t n_vocab = 32000; - uint32_t n_ctx = 512; // this is provided as user input? - uint32_t n_embd = 4096; - uint32_t n_mult = 4; - uint32_t n_head = 32; - uint32_t n_layer = 32; - uint32_t n_rot = 64; - - bool operator!=(const my_llama_hparams& other) const { - return memcmp(this, &other, sizeof(my_llama_hparams)); - } -}; - -struct my_llama_layer { - // normalization - struct ggml_tensor * attention_norm; - - // attention - struct ggml_tensor * wq; - struct ggml_tensor * wk; - struct ggml_tensor * wv; - struct ggml_tensor * wo; - - // normalization - struct ggml_tensor * ffn_norm; - - // ff - struct ggml_tensor * w1; - struct ggml_tensor * w2; - struct ggml_tensor * w3; -}; - -struct my_llama_kv_cache { - struct ggml_context * ctx = NULL; - - struct ggml_tensor * k; - struct ggml_tensor * v; - - // llama_ctx_buffer buf; - - int n; // number of tokens currently in the cache -}; - -struct my_llama_model { - struct ggml_context * ctx = NULL; - - my_llama_hparams hparams; - - struct ggml_tensor * tok_embeddings; - - struct ggml_tensor * norm; - struct ggml_tensor * output; - - std::vector layers; - - uint32_t train_its = 0; - uint32_t train_samples = 0; - uint32_t train_tokens = 0; -}; - -uint32_t get_n_ff(const struct my_llama_hparams* hparams) { - const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult; - return n_ff; -} - -void print_params(struct my_llama_hparams * params) { - printf("%s: n_vocab: %d\n", __func__, params->n_vocab); - printf("%s: n_ctx: %d\n", __func__, params->n_ctx); - printf("%s: n_embd: %d\n", __func__, params->n_embd); - printf("%s: n_mult: %d\n", __func__, params->n_mult); - printf("%s: n_head: %d\n", __func__, params->n_head); - printf("%s: n_ff: %d\n", __func__, get_n_ff(params)); - printf("%s: n_layer: %d\n", __func__, params->n_layer); - printf("%s: n_rot: %d\n", __func__, params->n_rot); -} - -void init_model(struct my_llama_model * model) { - const auto & hparams = model->hparams; - - const uint32_t n_embd = hparams.n_embd; - const uint32_t n_layer = hparams.n_layer; - const uint32_t n_vocab = hparams.n_vocab; - - const uint32_t n_ff = get_n_ff(&hparams); - - struct ggml_context * ctx = model->ctx; - - model->train_its = 0; - model->train_samples = 0; - model->train_tokens = 0; - // printf("FROM INIT_MODEL BHAI...\n\n\n"); - // print_params(&model->hparams); - model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); - printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab); - - model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - printf("[%s:GG] Allocating [%d] float space for model->norm\n",__func__,n_embd); - - model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); - printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab); - - // printing the per-layer allocations here so we dont print in the for loop. 
- printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wq for [%d] layers\n",__func__,n_embd, n_embd, n_embd * n_embd, n_layer); - printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wk for [%d] layers\n",__func__,n_embd, n_embd, n_embd * n_embd, n_layer); - printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wv for [%d] layers\n",__func__,n_embd, n_embd, n_embd * n_embd, n_layer); - printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wo for [%d] layers\n",__func__,n_embd, n_embd, n_embd * n_embd, n_layer); - - printf("[%s:GG] Allocating [%d] float space for layer.ffn_norm for [%d] layers\n",__func__,n_embd, n_layer); - - printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w1 for [%d] layers\n",__func__,n_embd, n_ff, n_embd * n_ff, n_layer); - printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w2 for [%d] layers\n",__func__,n_ff, n_embd, n_ff * n_embd, n_layer); - printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w3 for [%d] layers\n",__func__,n_embd, n_ff, n_embd * n_ff, n_layer); - - - ggml_set_name(model->tok_embeddings, "tok_embeddings.weight"); - ggml_set_name(model->norm, "norm.weight"); - ggml_set_name(model->output, "output.weight"); - - model->layers.resize(n_layer); - for (uint32_t i = 0; i < n_layer; ++i) { - auto & layer = model->layers[i]; - - std::string layers_i = "layers." + std::to_string(i); - - layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); - layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); - layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); - layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); - - layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); - layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); - layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); - - ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str()); - - ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str()); - ggml_set_name(layer.wk, (layers_i + ".attention.wk.weight").c_str()); - ggml_set_name(layer.wv, (layers_i + ".attention.wv.weight").c_str()); - ggml_set_name(layer.wo, (layers_i + ".attention.wo.weight").c_str()); - - ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str()); - - ggml_format_name(layer.w1, "%s.feed_forward.w1.weight", layers_i.c_str()); - ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str()); - ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str()); - } -} - -void set_param_model(struct my_llama_model * model) { - const auto& hparams = model->hparams; - - const uint32_t n_layer = hparams.n_layer; - - struct ggml_context* ctx = model->ctx; - - ggml_set_param(ctx, model->tok_embeddings); - ggml_set_param(ctx, model->norm); - ggml_set_param(ctx, model->output); - - for (uint32_t i = 0; i < n_layer; ++i) { - auto & layer = model->layers[i]; - - ggml_set_param(ctx, layer.attention_norm); - ggml_set_param(ctx, layer.wq); - ggml_set_param(ctx, layer.wk); - ggml_set_param(ctx, layer.wv); - ggml_set_param(ctx, layer.wo); - ggml_set_param(ctx, layer.ffn_norm); - ggml_set_param(ctx, layer.w1); - ggml_set_param(ctx, layer.w2); - ggml_set_param(ctx, layer.w3); - } -} - - -bool init_kv_cache(struct my_llama_kv_cache* cache, struct my_llama_model * 
model, int n_batch) { - const auto & hparams = model->hparams; - - const uint32_t n_ctx = hparams.n_ctx; - const uint32_t n_embd = hparams.n_embd; - const uint32_t n_layer = hparams.n_layer; - - const int64_t n_mem = n_layer*n_ctx*n_batch; - const int64_t n_elements = n_embd*n_mem; - - // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); - - // struct ggml_init_params params; - // params.mem_size = cache.buf.size; - // params.mem_buffer = cache.buf.addr; - // params.no_alloc = false; - if (!cache->ctx) { - struct ggml_init_params params; - params.mem_size = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024; - params.mem_buffer = NULL; - params.no_alloc = false; - - cache->ctx = ggml_init(params); - - if (!cache->ctx) { - fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__); - return false; - } - } - - cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); - cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); - - return true; -} - - -void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { - GGML_ASSERT(tensor->n_dims == 1); - GGML_ASSERT(tensor->ne[0] == ne0); -} - -void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) { - GGML_ASSERT(tensor->n_dims == 2); - GGML_ASSERT(tensor->ne[0] == ne0); - GGML_ASSERT(tensor->ne[1] == ne1); -} - -void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) { - GGML_ASSERT(tensor->n_dims == 3); - GGML_ASSERT(tensor->ne[0] == ne0); - GGML_ASSERT(tensor->ne[1] == ne1); - GGML_ASSERT(tensor->ne[2] == ne2); -} - -void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { - GGML_ASSERT(tensor->n_dims == 4); - GGML_ASSERT(tensor->ne[0] == ne0); - GGML_ASSERT(tensor->ne[1] == ne1); - GGML_ASSERT(tensor->ne[2] == ne2); - GGML_ASSERT(tensor->ne[3] == ne3); -} - -void set_f32_3d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int64_t i2, float value) { - float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); - *ptr = value; -} - -void set_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, float value) { - float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - *ptr = value; -} - -void set_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int32_t value) { - int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - *ptr = value; -} - -float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { - float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - return *ptr; -} - -int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { - int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - return *ptr; -} - -void print_row(struct ggml_tensor * probs, int i) { - for (int k = 0; k < probs->ne[0]; ++k) { - float p = get_f32_2d(probs, k, i); - printf(" %.2f", p); - } - printf("\n"); -} - -void print_matrix(struct ggml_tensor * probs) { - assert(probs->n_dims == 2); - for (int i = 0; i < probs->ne[1]; ++i) { - for (int k = 0; k < probs->ne[0]; ++k) { - float p = get_f32_2d(probs, k, i); - printf(" %.2f", p); - } - printf("\n"); - } -} - - -void print_token(struct llama_context * ctx, llama_token token) { - printf("%s", llama_token_to_str(ctx, token)); -} - -void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) { - for (int i=0; ine[0]; ++i) { - 
int token = ggml_get_i32_1d(tokens, i); - print_token(ctx, token); - } -} - -void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens) { - for (int i1=0; i1ne[1]; ++i1) { - //int num_newline = 0; - for (int i0=0; i0ne[0]; ++i0) { - int token = get_i32_2d(tokens, i0, i1); - print_token(ctx, token); - // bool isnl = (token == llama_token_nl()); - // if (isnl) { - // ++num_newline; - // } - // if (isnl) { - // if (num_newline < 2) { - // print_token(ctx, token); - // } else { - // printf("\\n"); - // } - // } else { - // print_token(ctx, token); - // } - } - printf("\n--\n"); - } -} - -void get_example_targets(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { - int n_tokens = tokens_input->ne[0]; - int n_vocab = target_logits->ne[0]; - - size_t sample = train_samples[example_id % n_train_samples]; - GGML_ASSERT(sample+n_tokens-1 < n_train_data); - - ggml_set_f32(target_logits, -1.0f/n_vocab); - ggml_set_f32(target_probs, 0.0f); - ggml_set_i32_1d(tokens_input, 0, llama_token_bos()); - for (int i=1; in_dims == 2); - GGML_ASSERT(target_logits->n_dims == 3); - GGML_ASSERT(target_probs->n_dims == 3); - int n_vocab = target_logits->ne[0]; - int n_tokens = tokens_input->ne[0]; - int n_batch = tokens_input->ne[1]; - GGML_ASSERT(n_tokens == target_logits->ne[1]); - GGML_ASSERT(n_batch == target_logits->ne[2]); - GGML_ASSERT(n_vocab == target_probs->ne[0]); - GGML_ASSERT(n_tokens == target_probs->ne[1]); - GGML_ASSERT(n_batch == target_probs->ne[2]); - - ggml_set_f32(target_logits, -1.0f/n_vocab); - ggml_set_f32(target_probs, 0.0f); - for (int k=0; kne[0]; - int n_vocab = target_logits->ne[0]; - for (int i=0; i= 0 && size < INT_MAX); - std::vector buf(size + 1); - int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); - GGML_ASSERT(size2 == size); - va_end(ap2); - va_end(ap); - return std::string(buf.data(), size); -} - -struct llama_file { - // use FILE * so we don't have to re-open the file to mmap - FILE * fp; - size_t size; - - llama_file(const char * fname, const char * mode) { - fp = std::fopen(fname, mode); - if (fp == NULL) { - size = 0; - } else { - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); - } - } - - size_t tell() const { -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - GGML_ASSERT(ret != -1); // this really shouldn't fail - return (size_t) ret; - } - - void seek(size_t offset, int whence) { -#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - GGML_ASSERT(ret == 0); // same - } - - void read_raw(void * ptr, size_t size) { - if (size == 0) { - return; - } - errno = 0; - std::size_t ret = std::fread(ptr, size, 1, fp); - if (ferror(fp)) { - throw std::runtime_error(format("read error: %s", strerror(errno))); - } - if (ret != 1) { - throw std::runtime_error(std::string("unexpectedly reached end of file")); - } - } - - std::uint32_t read_u32() { - std::uint32_t ret; - read_raw(&ret, sizeof(ret)); - return ret; - } - - std::string read_string(std::uint32_t len) { - std::vector chars(len); - read_raw(chars.data(), len); - return std::string(chars.data(), len); - } - - void write_raw(const void * ptr, size_t size) { - if (size == 0) { - return; - } - errno = 0; - size_t ret = std::fwrite(ptr, size, 1, fp); - if (ret != 1) { - throw std::runtime_error(format("write 
error: %s", strerror(errno))); - } - } - - void write_u32(std::uint32_t val) { - write_raw(&val, sizeof(val)); - } - - ~llama_file() { - if (fp) { - std::fclose(fp); - } - } -}; - -int tokenize_file(struct llama_context * lctx, const char * filename, std::vector& out) { - struct llama_file f(filename, "rb"); - - std::vector buf; - buf.resize(f.size+1); - - f.read_raw(buf.data(), f.size); - buf[f.size] = '\0'; - - out.resize(buf.size()); - - int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false); - if (n_tokens >= 0) { - out.resize(n_tokens); - } - - bool verify = false; - if (verify) { - const char * in = buf.data(); - const char * end = buf.data() + buf.size(); - for (int i = 0; i < (int) out.size(); ++i) { - const char * s = llama_token_to_str(lctx, out[i]); - int len = strlen(s); - if (in >= end) { - printf("%s: unexpected end of original text.\n", __func__); - break; - } - const bool matches = (strncmp(in, s, len) == 0); - if (matches) { - in += len; - } else { - printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s); - } - } - } - - return n_tokens; -} - -void shuffle_ints(int * begin, int * end) { - if (end <= begin) return; - int max=begin[0]; - for (int i=1; i max) { - max = begin[i]; - } - } - std::vector vals; - vals.resize(max+1); - for (int i=0; i candidates; - llama_token_data_array candidates_p; - -}; - -void init_sampler(struct my_llama_sampler * sampler, struct llama_context * ctx) { - sampler->ctx = ctx; - sampler->n_vocab = llama_n_vocab(sampler->ctx); - sampler->n_ctx = llama_n_ctx(sampler->ctx); - sampler->mirostat_mu = 2.0f * sampler->params.mirostat_tau; -} - -llama_token sample(struct my_llama_sampler * sampler, float * logits, const llama_token * last_tokens, int n_last_tokens) { - GGML_ASSERT(sampler->ctx != NULL); - - struct llama_context * ctx = sampler->ctx; - - sampler->candidates.resize(sampler->n_vocab); - for (llama_token token_id = 0; token_id < sampler->n_vocab; ++token_id) { - sampler->candidates[token_id].id = token_id; - sampler->candidates[token_id].logit = logits[token_id]; - sampler->candidates[token_id].p = 0.0; - } - - llama_token_data_array * candidates_p = & sampler->candidates_p; - - candidates_p->data = sampler->candidates.data(); - candidates_p->size = sampler->candidates.size(); - candidates_p->sorted = false; - - const auto params = sampler->params; - - // Apply penalties - const float nl_logit = logits[llama_token_nl()]; - - const int n_last = std::min(std::min(n_last_tokens, params.repeat_last_n), sampler->n_ctx); - - llama_sample_repetition_penalty( - ctx, - candidates_p, - last_tokens + n_last_tokens - n_last, - n_last, - params.repeat_penalty); - llama_sample_frequency_and_presence_penalties( - ctx, - candidates_p, - last_tokens + n_last_tokens - n_last, - n_last, - params.alpha_frequency, - params.alpha_presence); - - if (!params.penalize_nl) { - logits[llama_token_nl()] = nl_logit; - } - - llama_token token = 0; - if (params.temp <= 0) { - // Greedy sampling - token = llama_sample_token_greedy(ctx, candidates_p); - } else { - if (params.mirostat == 1) { - int mirostat_m = 100; - llama_sample_temperature(ctx, candidates_p, params.temp); - token = llama_sample_token_mirostat(ctx, candidates_p, params.mirostat_tau, params.mirostat_eta, mirostat_m, &sampler->mirostat_mu); - } else if (params.mirostat == 2) { - llama_sample_temperature(ctx, candidates_p, params.temp); - token = llama_sample_token_mirostat_v2(ctx, candidates_p, params.mirostat_tau, params.mirostat_eta, 
&sampler->mirostat_mu); - } else { - // Temperature sampling - llama_sample_top_k (ctx, candidates_p, params.top_k, 1); - llama_sample_tail_free (ctx, candidates_p, params.tfs_z, 1); - llama_sample_typical (ctx, candidates_p, params.typical_p, 1); - - llama_sample_top_p (ctx, candidates_p, params.top_p, 1); - llama_sample_temperature (ctx, candidates_p, params.temp); - token = llama_sample_token(ctx, candidates_p); - } - } - return token; -} - -void set_logits_masked(struct ggml_tensor * logits, std::vector& mask, float value) { - GGML_ASSERT(logits->ne[0] == (int64_t) mask.size()); - for (int i2 = 0; i2 < logits->ne[2]; ++i2) { - for (int i1 = 0; i1 < logits->ne[1]; ++i1) { - for (int i0 = 0; i0 < logits->ne[0]; ++i0) { - if (!mask[i0]) continue; - float * ptr = (float *) ((char *) logits->data + i2*logits->nb[2] + i1*logits->nb[1] + i0*logits->nb[0]); - *ptr = value; - } - } - } -} - -void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { - if (tensor == NULL) { - file->write_u32(0); - file->write_u32(0); - file->write_u32(GGML_TYPE_F32); - file->seek((0-file->tell()) & 31, SEEK_CUR); - return; - } - const char * name = ggml_get_name(tensor); - uint32_t name_len = strlen(name); - uint32_t nd = tensor->n_dims; - uint32_t ne[4] = { (uint32_t)tensor->ne[0], - (uint32_t)tensor->ne[1], - (uint32_t)tensor->ne[2], - (uint32_t)tensor->ne[3] }; - file->write_u32(nd); - file->write_u32(name_len); - file->write_u32(tensor->type); - file->write_raw(ne, sizeof(ne[0]) * nd); - file->write_raw(name, name_len); - file->seek((0-file->tell()) & 31, SEEK_CUR); - file->write_raw(tensor->data, ggml_nbytes(tensor)); -} - -void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) { - int32_t nd = file->read_u32(); - GGML_ASSERT(nd == tensor->n_dims); - - uint32_t name_len = file->read_u32(); - enum ggml_type type = (enum ggml_type) file->read_u32(); - GGML_ASSERT(type == tensor->type); - - uint32_t ne[4]; - file->read_raw(ne, sizeof(ne[0]) * nd); - for (int i=0; ine[i]); - } - - std::string name = file->read_string(name_len); - GGML_ASSERT(strncmp(ggml_get_name(tensor), name.c_str(), sizeof(tensor->name)-1) == 0); - - file->seek((0-file->tell()) & 31, SEEK_CUR); - file->read_raw(tensor->data, ggml_nbytes(tensor)); -} - -void write_opt_context(struct llama_file * file, struct ggml_opt_context * opt) { - const uint32_t version = 0; - GGML_ASSERT(opt->nx >= 0); - GGML_ASSERT(opt->iter >= 0); - file->write_u32(version); - file->write_raw(&opt->params, sizeof(opt->params)); - file->write_raw(&opt->nx, sizeof(opt->nx)); - file->write_raw(&opt->iter, sizeof(opt->iter)); - file->write_u32((uint32_t) opt->just_initialized); - switch (opt->params.type) { - case GGML_OPT_ADAM: - { - GGML_ASSERT(opt->adam.x != NULL); - write_tensor(file, opt->adam.x); - write_tensor(file, opt->adam.g1); - write_tensor(file, opt->adam.g2); - write_tensor(file, opt->adam.m); - write_tensor(file, opt->adam.v); - write_tensor(file, opt->adam.mh); - write_tensor(file, opt->adam.vh); - write_tensor(file, opt->adam.pf); - file->write_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); - file->write_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); - file->write_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); - } break; - case GGML_OPT_LBFGS: - { - GGML_ASSERT(opt->adam.x != NULL); - write_tensor(file, opt->lbfgs.x); - write_tensor(file, opt->lbfgs.xp); - write_tensor(file, opt->lbfgs.g); - write_tensor(file, opt->lbfgs.gp); - write_tensor(file, opt->lbfgs.d); - write_tensor(file, 
opt->lbfgs.pf); - write_tensor(file, opt->lbfgs.lmal); - write_tensor(file, opt->lbfgs.lmys); - write_tensor(file, opt->lbfgs.lms); - write_tensor(file, opt->lbfgs.lmy); - file->write_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); - file->write_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); - file->write_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); - file->write_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); - file->write_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); - file->write_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); - } break; - } -} - -void read_opt_context(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { - uint32_t version = file->read_u32(); - GGML_ASSERT(version == 0); - - file->read_raw(&opt->params, sizeof(opt->params)); - file->read_raw(&opt->nx, sizeof(opt->nx)); - ggml_opt_init(ctx, opt, opt->params, opt->nx); - - file->read_raw(&opt->iter, sizeof(opt->iter)); - opt->just_initialized = (bool) file->read_u32(); - - switch (opt->params.type) { - case GGML_OPT_ADAM: - { - read_tensor(file, opt->adam.x); - read_tensor(file, opt->adam.g1); - read_tensor(file, opt->adam.g2); - read_tensor(file, opt->adam.m); - read_tensor(file, opt->adam.v); - read_tensor(file, opt->adam.mh); - read_tensor(file, opt->adam.vh); - if (opt->adam.pf) { read_tensor(file, opt->adam.pf); } - file->read_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); - file->read_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); - file->read_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); - } break; - case GGML_OPT_LBFGS: - { - GGML_ASSERT(opt->adam.x != NULL); - read_tensor(file, opt->lbfgs.x); - read_tensor(file, opt->lbfgs.xp); - read_tensor(file, opt->lbfgs.g); - read_tensor(file, opt->lbfgs.gp); - read_tensor(file, opt->lbfgs.d); - if (opt->lbfgs.pf) { read_tensor(file, opt->lbfgs.pf); } - read_tensor(file, opt->lbfgs.lmal); - read_tensor(file, opt->lbfgs.lmys); - read_tensor(file, opt->lbfgs.lms); - read_tensor(file, opt->lbfgs.lmy); - file->read_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); - file->read_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); - file->read_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); - file->read_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); - file->read_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); - file->read_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); - } break; - } -} - -bool load_checkpoint(struct my_llama_model * model, struct ggml_opt_context * opt, const char * filename, bool init) { - struct llama_file file(filename, "rb"); - - uint32_t magic; - uint32_t version; - - uint32_t train_its = 0; - uint32_t train_samples = 0; - uint32_t train_tokens = 0; - - if (file.fp) { - printf("%s: Loading model from '%s'.\n", __func__, filename); - magic = file.read_u32(); - GGML_ASSERT(magic == 'ggcp'); - version = file.read_u32(); - GGML_ASSERT(version == 0); - train_its = file.read_u32(); - train_samples = file.read_u32(); - train_tokens = file.read_u32(); - model->hparams.n_vocab = file.read_u32(); - model->hparams.n_embd = file.read_u32(); - model->hparams.n_mult = file.read_u32(); - model->hparams.n_head = file.read_u32(); - model->hparams.n_layer = file.read_u32(); - model->hparams.n_rot = file.read_u32(); - print_params(&model->hparams); - } - - if (init) { - init_model(model); - } - - if (file.fp) { - model->train_its = train_its; - model->train_samples = train_samples; - model->train_tokens = train_tokens; - } - - printf("%s: Training iterations: %u.\n", __func__, 
model->train_its); - printf("%s: Training samples: %u.\n", __func__, model->train_samples); - printf("%s: Training tokens: %u.\n", __func__, model->train_tokens); - - if (file.fp) { - read_tensor(&file, model->tok_embeddings); - read_tensor(&file, model->norm); - read_tensor(&file, model->output); - - for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { - auto & layer = model->layers[i]; - - read_tensor(&file, layer.attention_norm); - read_tensor(&file, layer.wq); - read_tensor(&file, layer.wk); - read_tensor(&file, layer.wv); - read_tensor(&file, layer.wo); - read_tensor(&file, layer.ffn_norm); - read_tensor(&file, layer.w1); - read_tensor(&file, layer.w2); - read_tensor(&file, layer.w3); - } - - read_opt_context(&file, model->ctx, opt); - } - - return (file.fp != NULL); -} - -void print_sample_weights(TransformerWeights *w){ - printf("----- Quick print of first of the weight vales of all the variables\n"); - printf("%f\n", w->token_embedding_table[0]); - printf("%f\n", w->rms_att_weight[0]); - printf("%f\n", w->rms_ffn_weight[0]); - - printf("%f\n", w->wq[0]); - printf("%f\n", w->wk[0]); - printf("%f\n", w->wv[0]); - printf("%f\n", w->wo[0]); - printf("%f\n", w->w1[0]); - printf("%f\n", w->w2[0]); - printf("%f\n", w->w3[0]); - printf("%f\n", w->rms_att_weight[0]); - printf("%f\n", w->freq_cis_real[0]); - printf("%f\n", w->freq_cis_imag[0]); - printf("------------------------------------------------------------------\n"); - - -} - -void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * karpathy_weights){ - - int ct; - switch (gg_weights->n_dims){ - case 1: - ct = 0; - for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){ - float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0]); - *ptr = karpathy_weights[ct]; - } - case 2: - ct = 0; - for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) { - for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) { - // set_f32_2d(gg_weights, k, i, karpathy_weights[ct]); - float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1]); - *ptr = karpathy_weights[ct]; - ct++; - } - } - break; - case 3: - ct = 0; - for (int i2 = 0; i2 < gg_weights->ne[2]; i2++) { - for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) { - for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) { - // set_f32_3d(gg_weights, k, j, i, karpathy_weights[ct]); - float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1] + i2*gg_weights->nb[2]); - *ptr = karpathy_weights[ct]; - ct++; - } - } - } - break; - } - - // void set_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, float value) - // set_f32_2d(gg_weights, 142.0, 0, 0); - - // float p = get_f32_2d(gg_weights, 0, 0); - // print_row(gg_weights, 0); - // print_matrix(gg_weights); -} - -void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) { - struct llama_file file(filename, "wb"); - if (file.fp == NULL) { - return; - } - // print_sample_weights(w); - // write_magic - file.write_u32(LLAMA_FILE_MAGIC); // magic - file.write_u32(LLAMA_FILE_VERSION); // version - // write_hparams - file.write_u32(model->hparams.n_vocab); - file.write_u32(model->hparams.n_embd); - file.write_u32(model->hparams.n_mult); - file.write_u32(model->hparams.n_head); - file.write_u32(model->hparams.n_layer); - file.write_u32(model->hparams.n_rot); - file.write_u32(LLAMA_FTYPE_ALL_F32); - // write_vocab - uint32_t n_vocab = model->hparams.n_vocab; - for (uint32_t i = 0; i < n_vocab; i++) { - 
const auto & token_score = vocab->id_to_token.at(i); - file.write_u32((uint32_t) token_score.tok.size()); - file.write_raw(token_score.tok.data(), token_score.tok.size()); - file.write_raw(&token_score.score, sizeof(token_score.score)); - } - - // stuff AK weights into GG weights one by one. - // w->token_embedding_table -> model->tok_embeddings - // float* -> struct ggml_tensor - stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table); - print_row(model->tok_embeddings, 0); - - // stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight); - // stuff_karpathy_weights_into_gg(model->norm, w->freq_cis_real); // <<<<<<<<<< mostly wrong - // stuff_karpathy_weights_into_gg(model->norm, w->freq_cis_imag); // <<<<<<<<<< mostly wrong - - // for rms-att-weight - int row_length = model->hparams.n_embd; - for (uint32_t i = 0; i < model->hparams.n_layer; ++i){ - auto & layer = model->layers[i]; - // 2d - stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]); - stuff_karpathy_weights_into_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]); - stuff_karpathy_weights_into_gg(layer.wq , &w->wq[i*row_length]); - stuff_karpathy_weights_into_gg(layer.wk , &w->wk[i*row_length]); - stuff_karpathy_weights_into_gg(layer.wv , &w->wv[i*row_length]); - stuff_karpathy_weights_into_gg(layer.wo , &w->wo[i*row_length]); - stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length]); - stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*row_length]); - stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length]); - } - - // write tensors - write_tensor(&file, model->tok_embeddings); - write_tensor(&file, model->norm); - write_tensor(&file, model->output); - for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { - printf(" testing new here %d\n", i); - auto & layer = model->layers[i]; - - write_tensor(&file, layer.attention_norm); - write_tensor(&file, layer.wq); - write_tensor(&file, layer.wk); - write_tensor(&file, layer.wv); - write_tensor(&file, layer.wo); - write_tensor(&file, layer.ffn_norm); - write_tensor(&file, layer.w1); - write_tensor(&file, layer.w2); - write_tensor(&file, layer.w3); - } -} - - - -struct train_params { - const char * fn_vocab_model; - const char * fn_train_data; - const char * fn_checkpoint_in; - const char * fn_checkpoint_out; - const char * fn_model_out; - - uint32_t seed; - - int n_ctx; - int n_embd; - int n_mult; - int n_head; - int n_layer; - int n_rotmax; - - int n_threads; - int n_batch; - int n_examples; - int n_predict; - - int print_info_interval; - int print_details_interval; - - bool samples_start_after_nl; - bool use_adam; - bool use_flash; - bool use_scratch; - - // only adam - int warmup; - int cos_decay_steps; - float cos_decay_restart; - float cos_decay_alpha; - - int lbfgs_n_iter; - int adam_n_iter; - float adam_alpha; - float adam_decay; - - int mem_model_gb; - int mem_compute_gb; - int mem_compute0_gb; - int mem_compute1_gb; -}; - -struct train_params get_default_train_params() { - struct train_params params; - params.fn_vocab_model = "ggml-vic7b-uncensored-q4_0.bin"; - params.fn_train_data = "shakespeare.txt"; - params.fn_checkpoint_in = "checkpoint.bin"; - params.fn_checkpoint_out = "checkpoint.bin"; - params.fn_model_out = "ggml-checkpoint-f32.bin"; - - params.seed = -1; - - params.n_ctx = 128; - params.n_embd = 256; - params.n_mult = 256; - params.n_head = 8; - params.n_layer = 16; - params.n_rotmax = 64; - - params.n_threads = 6; - params.n_batch = 8; - params.n_examples = 8; - params.n_predict = 
1024; - - params.print_info_interval = 1; - params.print_details_interval = 2; - - params.samples_start_after_nl = false; - params.use_adam = true; - params.use_flash = true; - params.use_scratch = true; - - // only adam - params.warmup = 100; - params.cos_decay_steps = 1000; - params.cos_decay_restart = 1.1f; - params.cos_decay_alpha = 0.0f; - - params.lbfgs_n_iter = 16; - params.adam_n_iter = 16; - params.adam_alpha = 1e-3f; - params.adam_decay = 1e-3f; - - params.mem_model_gb = 2; - params.mem_compute_gb = 24; - params.mem_compute0_gb = 8; - params.mem_compute1_gb = 2; - - return params; -} - -void train_print_usage(int /*argc*/, char ** argv, const struct train_params * params) { - fprintf(stderr, "usage: %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " --vocab-model FNAME model path from which to load vocab (default '%s')\n", params->fn_vocab_model); - fprintf(stderr, " --train-data FNAME path from which to load training data (default '%s')\n", params->fn_train_data); - fprintf(stderr, " --checkpoint-in FNAME path from which to load training checkpoint (default '%s')\n", params->fn_checkpoint_in); - fprintf(stderr, " --checkpoint-out FNAME path to save training checkpoint (default '%s')\n", params->fn_checkpoint_out); - fprintf(stderr, " --model-out FNAME path to save ggml model (default '%s')\n", params->fn_model_out); - fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for -1)\n"); - fprintf(stderr, " -c N, --ctx N Context size used during training (default %d)\n", params->n_ctx); - fprintf(stderr, " --embd N Embedding size used for new models (default %d)\n", params->n_embd); - fprintf(stderr, " --mult N Mult size used for new models, influences feedforward size. (default %d)\n", params->n_mult); - fprintf(stderr, " --head N Number of heads for new models (default %d)\n", params->n_head); - fprintf(stderr, " --layer N Number of layers for new models (default %d)\n", params->n_layer); - fprintf(stderr, " --rotmax N Maximal number Rope dimensions for new models (default %d)\n", params->n_rotmax); - fprintf(stderr, " -t N, --threads N Number of threads (default %d)\n", params->n_threads); - fprintf(stderr, " -b N, --batch N Parallel batch size (default %d)\n", params->n_batch); - fprintf(stderr, " -n N, --examples N Number of examples to train (default %d)\n", params->n_examples); - fprintf(stderr, " --predict N Number of tokens to generate after training (default %d)\n", params->n_predict); - fprintf(stderr, " --print-info-interval N Print infos during training each N examples (default %d)\n", params->print_info_interval); - fprintf(stderr, " --print-details-interval N Print details during training each N examples (default %d)\n", params->print_details_interval); - fprintf(stderr, " --samples-after-nl Training samples start after newlines. (default %s)\n", params->samples_start_after_nl ? 
"on" : "off"); - fprintf(stderr, " --use-lbfgs Use LBFGS optimizer instead of default Adam\n"); - fprintf(stderr, " --use-adam Use Adam optimizer (default)\n"); - fprintf(stderr, " --no-flash Don't use flash attention.\n"); - fprintf(stderr, " --use-flash Use flash attention (default)\n"); - fprintf(stderr, " --no-scratch Don't use scratch buffers\n"); - fprintf(stderr, " --use-scratch Use scratch buffers (default)\n"); - fprintf(stderr, " --warmup N Number of warmup steps (default %d)\n", params->warmup); - fprintf(stderr, " --cos-decay-steps N Number of cosine decay steps (default %d)\n", params->cos_decay_steps); - fprintf(stderr, " --cos-decay-restart N Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart); - fprintf(stderr, " --cos-decay-alpha N Cosine decay alpha (default %f)\n", params->cos_decay_alpha); - fprintf(stderr, " --lbfgs-iter N Maximum number of LBFGS optimization iterations for each batch (default %d)\n", params->lbfgs_n_iter); - fprintf(stderr, " --adam-iter N Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter); - fprintf(stderr, " --adam-alpha N Adam learning rate alpha (default %f)\n", params->adam_alpha); - fprintf(stderr, " --adam-decay N AdamW weight decay. Values greater zero enable AdamW instead of regular Adam. (default %f)\n", params->adam_decay); - fprintf(stderr, " --mem-model N Memory to allocate for model and cache in gigabytes. (default %d)\n", params->mem_model_gb); - fprintf(stderr, " --mem-compute N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb); - fprintf(stderr, " --mem-compute0 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute0_gb); - fprintf(stderr, " --mem-compute1 N Memory to allocate for compute in gigabytes. 
(default %d)\n", params->mem_compute1_gb); - fprintf(stderr, "\n"); -} - -bool train_params_parse(int argc, char ** argv, struct train_params * params) { - bool invalid_param = false; - std::string arg; - struct train_params default_params = get_default_train_params(); - const std::string arg_prefix = "--"; - - for (int i = 1; i < argc; i++) { - arg = argv[i]; - if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { - std::replace(arg.begin(), arg.end(), '_', '-'); - } - - if (arg == "--vocab-model") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->fn_vocab_model = argv[i]; - } else if (arg == "--train-data") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->fn_train_data = argv[i]; - } else if (arg == "--checkpoint-in") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->fn_checkpoint_in = argv[i]; - } else if (arg == "--checkpoint-out") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->fn_checkpoint_out = argv[i]; - } else if (arg == "--model-out") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->fn_model_out = argv[i]; - } else if (arg == "-s" || arg == "--seed") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->seed = std::stoi(argv[i]); - } else if (arg == "-c" || arg == "--ctx") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_ctx = std::stoi(argv[i]); - } else if (arg == "--embd") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_embd = std::stoi(argv[i]); - } else if (arg == "--mult") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_mult = std::stoi(argv[i]); - } else if (arg == "--head") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_head = std::stoi(argv[i]); - } else if (arg == "--layer") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_layer = std::stoi(argv[i]); - } else if (arg == "--rotmax") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_rotmax = std::stoi(argv[i]); - } else if (arg == "-t" || arg == "--threads") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_threads = std::stoi(argv[i]); - } else if (arg == "-b" || arg == "--batch") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_batch = std::stoi(argv[i]); - } else if (arg == "-n" || arg == "--examples") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_examples = std::stoi(argv[i]); - } else if (arg == "--predict") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_predict = std::stoi(argv[i]); - } else if (arg == "--print-info-interval") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->print_info_interval = std::stoi(argv[i]); - } else if (arg == "--print-details-interval") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->print_details_interval = std::stoi(argv[i]); - } else if (arg == "--samples-after-nl") { - params->samples_start_after_nl = true; - } else if (arg == "--use-lbfgs") { - params->use_adam = false; - } else if (arg == "--use-adam") { - params->use_adam = true; - } else if (arg == "--no-flash") { - params->use_flash = false; - } else if (arg == "--use-flash") { - params->use_flash = true; - } else if (arg == "--no-scratch") { - params->use_scratch = false; - } else if (arg == "--use-scratch") { - params->use_scratch = true; - } else if (arg == "--warmup") { - if (++i >= argc) { - invalid_param = true; - break; - } - 
params->warmup = std::stoi(argv[i]); - } else if (arg == "--cos-decay-steps") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->cos_decay_steps = std::stof(argv[i]); - } else if (arg == "--cos-decay-restart") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->cos_decay_restart = std::stof(argv[i]); - } else if (arg == "--cos-decay-alpha") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->cos_decay_alpha = std::stof(argv[i]); - } else if (arg == "--lbfgs-iter") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->lbfgs_n_iter = std::stoi(argv[i]); - } else if (arg == "--adam-iter") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->adam_n_iter = std::stoi(argv[i]); - } else if (arg == "--adam-alpha") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->adam_alpha = std::stof(argv[i]); - } else if (arg == "--adam-decay") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->adam_decay = std::stof(argv[i]); - } else if (arg == "--mem-model") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->mem_model_gb = std::stoi(argv[i]); - } else if (arg == "--mem-compute") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->mem_compute_gb = std::stoi(argv[i]); - } else if (arg == "--mem-compute0") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->mem_compute0_gb = std::stoi(argv[i]); - } else if (arg == "--mem-compute1") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->mem_compute1_gb = std::stoi(argv[i]); - } else if (arg == "-h" || arg == "--help") { - train_print_usage(argc, argv, &default_params); - exit(0); - } else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - train_print_usage(argc, argv, &default_params); - exit(1); - } - } - if (invalid_param) { - fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); - train_print_usage(argc, argv, &default_params); - exit(1); - } - - return true; -} - -typedef struct { - int dim; // transformer dimension - int hidden_dim; // for ffn layers - int n_layers; // number of layers - int n_heads; // number of query heads - int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery) - int vocab_size; // vocabulary size, usually 256 (byte-level) - int seq_len; // max sequence length -} Config; - - - -void malloc_weights(TransformerWeights* w, Config* p) { - // we calloc instead of malloc to keep valgrind happy - w->token_embedding_table = new float[p->vocab_size * p->dim]();//calloc(p->vocab_size * p->dim, sizeof(float)); - printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); - - w->rms_att_weight = new float[p->n_layers * p->dim](); //calloc(p->n_layers * p->dim, sizeof(float)); - printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim); - - w->rms_ffn_weight = new float[p->n_layers * p->dim](); //calloc(p->n_layers * p->dim, sizeof(float)); - printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim); - - w->wq = new float[p->n_layers * p->dim * p->dim](); //calloc(p->n_layers * p->dim * p->dim, sizeof(float)); - printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * 
p->dim * p->dim); - - w->wk = new float[p->n_layers * p->dim * p->dim](); //calloc(p->n_layers * p->dim * p->dim, sizeof(float)); - printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); - - w->wv = new float[p->n_layers * p->dim * p->dim](); //calloc(p->n_layers * p->dim * p->dim, sizeof(float)); - printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); - - w->wo = new float[p->n_layers * p->dim * p->dim](); //calloc(p->n_layers * p->dim * p->dim, sizeof(float)); - printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); - - w->w1 = new float[p->n_layers * p->hidden_dim * p->dim](); //calloc(p->n_layers * p->hidden_dim * p->dim, sizeof(float)); - printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); - - w->w2 = new float[p->n_layers * p->hidden_dim * p->dim](); //calloc(p->n_layers * p->dim * p->hidden_dim, sizeof(float)); - printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); - - w->w3 = new float[p->n_layers * p->hidden_dim * p->dim](); //calloc(p->n_layers * p->hidden_dim * p->dim, sizeof(float)); - printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); - - w->rms_final_weight = new float[p->dim](); //calloc(p->dim, sizeof(float)); - printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim); - - w->freq_cis_real = new float[p->seq_len * p->dim / 2](); //calloc(p->seq_len * p->dim / 2, sizeof(float)); - printf("[%s:AK] Allocating [%d] float space for w->freq_cis_real\n",__func__,p->seq_len * p->dim / 2); - - w->freq_cis_imag = new float[p->seq_len * p->dim / 2](); //calloc(p->seq_len * p->dim / 2, sizeof(float)); - printf("[%s:AK] Allocating [%d] float space for w->freq_cis_imag\n\n",__func__,p->seq_len * p->dim / 2); - - // ensure all mallocs went fine - // if (!w->token_embedding_table || !w->rms_att_weight || !w->rms_ffn_weight - // || !w->wq || !w->wk || !w->wv || !w->wo || !w->w1 || !w->w2 || !w->w3 || - // !w->rms_final_weight || !w->freq_cis_real || !w->freq_cis_imag) { - // printf("malloc failed!\n"); - // exit(1); - // } -}
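checkpoint_init_weights below repeats one fread-and-compare per tensor; the pattern reduces to a single checked read per flat float array. A minimal sketch of that helper (an illustrative refactor, not code from the patch):

```cpp
#include <cstdio>
#include <cstddef>

// Illustrative helper: read exactly `count` floats or report failure,
// folding the repeated fread(...) != static_cast<size_t>(...) checks into one place.
static bool read_floats(float * dst, size_t count, std::FILE * f) {
    return std::fread(dst, sizeof(float), count, f) == count;
}
// e.g. if (!read_floats(w->wq, (size_t) p->n_layers * p->dim * p->dim, f)) return 1;
```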
-int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) { - if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1; - if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1; - if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1; - if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1; - if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1; - if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1; - if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1; - if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1; - if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1; - if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1; - if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1; - int head_size = p->dim / p->n_heads; - if (fread(w->freq_cis_real, sizeof(float), p->seq_len * head_size / 2, f) != static_cast<size_t>(p->seq_len * head_size / 2)) return 1; - if (fread(w->freq_cis_imag, sizeof(float), p->seq_len * head_size / 2, f) != static_cast<size_t>(p->seq_len * head_size / 2)) return 1; - return 0; -} - -void free_weights(TransformerWeights* w) { - free(w->token_embedding_table); - free(w->rms_att_weight); - free(w->rms_ffn_weight); - free(w->wq); - free(w->wk); - free(w->wv); - free(w->wo); - free(w->w1); - free(w->w2); - free(w->w3); - free(w->rms_final_weight); - free(w->freq_cis_real); - free(w->freq_cis_imag); -} - - -void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) { - const auto & hparams = model->hparams; - - const uint32_t n_layer = hparams.n_layer; - - struct random_normal_distribution rnd; - init_random_normal_distribution(&rnd, seed, mean, std, min, max); - - randomize_tensor_normal(model->tok_embeddings, &rnd); - randomize_tensor_normal(model->norm, &rnd); - randomize_tensor_normal(model->output, &rnd); - - for (uint32_t i = 0; i < n_layer; ++i) { - auto & layer = model->layers[i]; - randomize_tensor_normal(layer.attention_norm, &rnd); - - randomize_tensor_normal(layer.wq, &rnd); - randomize_tensor_normal(layer.wk, &rnd); - randomize_tensor_normal(layer.wv, &rnd); - randomize_tensor_normal(layer.wo, &rnd); - - randomize_tensor_normal(layer.ffn_norm, &rnd); - - randomize_tensor_normal(layer.w1, &rnd); - randomize_tensor_normal(layer.w2, &rnd); - randomize_tensor_normal(layer.w3, &rnd); - } -} - -int main(int argc, char ** argv) { - Config config; - TransformerWeights weights; - { - FILE *file = fopen("/Users/aniket/Projects/karpathy/llama2.c/out/model.bin", "rb"); - if (!file) { - printf("Unable to open the checkpoint file %s!\n", "/Users/aniket/Projects/karpathy/llama2.c/out/model.bin"); - return 1; - } - else{ - printf("model file opened for reading...\n"); - } - // read in the config header - if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; } - printf("config file read..\n"); - - // read in the Transformer weights - malloc_weights(&weights, &config); - printf("reading the opened model file...\n"); - if(checkpoint_init_weights(&weights, &config, file)) { return 1; } - - fclose(file); - - }
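The block above leans on llama2.c's checkpoint layout: the file opens with the seven 32-bit ints of Config, in declaration order, and the raw float32 tensors follow immediately. A self-contained probe of just the header, under that layout assumption:

```cpp
#include <cstdio>
#include <cstdint>

// llama2.c checkpoint header: seven int32 fields in this order
// (layout assumed to match the Config struct above).
struct Llama2cHeader {
    int32_t dim, hidden_dim, n_layers, n_heads, n_kv_heads, vocab_size, seq_len;
};

static bool read_llama2c_header(const char * path, Llama2cHeader * out) {
    std::FILE * f = std::fopen(path, "rb");
    if (!f) return false;
    const bool ok = std::fread(out, sizeof(*out), 1, f) == 1;
    std::fclose(f); // the float32 weight tensors start right after this header
    return ok;
}
```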
////////////// Loads default train parameters /////////////////////////// - struct train_params params = get_default_train_params(); - printf("params.n_ctx %d\n", params.n_ctx); - printf("params.n_embd %d\n", params.n_embd); - printf("params.fn_vocab_model %s\n", params.fn_vocab_model); - - if (!train_params_parse(argc, argv, &params)) { - return 1; - } - - // Seed not needed here. - // if (params.seed == LLAMA_DEFAULT_SEED) { - // params.seed = time(NULL); - // } - // printf("[%s]: seed: %u\n", __func__, params.seed); - // srand(params.seed); - //////////////////////////////////////////////////////////////////////////////////// - - struct llama_context_params llama_params = llama_context_default_params(); - llama_params.vocab_only = true; - - struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, llama_params); - struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params); - - struct llama_vocab vocab; - { - std::vector<const char *> strings; - std::vector<float> scores; - int n_vocab = llama_n_vocab(lctx); - printf("nvocab = %d\n", n_vocab); - strings.resize(n_vocab, NULL); - scores.resize(n_vocab, 0); - n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab); - GGML_ASSERT(n_vocab == llama_n_vocab(lctx)); - vocab.id_to_token.resize(n_vocab); - for (int i=0; i<n_vocab; ++i) { - std::string tok = std::string(strings[i]); - float score = scores[i]; - vocab.id_to_token[i].tok = tok; - vocab.id_to_token[i].score = score; - vocab.token_to_id.emplace(tok, i); - } - } - - std::vector<llama_token> train_tokens; - if (tokenize_file(lctx, params.fn_train_data, train_tokens) < 0) { - fprintf(stderr, "%s: failed to tokenize file '%s'\n", __func__, params.fn_train_data); - } - printf("%s: number of training tokens: %d\n", __func__, (int) train_tokens.size()); - - struct my_llama_model model; - - model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx); - model.hparams.n_ctx = params.n_ctx; - model.hparams.n_embd = config.dim; //params.n_embd; - model.hparams.n_mult = params.n_mult; - model.hparams.n_head = config.n_heads; //params.n_head; - model.hparams.n_layer = config.n_layers; //params.n_layer; - model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head); - - print_params(&model.hparams); - struct ggml_init_params lcparams; - lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb); - lcparams.mem_buffer = NULL; - lcparams.no_alloc = false; - - model.ctx = ggml_init(lcparams); - - init_model(&model); - // randomize_model(&model, params.seed, 0.0f, 1.0f, -1.0f, +1.0f); - save_as_llama_model(&vocab, &model, &weights, "ak_model.bin"); - - // llama_free(lctx); - llama_free_model(lmodel); - ggml_free(model.ctx); - // free(&weights); - return 0; -} From 5520876c3c46477fdf3e55bea4aa0fc95490ce15 Mon Sep 17 00:00:00 2001 From: Aniket Date: Tue, 8 Aug 2023 14:28:34 -0400 Subject: [PATCH 20/30] cleaning up Makefile empty space before merge --- Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/Makefile b/Makefile index d06843d00f3b1..37cf8a880d4a8 100644 --- a/Makefile +++ b/Makefile @@ -396,7 +396,6 @@ train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratc convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp build-info.h ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) - build-info.h: $(wildcard .git/index) scripts/build-info.sh @sh scripts/build-info.sh > $@.tmp @if ! 
cmp -s $@.tmp $@; then \ From d14c066f0c34d723f20706dd8eea64934e5c2561 Mon Sep 17 00:00:00 2001 From: Aniket Date: Tue, 8 Aug 2023 20:40:17 -0400 Subject: [PATCH 21/30] cleaning up to remove spaces and satisfy failed checks --- .../convert-llama2c-to-ggml.cpp | 43 +++++++++---------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index f7b144eed5f45..2f2e9a1591d7e 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -55,17 +55,17 @@ void malloc_weights(TransformerWeights* w, Config* p) { // we calloc instead of malloc to keep valgrind happy w->token_embedding_table = new float[p->vocab_size * p->dim](); printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); - + w->rms_att_weight = new float[p->n_layers * p->dim](); printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim); w->rms_ffn_weight = new float[p->n_layers * p->dim](); printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim); - w->wq = new float[p->n_layers * p->dim * p->dim](); + w->wq = new float[p->n_layers * p->dim * p->dim](); printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); - w->wk = new float[p->n_layers * p->dim * p->dim](); + w->wk = new float[p->n_layers * p->dim * p->dim](); printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); w->wv = new float[p->n_layers * p->dim * p->dim](); @@ -200,7 +200,7 @@ struct my_llama_model { struct train_params { const char * fn_vocab_model; const char * fn_llama2c_model; - const char * fn_llama2c_output_model; + const char * fn_llama2c_output_model; const char * fn_train_data; const char * fn_checkpoint_in; const char * fn_checkpoint_out; @@ -295,7 +295,6 @@ void init_model(struct my_llama_model * model) { printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w1 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer); printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w2 for [%d] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer); printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w3 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer); - ggml_set_name(model->tok_embeddings, "tok_embeddings.weight"); ggml_set_name(model->norm, "norm.weight"); @@ -506,7 +505,7 @@ void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * kar case 2: ct = 0; for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) { - for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) { + for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) { float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1]); *ptr = karpathy_weights[ct]; ct++; @@ -517,14 +516,14 @@ void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * kar ct = 0; for (int i2 = 0; i2 < gg_weights->ne[2]; i2++) { for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) { - for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) { + for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) { float * ptr 
= (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1] + i2*gg_weights->nb[2]); *ptr = karpathy_weights[ct]; ct++; } } } - break; + break; } } @@ -559,8 +558,8 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod // float* -> struct ggml_tensor stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table); stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table); - - stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight); + + stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight); //print_row(model->norm, 0); // for rms-att-weight @@ -568,7 +567,7 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod const auto & hparams = model->hparams; //int n_ff = model->hparams.n_embd; int n_ff = get_n_ff(&hparams); - + for (uint32_t i = 0; i < model->hparams.n_layer; ++i){ auto & layer = model->layers[i]; // 1d @@ -580,7 +579,7 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod stuff_karpathy_weights_into_gg(layer.wk , &w->wk[i*row_length*row_length]); stuff_karpathy_weights_into_gg(layer.wv , &w->wv[i*row_length*row_length]); stuff_karpathy_weights_into_gg(layer.wo , &w->wo[i*row_length*row_length]); - + stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length*n_ff]); stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]); stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]); @@ -589,7 +588,7 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod write_tensor(&file, model->tok_embeddings); write_tensor(&file, model->norm); write_tensor(&file, model->output); // ? - for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { + for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { auto & layer = model->layers[i]; write_tensor(&file, layer.attention_norm); @@ -660,8 +659,8 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params) fprintf(stderr, "options:\n"); fprintf(stderr, " -h, --help show this help message and exit\n"); fprintf(stderr, " --vocab-model FNAME model path from which to load vocab (default '%s')\n", params->fn_vocab_model); - fprintf(stderr, " --llama2c-model FNAME model path from which to load Karpathy's llama2.c model\n"); - fprintf(stderr, " --llama2c-output-model FNAME model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model); + fprintf(stderr, " --llama2c-model FNAME model path from which to load Karpathy's llama2.c model\n"); + fprintf(stderr, " --llama2c-output-model FNAME model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model); fprintf(stderr, "\n"); } @@ -688,13 +687,13 @@ bool params_parse(int argc, char ** argv, struct train_params * params) { invalid_param = true; break; } - params->fn_llama2c_model = argv[i]; + params->fn_llama2c_model = argv[i]; } else if (arg == "--llama2c-output-model") { if (++i >= argc) { invalid_param = true; break; } - params->fn_llama2c_output_model = argv[i]; + params->fn_llama2c_output_model = argv[i]; } else if (arg == "-h" || arg == "--help") { print_usage(argc, argv, &default_params); exit(0); @@ -720,7 +719,7 @@ int main(int argc, char ** argv) { } Config config; TransformerWeights weights; - { + { FILE *file = fopen(params.fn_llama2c_model, "rb"); if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; } // read in the config header @@ -741,7 +740,7 @@ int 
main(int argc, char ** argv) { { std::vector<const char *> strings; std::vector<float> scores; - int n_vocab = llama_n_vocab(lctx); + int n_vocab = llama_n_vocab(lctx); strings.resize(n_vocab, NULL); scores.resize(n_vocab, 0); n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab); @@ -749,7 +748,7 @@ int main(int argc, char ** argv) { vocab.id_to_token.resize(n_vocab); for (int i=0; i<n_vocab; ++i) { std::string tok = std::string(strings[i]); float score = scores[i]; vocab.id_to_token[i].tok = tok; vocab.id_to_token[i].score = score; vocab.token_to_id.emplace(tok, i); } From: Aniket Date: Wed, 9 Aug 2023 09:04:24 -0400 Subject: [PATCH 22/30] adding add_subdirectory in examples dir CMakeLists.txt --- examples/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index a7b26776ad355..b5d9bb29e6ad8 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -42,6 +42,7 @@ else() add_subdirectory(benchmark) add_subdirectory(baby-llama) add_subdirectory(train-text-from-scratch) + add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(simple) add_subdirectory(embd-input) if (LLAMA_METAL) From 7d0404c3931b3550aa3f8d5f44e6add8fadecba4 Mon Sep 17 00:00:00 2001 From: Aniket Date: Wed, 9 Aug 2023 09:05:37 -0400 Subject: [PATCH 23/30] adding newline in readme --- examples/convert-llama2c-to-ggml/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/convert-llama2c-to-ggml/README.md b/examples/convert-llama2c-to-ggml/README.md index 18293947a498c..da9b51a51495e 100644 --- a/examples/convert-llama2c-to-ggml/README.md +++ b/examples/convert-llama2c-to-ggml/README.md @@ -10,4 +10,4 @@ To convert the model first download the models from the [llama2.c](https://github Now you can use the model with command: -`$ ./main -m <ggml output model path> -p "One day, Lily met a Shoggoth" -n 500 -c 256 -eps 1e-5` \ No newline at end of file +`$ ./main -m <ggml output model path> -p "One day, Lily met a Shoggoth" -n 500 -c 256 -eps 1e-5` From afb8f6ee6ab7c7c5bd3a6aa37f4790b49d605140 Mon Sep 17 00:00:00 2001 From: Aniket Date: Wed, 9 Aug 2023 09:06:10 -0400 Subject: [PATCH 24/30] removing 1 whitespace --- examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index 2f2e9a1591d7e..bdad519953bf2 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -562,7 +562,7 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight); //print_row(model->norm, 0); - // for rms-att-weight + // for rms-att-weight int row_length = model->hparams.n_embd; const auto & hparams = model->hparams; //int n_ff = model->hparams.n_embd;
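The weight-copy context above relies on the offsets the series converged on (i*row_length*row_length for the attention matrices, i*row_length*n_ff for the FFN ones); they follow from llama2.c storing every layer's matrix back-to-back in one flat float array. A toy illustration of that indexing (shapes chosen arbitrarily):

```cpp
#include <vector>
#include <cstddef>

int main() {
    const size_t n_layers = 2, dim = 3;                 // arbitrary toy shapes
    std::vector<float> wq(n_layers * dim * dim, 0.0f);  // all layers, contiguous
    for (size_t i = 0; i < n_layers; ++i) {
        float * layer = wq.data() + i * dim * dim;      // layer i's [dim x dim] block
        layer[0] = 1.0f;                                // element (0,0) of that block
    }
    // both layers' blocks were addressed independently:
    return (wq[0] == 1.0f && wq[dim * dim] == 1.0f) ? 0 : 1;
}
```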
From 40a51ec6a30485d2aea48188334d9670a94745ae Mon Sep 17 00:00:00 2001 From: Aniket Date: Wed, 9 Aug 2023 09:06:47 -0400 Subject: [PATCH 25/30] adding CMakeLists.txt file in the conversion script directory --- examples/convert-llama2c-to-ggml/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 examples/convert-llama2c-to-ggml/CMakeLists.txt diff --git a/examples/convert-llama2c-to-ggml/CMakeLists.txt b/examples/convert-llama2c-to-ggml/CMakeLists.txt new file mode 100644 index 0000000000000..cfd9eb87512f6 --- /dev/null +++ b/examples/convert-llama2c-to-ggml/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET convert-llama2c-to-ggml) +add_executable(${TARGET} convert-llama2c-to-ggml.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) \ No newline at end of file From a3fa0abaaa40a6e5b6cb0362ed1bde435af3776f Mon Sep 17 00:00:00 2001 From: Aniket Date: Wed, 9 Aug 2023 09:16:30 -0400 Subject: [PATCH 26/30] forgot to add newline --- examples/convert-llama2c-to-ggml/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/convert-llama2c-to-ggml/CMakeLists.txt b/examples/convert-llama2c-to-ggml/CMakeLists.txt index cfd9eb87512f6..e262d44f98496 100644 --- a/examples/convert-llama2c-to-ggml/CMakeLists.txt +++ b/examples/convert-llama2c-to-ggml/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET convert-llama2c-to-ggml) add_executable(${TARGET} convert-llama2c-to-ggml.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) \ No newline at end of file +target_compile_features(${TARGET} PRIVATE cxx_std_11) From db5d7ab3f71d2de74c130893b1860cb7b1383a73 Mon Sep 17 00:00:00 2001 From: Aniket Date: Thu, 10 Aug 2023 09:49:14 -0400 Subject: [PATCH 27/30] Adding more information in the README to use conversion tool. --- examples/convert-llama2c-to-ggml/README.md | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/examples/convert-llama2c-to-ggml/README.md b/examples/convert-llama2c-to-ggml/README.md index da9b51a51495e..868f57d6dc97b 100644 --- a/examples/convert-llama2c-to-ggml/README.md +++ b/examples/convert-llama2c-to-ggml/README.md @@ -1,13 +1,26 @@ ## Convert llama2.c model to ggml -This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. +This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default. To convert the model first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository: `$ make -j` -`$ ./convert-llama2c-to-ggml --vocab-model <ggml-vocab.bin> --llama2c-model <llama2.c model path> --llama2c-output-model <ggml output model path>` +After successful compilation, the following usage options are available: +``` +usage: ./convert-llama2c-to-ggml [options] + +options: + -h, --help show this help message and exit + --copy-vocab-from-model FNAME model path from which to copy vocab (default 'models/ggml-vocab.bin') + --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model + --llama2c-output-model FNAME model path to save the converted llama2.c model (default ak_llama_model.bin') +``` + +An example command is as follows: + +`$ ./convert-llama2c-to-ggml --copy-vocab-from-model <ggml-vocab.bin> --llama2c-model <llama2.c model path> --llama2c-output-model <ggml output model path>` + +Now you can use the model with a command like: `$ ./main -m <ggml output model path> -p "One day, Lily met a Shoggoth" -n 500 -c 256 -eps 1e-5` From aab15de46610f7af8d481ca11297a00fe9536f09 Mon Sep 17 00:00:00 2001 From: Aniket Date: Thu, 10 Aug 2023 09:53:21 -0400 Subject: [PATCH 28/30] command-line argument changes for clarity. Changed the parameter to load the vocab model to --copy-vocab-from-model Made the default vocab model load from models/ggml-vocab.bin Made sure a llama2.c model is provided, or else exit with usage. Updated README.md for better use. 
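The diff that follows implements the required-argument check described above by setting a reqd_param_found flag during the scan and verifying it afterwards. Stripped to its essentials (illustrative names, not the patch's exact code), the pattern is:

```cpp
#include <cstdio>
#include <cstring>

int main(int argc, char ** argv) {
    const char * model_path = nullptr; // stays null unless the flag appears
    for (int i = 1; i < argc; i++) {
        if (std::strcmp(argv[i], "--llama2c-model") == 0 && i + 1 < argc) {
            model_path = argv[++i];
        }
    }
    if (model_path == nullptr) { // the required argument was never seen
        std::fprintf(stderr, "error: --llama2c-model is required\n");
        return 1;
    }
    std::printf("converting %s\n", model_path);
    return 0;
}
```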
--- .../convert-llama2c-to-ggml.cpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index bdad519953bf2..3bd388635fe86 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -605,7 +605,7 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod struct train_params get_default_train_params() { struct train_params params; - params.fn_vocab_model = "ggml-vic7b-uncensored-q4_0.bin"; + params.fn_vocab_model = "models/ggml-vocab.bin"; params.fn_llama2c_output_model = "ak_llama_model.bin"; params.fn_train_data = "shakespeare.txt"; params.fn_checkpoint_in = "checkpoint.bin"; @@ -658,14 +658,15 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params) fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " --vocab-model FNAME model path from which to load vocab (default '%s')\n", params->fn_vocab_model); - fprintf(stderr, " --llama2c-model FNAME model path from which to load Karpathy's llama2.c model\n"); + fprintf(stderr, " --copy-vocab-from-model FNAME model path from which to copy vocab (default '%s')\n", params->fn_vocab_model); + fprintf(stderr, " --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model\n"); fprintf(stderr, " --llama2c-output-model FNAME model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model); fprintf(stderr, "\n"); } bool params_parse(int argc, char ** argv, struct train_params * params) { bool invalid_param = false; + bool reqd_param_found = false; std::string arg; struct train_params default_params = get_default_train_params(); const std::string arg_prefix = "--"; @@ -676,7 +677,7 @@ bool params_parse(int argc, char ** argv, struct train_params * params) { std::replace(arg.begin(), arg.end(), '_', '-'); } - if (arg == "--vocab-model") { + if (arg == "--copy-vocab-from-model") { if (++i >= argc) { invalid_param = true; break; @@ -687,6 +688,7 @@ bool params_parse(int argc, char ** argv, struct train_params * params) { invalid_param = true; break; } + reqd_param_found = true; params->fn_llama2c_model = argv[i]; } else if (arg == "--llama2c-output-model") { if (++i >= argc) { @@ -708,6 +710,11 @@ bool params_parse(int argc, char ** argv, struct train_params * params) { print_usage(argc, argv, &default_params); exit(1); } + if (!reqd_param_found){ + fprintf(stderr, "error: please specify a llama2.c .bin file to be converted with argument --llama2c-model\n"); + print_usage(argc, argv, &default_params); + exit(1); + } return true; } From d2b95e7e70d259cce689cd329a59bed6f89eb2a4 Mon Sep 17 00:00:00 2001 From: Johannes Rudolph Date: Thu, 10 Aug 2023 16:17:26 +0200 Subject: [PATCH 29/30] refactor vocab loading into its own method --- .../convert-llama2c-to-ggml.cpp | 53 ++++++++++--------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index 3bd388635fe86..28759ae39c01f 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -491,6 +491,32 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * 
tensor) { file->write_raw(tensor->data, ggml_nbytes(tensor)); } +void load_vocab(const char *filename, struct llama_vocab *vocab) { + struct llama_context_params llama_params = llama_context_default_params(); + llama_params.vocab_only = true; + + struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params); + struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params); + + std::vector<const char *> strings; + std::vector<float> scores; + int n_vocab = llama_n_vocab(lctx); + strings.resize(n_vocab, NULL); + scores.resize(n_vocab, 0); + n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab); + GGML_ASSERT(n_vocab == llama_n_vocab(lctx)); + vocab->id_to_token.resize(n_vocab); + for (int i=0; i<n_vocab; ++i) { + std::string tok = std::string(strings[i]); + float score = scores[i]; + vocab->id_to_token[i].tok = tok; + vocab->id_to_token[i].score = score; + vocab->token_to_id.emplace(tok, i); + } + llama_free(lctx); + llama_free_model(lmodel); +} + void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * karpathy_weights){ int ct; switch (gg_weights->n_dims){ @@ -737,30 +763,9 @@ int main(int argc, char ** argv) { fclose(file); } - struct llama_context_params llama_params = llama_context_default_params(); - llama_params.vocab_only = true; - - struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, llama_params); - struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params); - struct llama_vocab vocab; - { - std::vector<const char *> strings; - std::vector<float> scores; - int n_vocab = llama_n_vocab(lctx); - strings.resize(n_vocab, NULL); - scores.resize(n_vocab, 0); - n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab); - GGML_ASSERT(n_vocab == llama_n_vocab(lctx)); - vocab.id_to_token.resize(n_vocab); - for (int i=0; i<n_vocab; ++i) { - std::string tok = std::string(strings[i]); - float score = scores[i]; - vocab.id_to_token[i].tok = tok; - vocab.id_to_token[i].score = score; - vocab.token_to_id.emplace(tok, i); - } - } + struct llama_vocab vocab; + load_vocab(params.fn_vocab_model, &vocab); From: Johannes Rudolph Date: Thu, 10 Aug 2023 16:32:44 +0200 Subject: [PATCH 30/30] also support loading from llama2.c vocabulary --- .../convert-llama2c-to-ggml.cpp | 82 +++++++++++++------ 1 file changed, 57 insertions(+), 25 deletions(-) diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index 28759ae39c01f..1a238c4dd945a 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -438,6 +438,11 @@ struct llama_file { read_raw(&ret, sizeof(ret)); return ret; } + std::float_t read_f32() { + std::float_t ret; + read_raw(&ret, sizeof(ret)); + return ret; + } std::string read_string(std::uint32_t len) { std::vector<char> chars(len); read_raw(chars.data(), len); return std::string(chars.data(), len); } @@ -491,30 +496,57 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { file->write_raw(tensor->data, ggml_nbytes(tensor)); } -void load_vocab(const char *filename, struct llama_vocab *vocab) { - struct llama_context_params llama_params = llama_context_default_params(); - llama_params.vocab_only = true; - - struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params); - struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params); - - std::vector<const char *> strings; - std::vector<float> scores; - int n_vocab = llama_n_vocab(lctx); - strings.resize(n_vocab, NULL); - scores.resize(n_vocab, 0); - n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab); - GGML_ASSERT(n_vocab == llama_n_vocab(lctx)); - vocab->id_to_token.resize(n_vocab); - for (int i=0; i<n_vocab; ++i) { - std::string tok = std::string(strings[i]); - float score = scores[i]; - vocab->id_to_token[i].tok = tok; - vocab->id_to_token[i].score = score; - vocab->token_to_id.emplace(tok, i); +bool is_ggml_file(const char *filename) { + llama_file file(filename, "rb"); + if (file.size < 4) { + return false; + } + uint32_t magic = file.read_u32(); + return magic == LLAMA_FILE_MAGIC; +}
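That four-byte probe is the entire format heuristic: ggml model files open with a fixed magic value, while a llama2.c tokenizer file opens with a plain little-endian int32, so anything failing the comparison falls through to the llama2.c path. A standalone version of the same check (the concrete magic constant is an assumption; the tree's LLAMA_FILE_MAGIC is authoritative):

```cpp
#include <cstdio>
#include <cstdint>

// Returns true when `path` starts with the expected 4-byte magic.
// 0x67676a74u ("ggjt") is assumed here; use the tree's LLAMA_FILE_MAGIC.
static bool has_ggml_magic(const char * path, uint32_t expected = 0x67676a74u) {
    std::FILE * f = std::fopen(path, "rb");
    if (!f) return false;
    uint32_t magic = 0;
    const bool ok = std::fread(&magic, sizeof(magic), 1, f) == 1;
    std::fclose(f);
    return ok && magic == expected;
}
```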
+ +void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) { + // heuristic to infer whether vocab is from ggml or from llama2.c vocabulary + if (is_ggml_file(filename)) { + + struct llama_context_params llama_params = llama_context_default_params(); + llama_params.vocab_only = true; + + struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params); + struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params); + + std::vector<const char *> strings; + std::vector<float> scores; + int n_vocab = llama_n_vocab(lctx); + strings.resize(n_vocab, NULL); + scores.resize(n_vocab, 0); + n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab); + GGML_ASSERT(n_vocab == llama_n_vocab(lctx)); + vocab->id_to_token.resize(n_vocab); + for (int i=0; i<n_vocab; ++i) { + std::string tok = std::string(strings[i]); + float score = scores[i]; + vocab->id_to_token[i].tok = tok; + vocab->id_to_token[i].score = score; + vocab->token_to_id.emplace(tok, i); + } + llama_free(lctx); + llama_free_model(lmodel); + } else { // assume llama2.c vocabulary + printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename); + llama_file file(filename, "rb"); + uint32_t n_vocab = config->vocab_size; + /* uint32_t max_token_length = */ file.read_u32(); // unused + vocab->id_to_token.resize(n_vocab); + for (uint32_t i=0; i<n_vocab; ++i) { + float_t score = file.read_f32(); + uint32_t len = file.read_u32(); + std::string tok = file.read_string(len); + vocab->id_to_token[i].tok = tok; + vocab->id_to_token[i].score = score; + vocab->token_to_id.emplace(tok, i); + } + } } @@ -684,7 +716,7 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params) { fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " --copy-vocab-from-model FNAME model path from which to copy vocab (default '%s')\n", params->fn_vocab_model); + fprintf(stderr, " --copy-vocab-from-model FNAME llama2.c vocabulary or ggml model path from which to copy vocab (default '%s')\n", params->fn_vocab_model); fprintf(stderr, " --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model\n"); fprintf(stderr, " --llama2c-output-model FNAME model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model); fprintf(stderr, "\n"); @@ -764,7 +796,7 @@ int main(int argc, char ** argv) { } struct llama_vocab vocab; - load_vocab(params.fn_vocab_model, &vocab); + load_vocab(params.fn_vocab_model, &config, &vocab); struct my_llama_model model; model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
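The series ends mid-hunk here, but the else-branch above fixes the second input format: llama2.c's tokenizer file is one int32 max_token_length followed by vocab_size records of (float32 score, uint32 length, raw token bytes). A minimal standalone reader under that layout assumption:

```cpp
#include <cstdio>
#include <cstdint>
#include <string>
#include <vector>

// Reads a llama2.c tokenizer file: int32 max_token_length, then
// vocab_size records of (float32 score, uint32 len, len raw bytes).
// vocab_size comes from the checkpoint's config header, not this file.
static bool read_llama2c_vocab(const char * path, uint32_t vocab_size,
                               std::vector<std::string> & tokens,
                               std::vector<float> & scores) {
    std::FILE * f = std::fopen(path, "rb");
    if (!f) return false;
    int32_t max_token_length = 0;
    bool ok = std::fread(&max_token_length, sizeof(max_token_length), 1, f) == 1;
    for (uint32_t i = 0; ok && i < vocab_size; i++) {
        float score = 0.0f;
        uint32_t len = 0;
        ok = std::fread(&score, sizeof(score), 1, f) == 1
          && std::fread(&len, sizeof(len), 1, f) == 1;
        if (ok) {
            std::string tok(len, '\0');
            ok = len == 0 || std::fread(&tok[0], 1, len, f) == len;
            if (ok) { tokens.push_back(tok); scores.push_back(score); }
        }
    }
    std::fclose(f);
    return ok;
}
```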