-#include "llama.h"
 #include <cstdio>
 #include <cstring>
 #include <iostream>
+#include <memory>
 #include <string>
 #include <vector>
 
+#include "llama.h"
+
+// Add a message to `messages` and store its content in `owned_content`.
+// `role` must point to storage that outlives `messages` (a string literal),
+// since llama_chat_message stores the raw pointer without copying it.
+static void add_message(const char * role, const std::string &text,
+                        std::vector<llama_chat_message> &messages,
+                        std::vector<std::unique_ptr<char[]>> &owned_content) {
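+    // llama_chat_message only stores raw pointers, so copy the text into
+    // heap storage that `owned_content` keeps alive alongside `messages`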
+    auto content = std::make_unique<char[]>(text.size() + 1);
+    std::strcpy(content.get(), text.c_str());
+    messages.push_back({role, content.get()});
+    owned_content.push_back(std::move(content));
+}
+
+// Function to apply the chat template and resize `formatted` if needed
+static int apply_chat_template(const llama_model *model,
+                               const std::vector<llama_chat_message> &messages,
+                               std::vector<char> &formatted, bool append) {
+    int result = llama_chat_apply_template(model, nullptr, messages.data(),
+                                           messages.size(), append,
+                                           formatted.data(), formatted.size());
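+    // a result larger than the buffer is the size the template actually
+    // needs, so grow `formatted` and apply the template a second time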
+    if (result > static_cast<int>(formatted.size())) {
+        formatted.resize(result);
+        result = llama_chat_apply_template(model, nullptr, messages.data(),
+                                           messages.size(), append,
+                                           formatted.data(), formatted.size());
+    }
+
+    return result;
+}
+
+// Function to tokenize the prompt
+static int tokenize_prompt(const llama_model *model, const std::string &prompt,
+                           std::vector<llama_token> &prompt_tokens) {
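+    // with a NULL output buffer llama_tokenize returns the negative of the
+    // required token count, so negate it to size the token vector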
+    const int n_prompt_tokens = -llama_tokenize(
+        model, prompt.c_str(), prompt.size(), NULL, 0, true, true);
+    prompt_tokens.resize(n_prompt_tokens);
+    if (llama_tokenize(model, prompt.c_str(), prompt.size(),
+                       prompt_tokens.data(), prompt_tokens.size(), true,
+                       true) < 0) {
+        GGML_ABORT("failed to tokenize the prompt\n");
+        return -1;
+    }
+
+    return n_prompt_tokens;
+}
+
+// Check if we have enough space in the context to evaluate this batch
+static int check_context_size(const llama_context *ctx,
+                              const llama_batch &batch) {
+    const int n_ctx = llama_n_ctx(ctx);
+    const int n_ctx_used = llama_get_kv_cache_used_cells(ctx);
+    if (n_ctx_used + batch.n_tokens > n_ctx) {
+        printf("\033[0m\n");
+        fprintf(stderr, "context size exceeded\n");
+        return 1;
+    }
+
+    return 0;
+}
+
+// convert the token to a string
+static int convert_token_to_string(const llama_model *model,
+                                   const llama_token token_id,
+                                   std::string &piece) {
+    char buf[256];
+    int n = llama_token_to_piece(model, token_id, buf, sizeof(buf), 0, true);
+    if (n < 0) {
+        GGML_ABORT("failed to convert token to piece\n");
+        return 1;
+    }
+
+    piece = std::string(buf, n);
+    return 0;
+}
+
+static void print_word_and_concatenate_to_response(const std::string &piece,
+                                                   std::string &response) {
+    printf("%s", piece.c_str());
+    fflush(stdout);
+    response += piece;
+}
+
+// helper function to evaluate a prompt and generate a response
+static int generate(const llama_model *model, llama_sampler *smpl,
+                    llama_context *ctx, const std::string &prompt,
+                    std::string &response) {
+    std::vector<llama_token> prompt_tokens;
+    const int n_prompt_tokens = tokenize_prompt(model, prompt, prompt_tokens);
+    if (n_prompt_tokens < 0) {
+        return 1;
+    }
+
+    // prepare a batch for the prompt
+    llama_batch batch =
+        llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
+    llama_token new_token_id;
+    while (true) {
+        // stop if the next batch would not fit into the context window
+        if (check_context_size(ctx, batch)) {
+            return 1;
+        }
+        if (llama_decode(ctx, batch)) {
+            GGML_ABORT("failed to decode\n");
+            return 1;
+        }
+
+        // sample the next token and check whether it is the end of generation
+        new_token_id = llama_sampler_sample(smpl, ctx, -1);
+        if (llama_token_is_eog(model, new_token_id)) {
+            break;
+        }
+
+        std::string piece;
+        if (convert_token_to_string(model, new_token_id, piece)) {
+            return 1;
+        }
+
+        print_word_and_concatenate_to_response(piece, response);
+
+        // prepare the next batch with the sampled token
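+        // (llama_batch_get_one keeps a pointer to new_token_id rather than
+        // copying the token, so the variable must outlive the next decode)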
+        batch = llama_batch_get_one(&new_token_id, 1);
+    }
+
+    return 0;
+}
+
 static void print_usage(int, char ** argv) {
     printf("\nexample usage:\n");
     printf("\n%s -m model.gguf [-c context_size] [-ngl n_gpu_layers]\n", argv[0]);
@@ -66,6 +188,7 @@ int main(int argc, char ** argv) {
     llama_model_params model_params = llama_model_default_params();
     model_params.n_gpu_layers = ngl;
 
+    // This prints ........
     llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);
     if (!model) {
         fprintf(stderr, "%s: error: unable to load model\n", __func__);
@@ -88,107 +211,49 @@ int main(int argc, char ** argv) {
     llama_sampler_chain_add(smpl, llama_sampler_init_min_p(0.05f, 1));
     llama_sampler_chain_add(smpl, llama_sampler_init_temp(0.8f));
     llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
-
-    // helper function to evaluate a prompt and generate a response
-    auto generate = [&](const std::string & prompt) {
-        std::string response;
-
-        // tokenize the prompt
-        const int n_prompt_tokens = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);
-        std::vector<llama_token> prompt_tokens(n_prompt_tokens);
-        if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), llama_get_kv_cache_used_cells(ctx) == 0, true) < 0) {
-            GGML_ABORT("failed to tokenize the prompt\n");
-        }
-
-        // prepare a batch for the prompt
-        llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
-        llama_token new_token_id;
-        while (true) {
-            // check if we have enough space in the context to evaluate this batch
-            int n_ctx = llama_n_ctx(ctx);
-            int n_ctx_used = llama_get_kv_cache_used_cells(ctx);
-            if (n_ctx_used + batch.n_tokens > n_ctx) {
-                printf("\033[0m\n");
-                fprintf(stderr, "context size exceeded\n");
-                exit(0);
-            }
-
-            if (llama_decode(ctx, batch)) {
-                GGML_ABORT("failed to decode\n");
-            }
-
-            // sample the next token
-            new_token_id = llama_sampler_sample(smpl, ctx, -1);
-
-            // is it an end of generation?
-            if (llama_token_is_eog(model, new_token_id)) {
-                break;
-            }
-
-            // convert the token to a string, print it and add it to the response
-            char buf[256];
-            int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true);
-            if (n < 0) {
-                GGML_ABORT("failed to convert token to piece\n");
-            }
-            std::string piece(buf, n);
-            printf("%s", piece.c_str());
-            fflush(stdout);
-            response += piece;
-
-            // prepare the next batch with the sampled token
-            batch = llama_batch_get_one(&new_token_id, 1);
-        }
-
-        return response;
-    };
-
     std::vector<llama_chat_message> messages;
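+    // owns the copied message contents referenced by `messages`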
+    std::vector<std::unique_ptr<char[]>> owned_content;
     std::vector<char> formatted(llama_n_ctx(ctx));
     int prev_len = 0;
     while (true) {
         // get user input
         printf("\033[32m> \033[0m");
         std::string user;
         std::getline(std::cin, user);
-
         if (user.empty()) {
             break;
         }
 
-        // add the user input to the message list and format it
-        messages.push_back({"user", strdup(user.c_str())});
-        int new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size());
-        if (new_len > (int)formatted.size()) {
-            formatted.resize(new_len);
-            new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size());
-        }
+        // Add user input to messages
+        add_message("user", user, messages, owned_content);
+        int new_len = apply_chat_template(model, messages, formatted, true);
         if (new_len < 0) {
             fprintf(stderr, "failed to apply the chat template\n");
             return 1;
         }
 
-        // remove previous messages to obtain the prompt to generate the response
-        std::string prompt(formatted.begin() + prev_len, formatted.begin() + new_len);
+        // remove previous messages to obtain the prompt to generate the
+        // response
+        std::string prompt(formatted.begin() + prev_len,
+                           formatted.begin() + new_len);
 
         // generate a response
         printf("\033[33m");
-        std::string response = generate(prompt);
+        std::string response;
+        if (generate(model, smpl, ctx, prompt, response)) {
+            return 1;
+        }
+
         printf("\n\033[0m");
 
-        // add the response to the messages
-        messages.push_back({"assistant", strdup(response.c_str())});
-        prev_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), false, nullptr, 0);
+        // Add the response to the messages
+        add_message("assistant", response, messages, owned_content);
+        prev_len = apply_chat_template(model, messages, formatted, false);
         if (prev_len < 0) {
             fprintf(stderr, "failed to apply the chat template\n");
             return 1;
         }
     }
 
-    // free resources
-    for (auto & msg : messages) {
-        free(const_cast<char *>(msg.content));
-    }
     llama_sampler_free(smpl);
     llama_free(ctx);
     llama_free_model(model);