diff --git a/engine/controllers/models.cc b/engine/controllers/models.cc
index de14886da..3f91da848 100644
--- a/engine/controllers/models.cc
+++ b/engine/controllers/models.cc
@@ -184,8 +184,8 @@ void Models::ListModel(
         obj["model"] = model_entry.model;
         obj["model"] = model_entry.model;
         auto es = model_service_->GetEstimation(model_entry.model);
-        if (es.has_value()) {
-          obj["recommendation"] = hardware::ToJson(es.value());
+        if (es.has_value() && !!es.value()) {
+          obj["recommendation"] = hardware::ToJson(*(es.value()));
         }
         data.append(std::move(obj));
         yaml_handler.Reset();
diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc
index d81a9b649..7f79ddaf7 100644
--- a/engine/services/model_service.cc
+++ b/engine/services/model_service.cc
@@ -341,9 +341,10 @@ cpp::result ModelService::HandleDownloadUrlAsync(
   return download_service_->AddTask(downloadTask, on_finished);
 }
 
-cpp::result<hardware::Estimation, std::string> ModelService::GetEstimation(
-    const std::string& model_handle, const std::string& kv_cache, int n_batch,
-    int n_ubatch) {
+cpp::result<std::optional<hardware::Estimation>, std::string>
+ModelService::GetEstimation(const std::string& model_handle,
+                            const std::string& kv_cache, int n_batch,
+                            int n_ubatch) {
   namespace fs = std::filesystem;
   namespace fmu = file_manager_utils;
   cortex::db::Models modellist_handler;
@@ -918,7 +919,7 @@ cpp::result<bool, std::string> ModelService::GetModelStatus(
   if (status == drogon::k200OK) {
     return true;
   } else {
-    CTL_ERR("Model failed to get model status with status code: " << status);
+    CTL_WRN("Model failed to get model status with status code: " << status);
     return cpp::fail("Model failed to get model status: " +
                      data["message"].asString());
   }
@@ -1146,13 +1147,13 @@ ModelService::MayFallbackToCpu(const std::string& model_path, int ngl,
                               .free_vram_MiB = free_vram_MiB};
   auto es = hardware::EstimateLLaMACppRun(model_path, rc);
 
-  if (es.gpu_mode.vram_MiB > free_vram_MiB && is_cuda) {
-    CTL_WRN("Not enough VRAM - " << "required: " << es.gpu_mode.vram_MiB
+  if (!!es && (*es).gpu_mode.vram_MiB > free_vram_MiB && is_cuda) {
+    CTL_WRN("Not enough VRAM - " << "required: " << (*es).gpu_mode.vram_MiB
                                  << ", available: " << free_vram_MiB);
   }
 
-  if (es.cpu_mode.ram_MiB > free_ram_MiB) {
-    CTL_WRN("Not enough RAM - " << "required: " << es.cpu_mode.ram_MiB
+  if (!!es && (*es).cpu_mode.ram_MiB > free_ram_MiB) {
+    CTL_WRN("Not enough RAM - " << "required: " << (*es).cpu_mode.ram_MiB
                                 << ", available: " << free_ram_MiB);
   }
 
diff --git a/engine/services/model_service.h b/engine/services/model_service.h
index 7235d5a0a..e2638fd1f 100644
--- a/engine/services/model_service.h
+++ b/engine/services/model_service.h
@@ -97,7 +97,7 @@ class ModelService {
 
   bool HasModel(const std::string& id) const;
 
-  cpp::result<hardware::Estimation, std::string> GetEstimation(
+  cpp::result<std::optional<hardware::Estimation>, std::string> GetEstimation(
       const std::string& model_handle, const std::string& kv_cache = "f16",
       int n_batch = 2048, int n_ubatch = 2048);
 
diff --git a/engine/utils/hardware/gguf/gguf_file.h b/engine/utils/hardware/gguf/gguf_file.h
index 1263debf2..361668242 100644
--- a/engine/utils/hardware/gguf/gguf_file.h
+++ b/engine/utils/hardware/gguf/gguf_file.h
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include <optional>
 
 #ifdef _WIN32
 #include
@@ -23,13 +24,14 @@
 
 #include "ggml.h"
 #include "utils/string_utils.h"
+#include "utils/logging_utils.h"
 
 // #define GGUF_LOG(msg)                                                  \
 //   do {                                                                 \
 //     std::cout << __FILE__ << "(@" << __LINE__ << "): " << msg << '\n'; \
 //   } while (false)
 
-#define GGUF_LOG(msg)
+#define GGUF_LOG(msg)
 namespace hardware {
 #undef min
 #undef max
@@ -169,8 +171,6 @@ inline std::string to_string(const GGUFMetadataKV& kv) {
   return "Invalid type ";
 }
 
-
-
 struct GGUFTensorInfo {
   /* Basic */
   std::string name;
@@ -208,14 +208,14 @@ struct GGUFHelper {
         CreateFileA(file_path.c_str(), GENERIC_READ, FILE_SHARE_READ, nullptr,
                     OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr);
     if (file_handle == INVALID_HANDLE_VALUE) {
-      std::cout << "Failed to open file" << std::endl;
+      CTL_INF("Failed to open file: " << file_path);
       return false;
     }
     // Get the file size
     LARGE_INTEGER file_size_struct;
     if (!GetFileSizeEx(file_handle, &file_size_struct)) {
       CloseHandle(file_handle);
-      std::cout << "Failed to open file" << std::endl;
+      CTL_INF("Failed to get file size: " << file_path);
       return false;
     }
     file_size = static_cast(file_size_struct.QuadPart);
@@ -225,7 +225,7 @@ struct GGUFHelper {
         CreateFileMappingA(file_handle, nullptr, PAGE_READONLY, 0, 0, nullptr);
     if (file_mapping == nullptr) {
       CloseHandle(file_handle);
-      std::cout << "Failed to create file mapping" << std::endl;
+      CTL_INF("Failed to create file mapping: " << file_path);
       return false;
     }
 
@@ -235,7 +235,7 @@ struct GGUFHelper {
     if (data == nullptr) {
       CloseHandle(file_mapping);
       CloseHandle(file_handle);
-      std::cout << "Failed to map file" << std::endl;
+      CTL_INF("Failed to map file:: " << file_path);
       return false;
     }
 
@@ -479,10 +479,12 @@ struct GGUFFile {
   double model_bits_per_weight;
 };
 
-inline GGUFFile ParseGgufFile(const std::string& path) {
+inline std::optional<GGUFFile> ParseGgufFile(const std::string& path) {
   GGUFFile gf;
   GGUFHelper h;
-  h.OpenAndMMap(path);
+  if(!h.OpenAndMMap(path)) {
+    return std::nullopt;
+  }
 
   GGUFMagic magic = h.Read();
   // GGUF_LOG("magic: " << magic);
diff --git a/engine/utils/hardware/gguf/gguf_file_estimate.h b/engine/utils/hardware/gguf/gguf_file_estimate.h
index fde0b0ac0..12a7e72e1 100644
--- a/engine/utils/hardware/gguf/gguf_file_estimate.h
+++ b/engine/utils/hardware/gguf/gguf_file_estimate.h
@@ -62,20 +62,22 @@ inline float GetQuantBit(const std::string& kv_cache_t) {
   return 16.0;
 }
 
-inline Estimation EstimateLLaMACppRun(const std::string& file_path,
-                                      const RunConfig& rc) {
+inline std::optional<Estimation> EstimateLLaMACppRun(
+    const std::string& file_path, const RunConfig& rc) {
   Estimation res;
   // token_embeddings_size = n_vocab * embedding_length * 2 * quant_bit/16 bytes
   //RAM = token_embeddings_size + ((total_ngl-ngl) >=1 ? Output_layer_size + (total_ngl - ngl - 1 ) / (total_ngl-1) * (total_file_size - token_embeddings_size - Output_layer_size) : 0 ) (bytes)
   // VRAM = total_file_size - RAM (bytes)
 
   auto gf = ParseGgufFile(file_path);
+  if (!gf)
+    return std::nullopt;
   int32_t embedding_length = 0;
   int64_t n_vocab = 0;
   int32_t num_block = 0;
   int32_t total_ngl = 0;
   auto file_size = std::filesystem::file_size(file_path);
-  for (auto const& kv : gf.header.metadata_kv) {
+  for (auto const& kv : (*gf).header.metadata_kv) {
     if (kv.key.find("embedding_length") != std::string::npos) {
       embedding_length = std::any_cast(kv.value);
     } else if (kv.key == "tokenizer.ggml.tokens") {
@@ -92,7 +94,7 @@ inline Estimation EstimateLLaMACppRun(const std::string& file_path,
   int32_t quant_bit_in = 0;
   int32_t quant_bit_out = 0;
 
-  for (auto const& ti : gf.tensor_infos) {
+  for (auto const& ti : (*gf).tensor_infos) {
     if (ti->name == "output.weight") {
       quant_bit_out = GetQuantBit(ti->type);
       // std::cout << ti->type << std::endl;