@@ -2027,6 +2027,21 @@ void llama_context::perf_reset() {
     n_reused = 0;
 }
 
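+// aggregate memory use per backend buffer type: model weights, the memory module (e.g. KV cache) as "context", and scheduler compute buffers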
+std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
+    for (const auto & buft_size : model.memory_breakdown()) {
+        ret[buft_size.first].model += buft_size.second;
+    }
+    for (const auto & buft_size : memory->memory_breakdown()) {
+        ret[buft_size.first].context += buft_size.second;
+    }
+    for (const auto & backend_ptr : backends) {
+        ggml_backend_t backend = backend_ptr.get();
+        ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
+    }
+    return ret;
+}
+
 //
 // training
 //
@@ -2765,6 +2780,142 @@ void llama_perf_context_reset(llama_context * ctx) {
     ctx->perf_reset();
 }
 
+void llama_memory_breakdown_print(const struct llama_context * ctx) {
+    const std::vector<ggml_backend_dev_t> & devices = ctx->get_model().devices;
+
+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
+
+    std::vector<std::array<std::string, 9>> table_data;
+    table_data.reserve(devices.size());
+    const std::string template_header = "%s: | %s | %s %s %s %s %s %s %s |\n";
+    const std::string template_gpu    = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
+    const std::string template_other  = "%s: | %s | %s %s %s = %s + %s + %s %s |\n";
+
+    table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});
+
+    constexpr size_t MiB = 1024 * 1024;
+    const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
+
+    // track seen buffer types to avoid double counting:
+    std::set<ggml_backend_buffer_type_t> seen_buffer_types;
+
+    // accumulative memory breakdown for each device and for host:
+    std::vector<llama_memory_breakdown_data> mb_dev(devices.size());
+    llama_memory_breakdown_data mb_host;
+
+    for (const auto & buft_mb : memory_breakdown) {
+        ggml_backend_buffer_type_t buft = buft_mb.first;
+        const llama_memory_breakdown_data & mb = buft_mb.second;
+        if (ggml_backend_buft_is_host(buft)) {
+            mb_host.model += mb.model;
+            mb_host.context += mb.context;
+            mb_host.compute += mb.compute;
+            seen_buffer_types.insert(buft);
+            continue;
+        }
+        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
+        if (dev) {
+            int i_dev = -1;
+            for (size_t i = 0; i < devices.size(); i++) {
+                if (devices[i] == dev) {
+                    i_dev = i;
+                    break;
+                }
+            }
+            if (i_dev != -1) {
+                mb_dev[i_dev].model += mb.model;
+                mb_dev[i_dev].context += mb.context;
+                mb_dev[i_dev].compute += mb.compute;
+                seen_buffer_types.insert(buft);
+                continue;
+            }
+        }
+    }
+
+    // print memory breakdown for each device:
+    for (size_t i = 0; i < devices.size(); i++) {
+        ggml_backend_dev_t dev = devices[i];
+        llama_memory_breakdown_data mb = mb_dev[i];
+
+        const std::string name = ggml_backend_dev_name(dev);
+        std::string desc = ggml_backend_dev_description(dev);
+        for (const std::string & prefix : desc_prefixes_strip) {
+            if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
+                desc = desc.substr(prefix.length());
+            }
+        }
+
+        size_t free, total;
+        ggml_backend_dev_memory(dev, &free, &total);
+
+        const size_t self = mb.model + mb.context + mb.compute;
+        const size_t unaccounted = total - self - free;
+
+        table_data.push_back({
+            template_gpu,
+            " - " + name + " (" + desc + ")",
+            std::to_string(total / MiB),
+            std::to_string(free / MiB),
+            std::to_string(self / MiB),
+            std::to_string(mb.model / MiB),
+            std::to_string(mb.context / MiB),
+            std::to_string(mb.compute / MiB),
+            std::to_string(unaccounted / MiB)});
+    }
+
+    // print memory breakdown for host:
+    {
+        const size_t self = mb_host.model + mb_host.context + mb_host.compute;
+        table_data.push_back({
+            template_other,
+            " - Host",
+            "", // total
+            "", // free
+            std::to_string(self / MiB),
+            std::to_string(mb_host.model / MiB),
+            std::to_string(mb_host.context / MiB),
+            std::to_string(mb_host.compute / MiB),
+            ""}); // unaccounted
+    }
+
+    // print memory breakdown for all remaining buffer types:
+    for (const auto & buft_mb : memory_breakdown) {
+        ggml_backend_buffer_type_t buft = buft_mb.first;
+        const llama_memory_breakdown_data & mb = buft_mb.second;
+        if (seen_buffer_types.count(buft) == 1) {
+            continue;
+        }
+        const std::string name = ggml_backend_buft_name(buft);
+        const size_t self = mb.model + mb.context + mb.compute;
+        table_data.push_back({
+            template_other,
+            " - " + name,
+            "", // total
+            "", // free
+            std::to_string(self / MiB),
+            std::to_string(mb.model / MiB),
+            std::to_string(mb.context / MiB),
+            std::to_string(mb.compute / MiB),
+            ""}); // unaccounted
+        seen_buffer_types.insert(buft);
+    }
+
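+    // pad each column to its maximum width: left-align the name column (j == 1), right-align the numeric columns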
+    for (size_t j = 1; j < table_data[0].size(); j++) {
+        size_t max_len = 0;
+        for (const auto & td : table_data) {
+            max_len = std::max(max_len, td[j].length());
+        }
+        for (auto & td : table_data) {
+            td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
+        }
+    }
+    for (const auto & td : table_data) {
+        LLAMA_LOG_INFO(td[0].c_str(),
+            __func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
+            td[6].c_str(), td[7].c_str(), td[8].c_str());
+    }
+}
+
 //
 // training
 //