 
 #include "ggml.h"
 #ifdef GGML_USE_CUBLAS
-#include "ggml-cuda.h"
+#include <cuda_runtime.h>
 #include <cufile.h>
+#include "ggml-cuda.h"
 #endif
 
 #include <array>
@@ -784,7 +785,18 @@ struct llama_model_loader {
             size_t offset = lt.shards.at(0).file_off;
             size_t actual_size;
             void * buf = ggml_cuda_pool_malloc(lt.size, &actual_size);
-            cuFileRead(file.cf_handle, buf, lt.size, offset, 0);
+
+            if (file.cf_need_workaround) { // load to host, then copy to device
+                void * buf_host = ggml_cuda_host_malloc(lt.size);
+                file.seek(offset, SEEK_SET);
+                file.read_raw(buf_host, lt.size);
+                cudaMemcpy(buf, buf_host, lt.size, cudaMemcpyHostToDevice);
+                cudaDeviceSynchronize();
+                ggml_cuda_host_free(buf_host);
+            } else { // load directly to device
+                cuFileRead(file.cf_handle, buf, lt.size, offset, 0);
+            }
+
             lt.data = (uint8_t *) buf;
         }
 #endif // GGML_USE_CUBLAS
@@ -974,26 +986,6 @@ static void llama_model_load_internal(
     ml->calc_sizes(&ctx_size, &mmapped_size);
     fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
 
-    // print memory requirements
-    {
-        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-
-        // this is the total memory required to run the inference
-        const size_t mem_required =
-            ctx_size +
-            mmapped_size +
-            MEM_REQ_SCRATCH0().at(model.type) +
-            MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at(model.type);
-
-        // this is the memory required by one llama_state
-        const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF().at(model.type);
-
-        fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
-                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
-    }
-
     // create the ggml context
     {
         lctx.model.buf.resize(ctx_size);
@@ -1015,6 +1007,7 @@ static void llama_model_load_internal(
     }
 
     // prepare memory for the weights
+    size_t vram_total = 0;
     {
         const uint32_t n_embd  = hparams.n_embd;
         const uint32_t n_layer = hparams.n_layer;
@@ -1024,7 +1017,13 @@ static void llama_model_load_internal(
 
         model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
         model.norm   = ml->get_tensor("norm.weight",   {n_embd},          GGML_BACKEND_CPU);
-        model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
+        ggml_backend backend_output;
+        if (n_gpu_layers > int(n_layer)) {
+            backend_output = GGML_BACKEND_CUDA;
+        } else {
+            backend_output = GGML_BACKEND_CPU;
+        }
+        model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
 
         model.layers.resize(n_layer);
         const int i_gpu_start = n_layer - n_gpu_layers;
@@ -1046,51 +1045,56 @@ static void llama_model_load_internal(
             layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd,   n_ff}, backend);
             layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", {  n_ff, n_embd}, backend);
             layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd,   n_ff}, backend);
+            if (backend == GGML_BACKEND_CUDA) {
+                vram_total += ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk)
+                    + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm)
+                    + ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+            }
         }
     }
 
     ml->done_getting_tensors();
 
-    // populate `tensors_by_name`
-    for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-        model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
-    }
-
-    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
-
-    model.mapping = std::move(ml->mapping);
-#ifdef GGML_USE_CUBLAS
+    // print memory requirements
     {
-        // const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
-        const int n_gpu = 0;
+        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
 
-        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+        // this is the total memory required to run the inference
+        const size_t mem_required =
+            ctx_size +
+            mmapped_size - vram_total + // weights in VRAM not in memory
+            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH1().at(model.type) +
+            MEM_REQ_EVAL().at(model.type);
 
-        size_t vram_total = 0;
+        // this is the memory required by one llama_state
+        const size_t mem_required_state =
+            scale*MEM_REQ_KV_SELF().at(model.type);
 
-        for (int i = 0; i < n_gpu; ++i) {
-            const auto & layer = model.layers[i];
+        fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
+                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
 
-            ggml_cuda_transform_tensor(layer.attention_norm); vram_total += ggml_nbytes(layer.attention_norm);
-            ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
-            ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
-            ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
-            ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
-            ggml_cuda_transform_tensor(layer.ffn_norm); vram_total += ggml_nbytes(layer.ffn_norm);
-            ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
-            ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
-            ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
-        }
+#ifdef GGML_USE_CUBLAS
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
         if (n_gpu_layers > (int) hparams.n_layer) {
             fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
-            ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
         }
-
         fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
-    }
 #else
     (void) n_gpu_layers;
 #endif
+    }
+
+    // populate `tensors_by_name`
+    for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+        model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
+    }
+
+    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+
+    model.mapping = std::move(ml->mapping);
 
     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
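
For reference, the host-staging fallback added in the first hunk (the `cf_need_workaround` branch) reduces to the following standalone sketch using only the CUDA runtime API. The helper name and the plain `FILE *` I/O are illustrative assumptions, not part of this PR:

#include <cstddef>
#include <cstdio>
#include <cuda_runtime.h>

// Illustrative sketch: read `size` bytes at `offset` from `f` into the device
// buffer `dst` by staging through pinned host memory, mirroring the
// cf_need_workaround branch above. Returns true on success.
static bool read_to_device_via_host(std::FILE * f, long offset, std::size_t size, void * dst) {
    void * host_buf = nullptr;
    if (cudaMallocHost(&host_buf, size) != cudaSuccess) { // pinned host allocation
        return false;
    }
    const bool ok =
        std::fseek(f, offset, SEEK_SET) == 0 &&
        std::fread(host_buf, 1, size, f) == size &&
        cudaMemcpy(dst, host_buf, size, cudaMemcpyHostToDevice) == cudaSuccess &&
        cudaDeviceSynchronize() == cudaSuccess;
    cudaFreeHost(host_buf);
    return ok;
}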