@@ -707,6 +707,9 @@ struct llama_model_loader {

     struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
         struct ggml_tensor * tensor;
+        if (backend != GGML_BACKEND_CPU) {
+            ggml_set_no_alloc(ggml_ctx, true);
+        }
         if (lt.ne.size() == 2) {
             tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
         } else {
@@ -716,6 +719,9 @@ struct llama_model_loader {
         ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor

+        if (backend != GGML_BACKEND_CPU) {
+            ggml_set_no_alloc(ggml_ctx, use_mmap);
+        }
         tensor->backend = backend;
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
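
These two hunks toggle ggml's no_alloc flag around tensor creation: for a non-CPU tensor the context records only metadata and leaves the data pointer NULL for the backend-specific loader to fill in, and the flag is then restored to use_mmap so that CPU tensors still get context-allocated storage when mmap is off. For reference, a standalone sketch of the no_alloc pattern (not part of this commit; the pool size and tensor shape are arbitrary):

    #include "ggml.h"

    // Sketch: metadata-only tensor creation with ggml's no_alloc flag.
    static void no_alloc_example(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,   // arbitrary pool size for this example
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        ggml_set_no_alloc(ctx, true);   // subsequent tensors reserve no data space in the pool
        struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
        // t->data is NULL here; storage has to be attached separately (an mmap'd page,
        // a malloc'd staging buffer, or a device-side copy) before the tensor is used.
        ggml_set_no_alloc(ctx, false);  // later tensors allocate from the pool again

        (void) t;
        ggml_free(ctx);
    }
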
@@ -731,6 +737,7 @@ struct llama_model_loader {
     void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
         size_t data_size = 0;
         size_t prefetch_size = 0;
+        size_t lock_size = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
             if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
@@ -752,20 +759,48 @@ struct llama_model_loader {

         size_t done_size = 0;
         for (llama_load_tensor & lt : tensors_map.tensors) {
-            if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
-                continue;
-            }
             if (progress_callback) {
                 progress_callback((float) done_size / data_size, progress_callback_user_data);
             }
             LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
             lt.data = (uint8_t *) lt.ggml_tensor->data;
+
+            // allocate temp buffer if not using mmap
+            if (!use_mmap && lt.data == NULL) {
+                lt.data = (uint8_t *)malloc(ggml_nbytes(lt.ggml_tensor));
+            }
+
             load_data_for(lt);
-            lt.ggml_tensor->data = lt.data;
-            done_size += lt.size;
-            if (use_mmap && lmlock) {
-                lmlock->grow_to(done_size);
+
+            switch (lt.ggml_tensor->backend) {
+                case GGML_BACKEND_CPU:
+                    lt.ggml_tensor->data = lt.data;
+                    if (use_mmap && lmlock) {
+                        lock_size += lt.size;
+                        lmlock->grow_to(lock_size);
+                    }
+                    break;
+#if defined(GGML_USE_CUBLAS)
+                case GGML_BACKEND_GPU:
+                case GGML_BACKEND_GPU_SPLIT:
+                    ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
+                    if (!use_mmap) {
+                        free(lt.data);
+                    }
+                    break;
+#elif defined(GGML_USE_CLBLAST)
+                case GGML_BACKEND_GPU:
+                    ggml_cl_transform_tensor(lt.data, lt.ggml_tensor);
+                    if (!use_mmap) {
+                        free(lt.data);
+                    }
+                    break;
+#endif
+                default:
+                    continue;
             }
+
+            done_size += lt.size;
         }
     }

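
The loading loop now handles every backend in one pass: tensor bytes are staged through the mmap'd file or a temporary malloc'd buffer, CPU tensors keep the staged pointer (with the mlock region grown by the running lock_size), and GPU tensors are handed to the CUDA/OpenCL transform, after which the malloc'd staging copy is freed. The progress_callback it reports through is the one a client installs on the context parameters; a minimal sketch of wiring that up (callback body and model path are illustrative, field names as in the llama.h of this era):

    #include <stdio.h>
    #include "llama.h"

    // Sketch: a loading-progress callback; load_all_data drives it with values in [0, 1].
    static void print_progress(float progress, void * user_data) {
        (void) user_data;
        fprintf(stderr, "\rloading: %3.0f%%", progress * 100.0f);
    }

    int main(void) {
        struct llama_context_params params = llama_context_default_params();
        params.progress_callback           = print_progress;
        params.progress_callback_user_data = NULL;
        // "model.bin" is a placeholder path; model loading reports progress through
        // the callback while tensors are read and offloaded to their backends.
        struct llama_context * ctx = llama_init_from_file("model.bin", params);
        if (ctx) {
            llama_free(ctx);
        }
        return 0;
    }
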
@@ -1141,7 +1176,7 @@ static void llama_model_load_internal(
             if (backend == GGML_BACKEND_GPU) {
                 vram_weights +=
                     ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
-                    ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+                    ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
                     ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
             }
         }
@@ -1196,58 +1231,14 @@ static void llama_model_load_internal(
         model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
     }

-    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
-
 #if defined(GGML_USE_CUBLAS)
     {
         ggml_cuda_set_tensor_split(tensor_split);
-
-        size_t done_size = 0;
-        size_t data_size = 0;
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            data_size += lt.size;
-            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
-                done_size += lt.size;
-            }
-        }
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            ggml_backend backend = lt.ggml_tensor->backend;
-            if (backend != GGML_BACKEND_GPU && backend != GGML_BACKEND_GPU_SPLIT) {
-                continue;
-            }
-            if (progress_callback) {
-                progress_callback((float) done_size / data_size, progress_callback_user_data);
-            }
-            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
-            done_size += lt.size;
-        }
     }
-#elif defined(GGML_USE_CLBLAST)
-    {
-        size_t done_size = 0;
-        size_t data_size = 0;
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            data_size += lt.size;
-            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
-                done_size += lt.size;
-            }
-        }
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            if (lt.ggml_tensor->backend != GGML_BACKEND_GPU) {
-                continue;
-            }
-            if (progress_callback) {
-                progress_callback((float) done_size / data_size, progress_callback_user_data);
-            }
-            ggml_cl_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
-            done_size += lt.size;
-        }
-    }
-#else
-    (void) n_batch;
-    (void) tensor_split;
 #endif

+    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+
     if (progress_callback) {
         progress_callback(1.0f, progress_callback_user_data);
     }
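
With the per-backend upload loops removed, the only CUDA-specific step left before the unified load_all_data call is setting the tensor split, i.e. the relative per-device weights that come in through the tensor_split parameter. A hedged sketch of that call (illustrative values; a 3:1 weighting puts roughly three quarters of the split tensors on the first device):

    #if defined(GGML_USE_CUBLAS)
    #include "ggml-cuda.h"

    // Sketch: relative weights for splitting tensors across the first two devices;
    // the remaining entries stay zero.
    static void set_split_example(void) {
        float tensor_split[GGML_CUDA_MAX_DEVICES] = { 3.0f, 1.0f };
        ggml_cuda_set_tensor_split(tensor_split);
    }
    #endif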