@@ -673,13 +673,21 @@ struct llama_model_loader {
 
     struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
         struct ggml_tensor * tensor;
+
+        if (backend != GGML_BACKEND_CPU) {
+            ggml_set_no_alloc(ggml_ctx, true);
+        }
         if (lt.ne.size() == 2) {
             tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
         } else {
             LLAMA_ASSERT(lt.ne.size() == 1);
             tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
         }
         ggml_set_name(tensor, lt.name.c_str());
+
+        if (backend != GGML_BACKEND_CPU) {
+            ggml_set_no_alloc(ggml_ctx, use_mmap);
+        }
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
         tensor->backend = backend;
         lt.ggml_tensor = tensor;
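A side note on the no_alloc toggling above (a sketch of mine, not part of the commit): ggml_set_no_alloc(ctx, true) tells the ggml context to create only tensor metadata, so no data buffer is carved out of the context's memory pool and tensor->data stays NULL; that is what the loader wants for tensors whose data will live in VRAM. The second call restores the loader's usual setting (no_alloc == use_mmap, since mmapped tensors also get their data pointer patched in later). A minimal standalone illustration, assuming only the public ggml.h API:

    #include "ggml.h"
    #include <stdio.h>

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        // With no_alloc enabled, only the tensor header is placed in the context;
        // no data buffer is reserved and t->data is left NULL.
        ggml_set_no_alloc(ctx, true);
        struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
        printf("data=%p nbytes=%zu\n", t->data, ggml_nbytes(t));

        // Restore normal allocation for tensors created afterwards.
        ggml_set_no_alloc(ctx, false);

        ggml_free(ctx);
        return 0;
    }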
@@ -696,6 +704,7 @@ struct llama_model_loader {
     void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
         size_t data_size = 0;
         size_t prefetch_size = 0;
+        size_t lock_size = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
             if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
@@ -705,32 +714,52 @@ struct llama_model_loader {
 
         if (use_mmap) {
             mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
-            if (!lmlock) {
-                // Don't call the callback since the actual loading will be lazy
-                // and we can't measure it.
-                progress_callback = NULL;
-            }
             if (lmlock) {
                 lmlock->init(mapping->addr);
             }
         }
 
         size_t done_size = 0;
         for (llama_load_tensor & lt : tensors_map.tensors) {
-            if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
-                continue;
-            }
             if (progress_callback) {
                 progress_callback((float) done_size / data_size, progress_callback_user_data);
             }
             LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
             lt.data = (uint8_t *) lt.ggml_tensor->data;
+            // allocate temp buffer if not using mmap
+            if (!use_mmap && lt.data == NULL) {
+                lt.data = (uint8_t *) malloc(ggml_nbytes(lt.ggml_tensor));
+            }
+
             load_data_for(lt);
-            lt.ggml_tensor->data = lt.data;
-            done_size += lt.size;
-            if (use_mmap && lmlock) {
-                lmlock->grow_to(done_size);
+            switch (lt.ggml_tensor->backend) {
+                case GGML_BACKEND_CPU:
+                    lt.ggml_tensor->data = lt.data;
+                    if (use_mmap && lmlock) {
+                        lock_size += lt.size;
+                        lmlock->grow_to(lock_size);
+                    }
+                    break;
+#ifdef GGML_USE_CUBLAS
+                case GGML_BACKEND_CUDA:
+                    ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
+                    if (!use_mmap) {
+                        free(lt.data);
+                    }
+                    break;
+#endif
+#ifdef GGML_USE_CLBLAST
+                case GGML_BACKEND_CL:
+                    ggml_cl_transform_tensor(lt.data, lt.ggml_tensor);
+                    if (!use_mmap) {
+                        free(lt.data);
+                    }
+                    break;
+#endif
+                default:
+                    continue;
             }
+            done_size += lt.size;
         }
     }
 
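For context (again a sketch of mine, not part of the commit): because the loop above no longer skips non-CPU tensors, one progress callback now covers CPU, CUDA, and OpenCL loading alike. The callback type in llama.h of this era is typedef void (*llama_progress_callback)(float progress, void * ctx), passed in through llama_context_params. The function name print_progress below is made up for illustration:

    #include <stdio.h>

    // Signature matches llama_progress_callback: void (*)(float progress, void * ctx).
    static void print_progress(float progress, void * ctx) {
        (void) ctx; // user data not needed in this sketch
        fprintf(stderr, "\rloading model... %3d%%", (int) (progress * 100));
        if (progress >= 1.0f) {
            fprintf(stderr, "\n");
        }
    }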
@@ -1069,8 +1098,8 @@ static void llama_model_load_internal(
 
             if (backend == LLAMA_BACKEND_OFFLOAD) {
                 vram_total +=
-                    ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
-                    ggml_nbytes(layer.wv)             + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+                    ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk)       +
+                    ggml_nbytes(layer.wv)             + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
                     ggml_nbytes(layer.w1)             + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
             }
         }
@@ -1117,50 +1146,6 @@ static void llama_model_load_internal(
 
     ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
 
-#if defined(GGML_USE_CUBLAS)
-    {
-        size_t done_size = 0;
-        size_t data_size = 0;
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            data_size += lt.size;
-            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
-                done_size += lt.size;
-            }
-        }
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
-                continue;
-            }
-            if (progress_callback) {
-                progress_callback((float) done_size / data_size, progress_callback_user_data);
-            }
-            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
-            done_size += lt.size;
-        }
-    }
-#elif defined(GGML_USE_CLBLAST)
-    {
-        size_t done_size = 0;
-        size_t data_size = 0;
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            data_size += lt.size;
-            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
-                done_size += lt.size;
-            }
-        }
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            if (lt.ggml_tensor->backend != GGML_BACKEND_CL) {
-                continue;
-            }
-            if (progress_callback) {
-                progress_callback((float) done_size / data_size, progress_callback_user_data);
-            }
-            ggml_cl_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
-            done_size += lt.size;
-        }
-    }
-#endif
-
     if (progress_callback) {
         progress_callback(1.0f, progress_callback_user_data);
     }