@@ -668,13 +668,21 @@ struct llama_model_loader {
 
     struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
         struct ggml_tensor * tensor;
+
+        if (backend != GGML_BACKEND_CPU) {
+            ggml_set_no_alloc(ggml_ctx, true);
+        }
         if (lt.ne.size() == 2) {
             tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
         } else {
             LLAMA_ASSERT(lt.ne.size() == 1);
             tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
         }
         ggml_set_name(tensor, lt.name.c_str());
+
+        if (backend != GGML_BACKEND_CPU) {
+            ggml_set_no_alloc(ggml_ctx, use_mmap);
+        }
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
         tensor->backend = backend;
         lt.ggml_tensor = tensor;
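
This first hunk brackets tensor creation with ggml_set_no_alloc(): for tensors destined for a non-CPU backend, the ggml context should only create the tensor header and not reserve space for its data in the context buffer; the flag is then restored to use_mmap so the existing CPU/mmap behaviour is unchanged. A minimal standalone sketch of that effect, assuming the ggml.h from this tree (sizes and types are arbitrary):

// Sketch only: shows that tensors created while no_alloc is set get metadata
// but no backing buffer (data == NULL), which is what the loader later checks
// when deciding whether to malloc a temporary staging buffer.
#include <assert.h>
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    ggml_set_no_alloc(ctx, true);                 // as done for backend != GGML_BACKEND_CPU
    struct ggml_tensor * gpu_t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
    assert(gpu_t->data == NULL);                  // only the header lives in the ggml context

    ggml_set_no_alloc(ctx, false);                // restored to use_mmap in the loader
    struct ggml_tensor * cpu_t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
    assert(cpu_t->data != NULL);                  // data allocated inside the context buffer

    printf("gpu tensor data=%p, cpu tensor data=%p\n", gpu_t->data, cpu_t->data);
    ggml_free(ctx);
    return 0;
}
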
@@ -713,6 +721,11 @@ struct llama_model_loader {
             }
             LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
             lt.data = (uint8_t *) lt.ggml_tensor->data;
+            // allocate temp buffer if not using mmap
+            if (!use_mmap && lt.data == NULL) {
+                lt.data = (uint8_t *)malloc(ggml_nbytes(lt.ggml_tensor));
+            }
+
             load_data_for(lt);
             switch(lt.ggml_tensor->backend) {
                 case GGML_BACKEND_CPU:
@@ -726,11 +739,17 @@ struct llama_model_loader {
 #ifdef GGML_USE_CUBLAS
                 case GGML_BACKEND_CUDA:
                     ggml_cuda_load_data(lt.data, lt.ggml_tensor);
+                    if (!use_mmap) {
+                        free(lt.data);
+                    }
                     break;
 #endif
 #ifdef GGML_USE_CLBLAST
                 case GGML_BACKEND_CL:
                     ggml_cl_transform_tensor(lt.data, lt.ggml_tensor);
+                    if (!use_mmap) {
+                        free(lt.data);
+                    }
                     break;
 #endif
                 default:
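
Taken together, the last two hunks handle the non-mmap path for offloaded tensors: because those tensors were created under no_alloc, lt.data starts out NULL, so a temporary host buffer is malloc'd, load_data_for() reads the file contents into it, the backend call (ggml_cuda_load_data or ggml_cl_transform_tensor) copies the weights into device memory, and the staging buffer is freed. With mmap, lt.data instead points into the memory-mapped file and must not be freed. A condensed sketch of that flow, using hypothetical helpers read_tensor_bytes() and copy_to_device() in place of the loader's load_data_for() and the backend-specific calls:

// Sketch under stated assumptions: read_tensor_bytes() and copy_to_device()
// are hypothetical stand-ins, not functions from llama.cpp or ggml.
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include "ggml.h"

extern void read_tensor_bytes(struct ggml_tensor * t, uint8_t * dst); // hypothetical: fread from the model file
extern void copy_to_device(struct ggml_tensor * t, uint8_t * src);    // hypothetical: e.g. upload to VRAM

static void load_offloaded_tensor(struct ggml_tensor * t, bool use_mmap, uint8_t * mmap_ptr) {
    uint8_t * data = use_mmap ? mmap_ptr : NULL;

    // Non-mmap case: the tensor was created with no_alloc, so stage it in a temp buffer.
    if (!use_mmap && data == NULL) {
        data = (uint8_t *) malloc(ggml_nbytes(t));
    }

    read_tensor_bytes(t, data);   // fill the host buffer (load_data_for in the loader)
    copy_to_device(t, data);      // the weights now live in device memory

    // The staging buffer is no longer needed; mmap'd memory must not be freed.
    if (!use_mmap) {
        free(data);
    }
}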