@@ -1539,7 +1539,7 @@ static bool llama_kv_cache_init(
             ggml_cuda_assign_buffers_no_scratch(v);
             vram_kv_cache += ggml_nbytes(k);
             vram_kv_cache += ggml_nbytes(v);
-            // HACK: mark tensor as allocated, but crash if we try to use it from the CPU
+            // HACK: mark tensor as allocated
             k->data = v->data = (void *)(uintptr_t)1;
         }
     }
@@ -2285,9 +2285,15 @@ struct llama_model_loader {
                 ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
             }
         } else {
-            // FIXME: use read_buf for device buffers without unified memory
-            file.seek(offs, SEEK_SET);
-            file.read_raw(cur->data, ggml_nbytes(cur));
+            if (ggml_backend_buffer_is_host(cur->buffer)) {
+                file.seek(offs, SEEK_SET);
+                file.read_raw(cur->data, ggml_nbytes(cur));
+            } else {
+                read_buf.resize(ggml_nbytes(cur));
+                file.seek(offs, SEEK_SET);
+                file.read_raw(read_buf.data(), ggml_nbytes(cur));
+                ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur));
+            }
         }
 
         if (use_mmap && lmlock) {
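A minimal sketch (not part of the patch) of the load pattern this hunk introduces, assuming a `llama_file`-style object with `seek`/`read_raw` and a reusable staging vector; `read_tensor_data` is a hypothetical helper name:

    static void read_tensor_data(llama_file & file, size_t offs, struct ggml_tensor * cur,
                                 std::vector<uint8_t> & read_buf) {
        const size_t nbytes = ggml_nbytes(cur);
        file.seek(offs, SEEK_SET);
        if (ggml_backend_buffer_is_host(cur->buffer)) {
            // host-visible buffer: read straight into the tensor's own memory
            file.read_raw(cur->data, nbytes);
        } else {
            // device buffer: stage the bytes in RAM, then push them through the backend interface
            read_buf.resize(nbytes);
            file.read_raw(read_buf.data(), nbytes);
            ggml_backend_tensor_set(cur, read_buf.data(), 0, nbytes);
        }
    }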
@@ -2298,7 +2304,7 @@ struct llama_model_loader {
 
             case GGML_BACKEND_GPU:
             case GGML_BACKEND_GPU_SPLIT: {
-                // HACK: mark tensor as allocated, but crash if we try to use it from the CPU
+                // HACK: mark tensor as allocated
                 cur->data = (void *)(uintptr_t)1;
                 void * data;
                 if (use_mmap) {
@@ -5773,7 +5779,7 @@ static struct ggml_cgraph * llama_build_graph(
             const int64_t n_tokens = cur->ne[1];
 
             float * data;
-            if (/* is_sys_mem_buf(cur->buffer) */ false) { // TODO
+            if (ggml_backend_buffer_is_host(cur->buffer)) {
                 data = (float *) cur->data;
             } else {
                 lctx.buf_copy.resize(ggml_nbytes(cur));
@@ -5812,7 +5818,7 @@ static struct ggml_cgraph * llama_build_graph(
             const int64_t n_ctx = cur->ne[0];
 
             int32_t * data;
-            if (/* is_sys_mem_buf(cur->buffer) */ false) { // TODO
+            if (ggml_backend_buffer_is_host(cur->buffer)) {
                 data = (int32_t *) cur->data;
             } else {
                 lctx.buf_copy.resize(ggml_nbytes(cur));
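Both build-graph hunks above use the same staging pattern for writing graph inputs; here is a hedged sketch of that pattern. The templated `set_input` helper and the `fill` callback are illustrative names, not code from the patch, and the final copy-back step is assumed from the purpose of `buf_copy` rather than shown in the diff:

    template <typename T, typename Fill>
    static void set_input(llama_context & lctx, struct ggml_tensor * cur, Fill fill) {
        T * data;
        if (ggml_backend_buffer_is_host(cur->buffer)) {
            data = (T *) cur->data;                  // write directly into the tensor
        } else {
            lctx.buf_copy.resize(ggml_nbytes(cur));  // stage in a host-side scratch buffer
            data = (T *) lctx.buf_copy.data();
        }
        fill(data);                                  // compute the input values
        if ((void *) data != cur->data) {
            // staged path: copy the finished values to the device tensor
            ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
        }
    }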
@@ -9230,13 +9236,15 @@ struct llama_context * llama_new_context_with_model(
     }
 #endif
 
-    if (ctx->backend == nullptr) {
-        // FIXME: this may fail if the model buffer is not compatible with the CPU backend
+    if (ctx->backend == nullptr && ggml_backend_buffer_is_host(model->buf)) {
         ctx->backend = ggml_backend_cpu_init();
+        if (ctx->backend == nullptr) {
+            LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
+        }
     }
 
     if (ctx->backend == nullptr) {
-        LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
+        LLAMA_LOG_ERROR("%s: failed to initialize a backend\n", __func__);
         delete ctx;
         return nullptr;
     }
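A condensed, hedged sketch of the backend selection flow this hunk changes; `init_gpu_backend()` and `model_buf` are stand-ins for the GGML_USE_CUBLAS path above the hunk and the model's weight buffer, not names from the patch:

    ggml_backend_t backend = init_gpu_backend();   // hypothetical: may return nullptr
    if (backend == nullptr && ggml_backend_buffer_is_host(model_buf)) {
        // the CPU backend can only execute on weights stored in host memory
        backend = ggml_backend_cpu_init();
    }
    if (backend == nullptr) {
        // neither a GPU backend nor a usable CPU fallback is available
        LLAMA_LOG_ERROR("%s: failed to initialize a backend\n", __func__);
    }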