@@ -2369,13 +2369,39 @@ struct llama_context {
     struct llama_control_vector cvec;
 };
 
+static size_t llama_get_device_count(const llama_model & model) {
+    size_t count = 1;
+#if defined(GGML_USE_CUDA)
+    count = ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+    count = ggml_backend_sycl_get_device_count();
+#elif defined(GGML_USE_VULKAN)
+    count = ggml_backend_vk_get_device_count();
+#endif
+#if defined(GGML_USE_RPC)
+    // each configured RPC server is exposed as one extra device at the end of the list
+    count += model.rpc_servers.size();
+#endif
+    return count;
+    GGML_UNUSED(model);
+}
+
 static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
     ggml_backend_buffer_type_t buft = nullptr;
 
-#ifdef GGML_USE_RPC
-    std::string endpoint = model.rpc_servers[gpu];
-    buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
-#elif defined(GGML_USE_METAL)
+#if defined(GGML_USE_RPC)
+    int dev_count = (int)llama_get_device_count(model);
+    int rpc_count = (int)model.rpc_servers.size();
+    if (gpu >= dev_count - rpc_count) {
+        const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
+        return ggml_backend_rpc_buffer_type(endpoint);
+    }
+#endif
+#if defined(GGML_USE_METAL)
     buft = ggml_backend_metal_buffer_type();
 #elif defined(GGML_USE_CUDA)
     buft = ggml_backend_cuda_buffer_type(gpu);
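The key convention introduced here: local GPU devices keep the low indices, and each RPC server is appended as one extra device at the end of the list, so an index with `gpu >= dev_count - rpc_count` belongs to the RPC server at `gpu - dev_count + rpc_count`. Below is a minimal standalone sketch of that mapping; the `device_layout` struct and `rpc_endpoint_for()` helper are illustrative only and not part of the patch.

```cpp
// Illustrative sketch (not part of the patch): how a global device index maps
// onto local GPUs vs. RPC servers when RPC devices are appended last.
#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

struct device_layout {
    size_t local_count;                   // e.g. number of CUDA/SYCL/Vulkan devices
    std::vector<std::string> rpc_servers; // endpoints appended after the local devices
};

static size_t total_devices(const device_layout & d) {
    return d.local_count + d.rpc_servers.size();
}

// Returns the RPC endpoint for a global device index, or nullptr for a local device.
static const char * rpc_endpoint_for(const device_layout & d, size_t device) {
    assert(device < total_devices(d));
    if (device < d.local_count) {
        return nullptr; // local GPU
    }
    // same arithmetic as the patch: device - (dev_count - rpc_count)
    return d.rpc_servers[device - d.local_count].c_str();
}

int main() {
    device_layout d = {2, {"192.168.1.10:50052", "192.168.1.11:50052"}};
    assert(total_devices(d) == 4);
    assert(rpc_endpoint_for(d, 1) == nullptr);                            // local GPU 1
    assert(std::string(rpc_endpoint_for(d, 2)) == "192.168.1.10:50052"); // first RPC device
    return 0;
}
```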
@@ -2423,29 +2449,18 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
     GGML_UNUSED(tensor_split);
 }
 
-static size_t llama_get_device_count(const llama_model & model) {
-#if defined(GGML_USE_RPC)
-    return model.rpc_servers.size();
-#elif defined(GGML_USE_CUDA)
-    return ggml_backend_cuda_get_device_count();
-#elif defined(GGML_USE_SYCL)
-    return ggml_backend_sycl_get_device_count();
-#elif defined(GGML_USE_VULKAN)
-    return ggml_backend_vk_get_device_count();
-#else
-    return 1;
-#endif
-    GGML_UNUSED(model);
-}
-
 static size_t llama_get_device_memory(const llama_model & model, int device) {
 #if defined(GGML_USE_RPC)
-    size_t total;
-    size_t free;
-    std::string endpoint = model.rpc_servers[device];
-    ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
-    return free;
-#elif defined(GGML_USE_CUDA)
+    int dev_count = (int)llama_get_device_count(model);
+    int rpc_count = (int)model.rpc_servers.size();
+    if (device >= dev_count - rpc_count) {
+        size_t total;
+        size_t free;
+        const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
+        ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
+        return free;
+    }
+#endif
+#if defined(GGML_USE_CUDA)
     size_t total;
     size_t free;
     ggml_backend_cuda_get_device_memory(device, &free, &total);
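llama_get_device_memory() now reports the free memory of an RPC server through the same code path as a local GPU, which matters because these values are used, among other things, to weight how layers are distributed across devices when no explicit tensor split is given. A rough, self-contained illustration of that idea follows; the device counts and memory figures are made up and this is not llama.cpp's actual splitting code.

```cpp
// Sketch: deriving a proportional split from per-device free memory, the kind
// of value llama_get_device_memory() returns. Numbers are hypothetical.
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    // free memory per device in bytes (hypothetical: 2 local GPUs + 1 RPC server)
    std::vector<size_t> free_mem = { 24ull << 30, 12ull << 30, 16ull << 30 };

    size_t total = 0;
    for (size_t f : free_mem) total += f;

    // fraction of the model each device would receive if layers were split
    // proportionally to free memory
    for (size_t i = 0; i < free_mem.size(); i++) {
        printf("device %zu: %.2f%% of the layers\n", i, 100.0 * free_mem[i] / total);
    }
    return 0;
}
```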
@@ -16146,7 +16161,7 @@ struct llama_model * llama_load_model_from_file(
             return true;
         };
     }
-    if (params.rpc_servers != nullptr) {
+    if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
         // split the servers set them into model->rpc_servers
         std::string servers(params.rpc_servers);
         size_t pos = 0;
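On the API side, the endpoints still arrive as a single comma-separated C string in the `rpc_servers` field of the model params; the new check simply treats an empty string the same as a null pointer. A usage sketch, assuming the llama.h API as of this change (the model path and endpoints are placeholders):

```cpp
// Sketch of passing RPC endpoints to the loader; placeholders only.
#include "llama.h"

int main() {
    llama_model_params mparams = llama_model_default_params();
    // comma-separated list; an empty string is now treated the same as nullptr
    mparams.rpc_servers = "192.168.1.10:50052,192.168.1.11:50052";
    mparams.n_gpu_layers = 99; // offload as many layers as possible

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == nullptr) {
        return 1;
    }
    llama_free_model(model);
    return 0;
}
```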
@@ -16304,17 +16319,7 @@ struct llama_context * llama_new_context_with_model(
 
     if (!hparams.vocab_only) {
         // initialize backends
-#if defined(GGML_USE_RPC)
-        for (auto & server : model->rpc_servers) {
-            ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
-        }
-#elif defined(GGML_USE_METAL)
+#if defined(GGML_USE_METAL)
         if (model->n_gpu_layers > 0) {
             ctx->backend_metal = ggml_backend_metal_init();
             if (ctx->backend_metal == nullptr) {
@@ -16406,6 +16411,19 @@ struct llama_context * llama_new_context_with_model(
             }
             ctx->backends.push_back(backend);
         }
+#endif
+#if defined(GGML_USE_RPC)
+        // RPC devices occupy the tail of the device list; map each index back to its endpoint
+        int dev_count = (int)llama_get_device_count(*model);
+        int rpc_count = (int)model->rpc_servers.size();
+        for (int i = dev_count - rpc_count; i < dev_count; i++) {
+            const char * endpoint = model->rpc_servers[i - dev_count + rpc_count].c_str();
+            ggml_backend_t backend = ggml_backend_rpc_init(endpoint);
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, endpoint);
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        }
 #endif
         ctx->backend_cpu = ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {
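With the RPC backends now created after the local GPU backends, the order of `ctx->backends` matches the device numbering used above: local devices first, one RPC backend per endpoint at the end. The standalone sketch below mirrors that ordering and includes a comma-splitting helper similar to the parsing done in llama_load_model_from_file(); the `split_endpoints` helper and the "cuda:N" placeholders are illustrative, not ggml calls.

```cpp
// Standalone sketch (hypothetical helpers, no ggml calls): the backend list is
// built with local backends first and one RPC backend per endpoint appended
// last, mirroring the device order used by llama_get_device_count().
#include <cstdio>
#include <string>
#include <vector>

// Split a comma-separated endpoint list, roughly as the loader does for rpc_servers.
static std::vector<std::string> split_endpoints(std::string servers) {
    std::vector<std::string> out;
    size_t pos;
    while ((pos = servers.find(',')) != std::string::npos) {
        out.push_back(servers.substr(0, pos));
        servers.erase(0, pos + 1);
    }
    if (!servers.empty()) {
        out.push_back(servers);
    }
    return out;
}

int main() {
    std::vector<std::string> backends;

    // 1. local GPU backends (placeholders standing in for CUDA/SYCL/Vulkan init)
    backends.push_back("cuda:0");
    backends.push_back("cuda:1");

    // 2. RPC backends, appended last so their indices match the device list
    for (const std::string & ep : split_endpoints("192.168.1.10:50052,192.168.1.11:50052")) {
        backends.push_back("rpc:" + ep);
    }

    for (size_t i = 0; i < backends.size(); i++) {
        printf("device %zu -> %s\n", i, backends[i].c_str());
    }
    return 0;
}
```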