ggml/include/ggml-backend.h (12 additions & 0 deletions)

@@ -132,6 +132,8 @@ extern "C" {
GGML_BACKEND_DEVICE_TYPE_CPU,
// GPU device using dedicated memory
GGML_BACKEND_DEVICE_TYPE_GPU,
// integrated GPU device using host memory
GGML_BACKEND_DEVICE_TYPE_IGPU,
// accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
GGML_BACKEND_DEVICE_TYPE_ACCEL
};
@@ -150,11 +152,21 @@

// all the device properties
struct ggml_backend_dev_props {
// device name
const char * name;
// device description
const char * description;
// device free memory in bytes
size_t memory_free;
// device total memory in bytes
size_t memory_total;
// device type
enum ggml_backend_dev_type type;
// device id
// for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
// if the id is unknown, this should be NULL
const char * device_id;
// device capabilities
struct ggml_backend_dev_caps caps;
};

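As a caller-side sketch (not part of this diff), the new device_id field can be read through the existing props API; this assumes the usual ggml-backend.h entry points and is illustrative only. Note that device_id may be NULL:

    // List every device with its id and free memory (illustrative sketch).
    #include <cstdio>
    #include "ggml-backend.h"

    static void list_devices(void) {
        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            ggml_backend_dev_props props;
            ggml_backend_dev_get_props(dev, &props);
            // device_id is NULL when the backend cannot determine it
            std::printf("%s (%s): id=%s, %zu/%zu MiB free\n",
                        props.name, props.description,
                        props.device_id ? props.device_id : "unknown",
                        props.memory_free / 1024 / 1024,
                        props.memory_total / 1024 / 1024);
        }
    }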
ggml/src/ggml-backend-impl.h (1 addition & 1 deletion)

@@ -8,7 +8,7 @@
extern "C" {
#endif

-#define GGML_BACKEND_API_VERSION 1
+#define GGML_BACKEND_API_VERSION 2

//
// Backend buffer type
ggml/src/ggml-backend-reg.cpp (2 additions & 3 deletions)

@@ -400,9 +400,8 @@ ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const

ggml_backend_t ggml_backend_init_best(void) {
ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
-if (!dev) {
-dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-}
+dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU);
+dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (!dev) {
return nullptr;
}
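A minimal usage sketch of the new fallback order (assumed caller code, not from this PR): a discrete GPU is preferred, then an integrated GPU, then the CPU backend:

    #include <cstdio>
    #include "ggml-backend.h"

    int main() {
        // ggml_backend_init_best now tries GPU, then IGPU, then CPU
        ggml_backend_t backend = ggml_backend_init_best();
        if (backend == nullptr) {
            std::fprintf(stderr, "no usable backend found\n");
            return 1;
        }
        std::printf("using backend: %s\n", ggml_backend_name(backend));
        ggml_backend_free(backend);
        return 0;
    }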
ggml/src/ggml-cuda/ggml-cuda.cu (8 additions & 0 deletions)

@@ -3204,6 +3204,7 @@ struct ggml_backend_cuda_device_context {
int device;
std::string name;
std::string description;
std::string pci_bus_id;
};

static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -3228,9 +3229,12 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
}

static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;

props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev);
props->type = ggml_backend_cuda_device_get_type(dev);
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);

bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
@@ -3792,6 +3796,10 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;

char pci_bus_id[16] = {};
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
dev_ctx->pci_bus_id = pci_bus_id;

ggml_backend_dev_t dev = new ggml_backend_device {
/* .iface = */ ggml_backend_cuda_device_interface,
/* .reg = */ &reg,
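To make the formatting concrete, a standalone check (illustrative, not from the PR): domain 0, bus 1, device 0 yield "0000:01:00.0", matching the "domain:bus:device.function" convention documented in ggml-backend.h, with the PCI function hard-coded to 0:

    #include <cstdio>

    int main() {
        char pci_bus_id[16] = {};
        // same format string as the CUDA backend above
        std::snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", 0, 1, 0);
        std::printf("%s\n", pci_bus_id); // prints 0000:01:00.0
        return 0;
    }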
ggml/src/ggml-vulkan/ggml-vulkan.cpp (2 additions & 0 deletions)

@@ -11750,13 +11750,15 @@ static ggml_backend_buffer_type_t ggml_backend_vk_device_get_host_buffer_type(gg

static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) {
UNUSED(dev);
// TODO: return GGML_BACKEND_DEVICE_TYPE_IGPU for integrated GPUs
return GGML_BACKEND_DEVICE_TYPE_GPU;
}

static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
props->name = ggml_backend_vk_device_get_name(dev);
props->description = ggml_backend_vk_device_get_description(dev);
props->type = ggml_backend_vk_device_get_type(dev);
// TODO: set props->device_id to PCI bus id
ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = {
/* .async = */ false,
src/llama.cpp (50 additions & 9 deletions)

@@ -59,6 +59,7 @@ bool llama_supports_mlock(void) {

bool llama_supports_gpu_offload(void) {
return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr ||
llama_supports_rpc();
}

@@ -182,8 +183,13 @@ static struct llama_model * llama_model_load_from_file_impl(
model->devices.push_back(*dev);
}
} else {
+// default device selection
+
+// build list of available devices
+std::vector<ggml_backend_dev_t> gpus;
+std::vector<ggml_backend_dev_t> igpus;
std::vector<ggml_backend_dev_t> rpc_servers;
-// use all available devices

for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
switch (ggml_backend_dev_type(dev)) {
@@ -192,19 +198,51 @@
// skip CPU backends since they are handled separately
break;

-case GGML_BACKEND_DEVICE_TYPE_GPU:
+case GGML_BACKEND_DEVICE_TYPE_GPU: {
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
if (ggml_backend_reg_name(reg) == std::string("RPC")) {
rpc_servers.push_back(dev);
} else {
-model->devices.push_back(dev);
+// check if there is already a GPU with the same device id
+ggml_backend_dev_props props;
+ggml_backend_dev_get_props(dev, &props);
+auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) {
+ggml_backend_dev_props d_props;
+ggml_backend_dev_get_props(d, &d_props);
+if (props.device_id && d_props.device_id) {
+return strcmp(props.device_id, d_props.device_id) == 0;
+}
+return false;
+});
+
+if (it != gpus.end()) {
+LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
+__func__,
+ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
+props.device_id ? props.device_id : "unknown id",
+ggml_backend_dev_name(*it), ggml_backend_dev_description(*it));
+} else {
+gpus.push_back(dev);
+}
}
break;
+}

+case GGML_BACKEND_DEVICE_TYPE_IGPU:
+igpus.push_back(dev);
+break;
}
}
-// add RPC servers at the front of the list
-if (!rpc_servers.empty()) {
-model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());

+// add RPC servers at the front of the list to minimize network transfers
+model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
+
+// add GPUs
+model->devices.insert(model->devices.end(), gpus.begin(), gpus.end());
+
+// add integrated GPUs only if no other devices were found
+if (model->devices.empty()) {
+model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
+}
}

@@ -225,9 +263,12 @@
}

for (auto * dev : model->devices) {
-size_t free, total; // NOLINT
-ggml_backend_dev_memory(dev, &free, &total);
-LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
+ggml_backend_dev_props props;
+ggml_backend_dev_get_props(dev, &props);
+LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
+ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
+props.device_id ? props.device_id : "unknown id",
+props.memory_free/1024/1024);
}

const int status = llama_model_load(path_model, splits, *model, params);
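Callers that want to bypass the default device selection shown above (for example, forcing the integrated GPU even when a discrete GPU is present) can pass an explicit NULL-terminated device list in llama_model_params; a sketch assuming the usual llama.h API, with a hypothetical model path:

    // Collect only integrated GPUs and hand them to the loader.
    std::vector<ggml_backend_dev_t> devs;
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_IGPU) {
            devs.push_back(dev);
        }
    }
    devs.push_back(nullptr); // the list must be NULL-terminated

    llama_model_params mparams = llama_model_default_params();
    mparams.devices = devs.data();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);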
tools/llama-bench/llama-bench.cpp (6 additions & 1 deletion)

@@ -128,7 +128,7 @@ static std::string get_gpu_info() {
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
auto * dev = ggml_backend_dev_get(i);
auto dev_type = ggml_backend_dev_type(dev);
-if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU) {
+if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU || dev_type == GGML_BACKEND_DEVICE_TYPE_IGPU) {
gpu_list.push_back(ggml_backend_dev_description(dev));
}
}
@@ -945,6 +945,7 @@ struct cmd_params_instance {
exit(1);
}
}
// FIXME: use llama.cpp device selection logic
// add local GPU devices if any
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
@@ -957,6 +958,10 @@
case GGML_BACKEND_DEVICE_TYPE_GPU:
devices.push_back(dev);
break;

case GGML_BACKEND_DEVICE_TYPE_IGPU:
// iGPUs are not used when there are RPC servers
break;
}
}
devices.push_back(nullptr);