
Commit 0d77fbd

cuda : improve cuda pool efficiency using virtual memory

1 parent 7082d24 commit 0d77fbd

2 files changed: +150 -26 lines changed
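The change replaces the old cache of individually cudaMalloc'd buffers with one per-device pool built on the CUDA virtual memory management API: a large virtual address range is reserved once, physical memory is created and mapped into it on demand, and allocations are handed out bump-style from the mapped region. Below is a condensed, self-contained sketch of that pattern, under stated assumptions: single device, no error checking, and illustrative names such as pool_grow that do not appear in the commit.

// condensed sketch of the VMM pool pattern (illustrative, single device,
// error handling omitted); requires CUDA >= 10.2 and linking with -lcuda
#include <cuda.h>
#include <stddef.h>

static CUdeviceptr  pool_addr = 0;             // start of the reserved virtual range
static size_t       pool_size = 0;             // physical bytes mapped so far
static const size_t POOL_VA_SIZE = 1ull << 36; // 64 GiB of address space, no memory yet

// map `want` more bytes of physical memory at the tail of the pool
static void pool_grow(size_t want, int device) {
    CUmemAllocationProp prop = {};
    prop.type          = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id   = device;

    // growth must be a multiple of the device's allocation granularity
    size_t gran = 0;
    cuMemGetAllocationGranularity(&gran, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    want = gran * ((want + gran - 1) / gran);

    if (pool_addr == 0) {
        // reserve the address range once; this commits no memory
        cuMemAddressReserve(&pool_addr, POOL_VA_SIZE, 0, 0, 0);
    }

    CUmemGenericAllocationHandle handle;
    cuMemCreate(&handle, want, &prop, 0);                // physical backing
    cuMemMap(pool_addr + pool_size, want, 0, handle, 0); // map it at the tail

    CUmemAccessDesc access = {};
    access.location = prop.location;
    access.flags    = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    cuMemSetAccess(pool_addr + pool_size, want, &access, 1);

    pool_size += want; // pointers handed out earlier remain valid
}

Because growth only maps new physical chunks at the tail of an already-reserved range, pointers handed out earlier stay valid and nothing is ever copied, which is what the diff below relies on.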

Makefile

Lines changed: 2 additions & 1 deletion

@@ -367,9 +367,10 @@ endif # LLAMA_BLIS

 ifdef LLAMA_CUBLAS
 MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include
-MK_LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib
+MK_LDFLAGS += -lcuda -L/usr/lib/wsl/lib -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib
 OBJS += ggml-cuda.o
 MK_NVCCFLAGS = -use_fast_math
+
 ifndef JETSON_EOL_MODULE_DETECT
 MK_NVCCFLAGS += --forward-unknown-to-host-compiler
 endif # JETSON_EOL_MODULE_DETECT
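The Makefile change links libcuda, the driver library that cuMemCreate, cuMemMap, and friends live in (the runtime's -lcudart alone does not export them), and adds -L/usr/lib/wsl/lib, where WSL2 exposes the driver's stub. A minimal probe to confirm a toolchain can link against it and that a device supports the API; the file name, compile line, and the attribute spelling as found in recent cuda.h headers are assumptions:

// vmm_check.c (illustrative); build with: cc vmm_check.c -lcuda
#include <stdio.h>
#include <cuda.h>

int main(void) {
    if (cuInit(0) != CUDA_SUCCESS) {
        fprintf(stderr, "cuInit failed - is the driver installed?\n");
        return 1;
    }
    CUdevice dev;
    cuDeviceGet(&dev, 0);
    int supported = 0;
    // attribute name per recent cuda.h headers (assumption)
    cuDeviceGetAttribute(&supported,
        CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, dev);
    printf("VMM supported on device 0: %s\n", supported ? "yes" : "no");
    return 0;
}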

ggml-cuda.cu

Lines changed: 148 additions & 25 deletions
@@ -88,6 +88,7 @@
 #define __trap abort
 #else
 #include <cuda_runtime.h>
+#include <cuda.h>
 #include <cublas_v2.h>
 #include <cuda_fp16.h>
 // CUDA 10.2 does not have these macro definitions.
@@ -213,6 +214,24 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
     } \
 } while (0)

+// driver API
+#define CU_CHECK(err)                                                                  \
+    do {                                                                               \
+        CUresult err_ = (err);                                                         \
+        if (err_ != CUDA_SUCCESS) {                                                    \
+            int id;                                                                    \
+            cuDeviceGet(&id, 0);                                                       \
+            const char * err_str;                                                      \
+            cuGetErrorString(err_, &err_str);                                          \
+            fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
+                err_str);                                                              \
+            fprintf(stderr, "%s\n", #err);                                             \
+            fprintf(stderr, "current device: %d\n", id);                               \
+            GGML_ASSERT(!"CUDA error");                                                \
+        }                                                                              \
+    } while (0)
+
+
 #if CUDART_VERSION >= 12000
 #define CUBLAS_CHECK(err) \
     do { \
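CU_CHECK is the driver-API counterpart of the existing CUDA_CHECK/CUBLAS_CHECK: driver calls return CUresult, not cudaError_t, so they need their own wrapper. A hypothetical call site, using a driver call that actually appears later in this commit:

// hypothetical CU_CHECK call site: wrap any CUresult-returning driver call;
// on failure it prints file/line, the stringified expression, and aborts
CUmemAllocationProp prop = {};
prop.type          = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
prop.location.id   = 0;

size_t granularity = 0;
CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));

Note that the macro reports the id from cuDeviceGet(&id, 0), i.e. device 0, regardless of which device made the failing call.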
@@ -6543,21 +6562,26 @@ struct scoped_spin_lock {
     scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
 };

+static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
+
+#if 0
+#define DEBUG_CUDA_MALLOC
 struct cuda_buffer {
     void * ptr = nullptr;
     size_t size = 0;
 };

 static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
-static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
+
+static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};

 static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
 #ifdef DEBUG_CUDA_MALLOC
     int nnz = 0;
-    size_t max_size = 0, tot_size = 0;
+    size_t max_size = 0;
 #endif
     size_t best_diff = 1ull << 36;
     int ibest = -1;
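g_cuda_pool_lock moves above the new #if 0 so that both pool implementations, the disabled legacy one and the virtual-memory one, guard their globals with the same scoped_spin_lock. The struct's body sits above this hunk and is not part of the diff; a typical RAII spin lock consistent with the declaration and usage shown here, reconstructed for illustration:

// reconstructed for illustration: RAII spin lock over std::atomic_flag,
// consistent with the deleted copy operations visible in the hunk above
#include <atomic>

struct scoped_spin_lock {
    std::atomic_flag& lock;
    scoped_spin_lock(std::atomic_flag& lock) : lock(lock) {
        while (lock.test_and_set(std::memory_order_acquire)) {
            ; // spin until the current holder clears the flag
        }
    }
    ~scoped_spin_lock() {
        lock.clear(std::memory_order_release);
    }
    scoped_spin_lock(const scoped_spin_lock&) = delete;
    scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
};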
@@ -6566,7 +6590,6 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
         if (b.ptr != nullptr) {
 #ifdef DEBUG_CUDA_MALLOC
             ++nnz;
-            tot_size += b.size;
             if (b.size > max_size) max_size = b.size;
 #endif
             if (b.size >= size) {
@@ -6593,15 +6616,16 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
         b.size = 0;
         return ptr;
     }
-#ifdef DEBUG_CUDA_MALLOC
-    fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
-            (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
-#endif
     void * ptr;
     size_t look_ahead_size = (size_t) (1.05 * size);
     look_ahead_size = 256 * ((look_ahead_size + 255)/256);
     CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
     *actual_size = look_ahead_size;
+    g_cuda_pool_size[id] += look_ahead_size;
+#ifdef DEBUG_CUDA_MALLOC
+    fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz,
+            (uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024));
+#endif
     return ptr;
 }
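For reference, the legacy allocator's look-ahead sizing worked through for a hypothetical 1 MB request:

// legacy look-ahead sizing for an illustrative 1 MB request
size_t size            = 1000000;                      // requested bytes
size_t look_ahead_size = (size_t) (1.05 * size);       // 1050000
look_ahead_size = 256 * ((look_ahead_size + 255)/256); // 1050112, next multiple of 256
// 1050112 is recorded as *actual_size, so a slightly larger request later
// can reuse this buffer instead of hitting cudaMalloc again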

@@ -6620,8 +6644,107 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
     }
     fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
     CUDA_CHECK(cudaFree(ptr));
+    g_cuda_pool_size[id] -= size;
+}
+#else
+
+static std::vector<CUmemGenericAllocationHandle> g_cuda_pool_handles[GGML_CUDA_MAX_DEVICES];
+static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
+static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
+static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
+
+static const size_t CUDA_POOL_MAX_SIZE = 1ull << 36; // 64 GB
+
+//#define DEBUG_CUDA_MALLOC
+
+#define ggml_cuda_pool_malloc(size, actual_size) ggml_cuda_pool_malloc_(size, actual_size, #size " " #actual_size)
+static void * ggml_cuda_pool_malloc_(size_t size, size_t * actual_size, const char * call) {
+    scoped_spin_lock lock(g_cuda_pool_lock);
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+
+    size_t avail = g_cuda_pool_size[id] - g_cuda_pool_used[id];
+
+    if (size > avail) {
+        size_t reserve_size = size - avail;
+
+        // allocate more physical memory
+        CUmemAllocationProp prop = {};
+        prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+        prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+        prop.location.id = id;
+
+        // get the minimum allocation granularity for this device
+        size_t granularity = 0;
+        CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
+
+        // round up to the nearest granularity
+        reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
+
+        GGML_ASSERT(g_cuda_pool_size[id] + reserve_size <= CUDA_POOL_MAX_SIZE);
+
+        CUmemGenericAllocationHandle handle;
+        CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
+
+        // reserve virtual address space (if not already reserved)
+        if (g_cuda_pool_addr[id] == 0) {
+            CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[id], CUDA_POOL_MAX_SIZE, 0, 0, 0));
+        }
+
+        // map at the end of the pool
+        CU_CHECK(cuMemMap(g_cuda_pool_addr[id] + g_cuda_pool_size[id], reserve_size, 0, handle, 0));
+
+        // set access
+        CUmemAccessDesc access = {};
+        access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+        access.location.id = id;
+        access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+        CU_CHECK(cuMemSetAccess(g_cuda_pool_addr[id] + g_cuda_pool_size[id], reserve_size, &access, 1));
+
+        // add to the pool
+        g_cuda_pool_handles[id].push_back(handle);
+        g_cuda_pool_size[id] += reserve_size;
+
+        printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB) [%s]\n",
+            id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
+            (unsigned long long) (reserve_size/1024/1024), call);
+    }
+
+    GGML_ASSERT(g_cuda_pool_addr[id] != 0);
+
+    void * ptr = (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id]);
+    *actual_size = size;
+    g_cuda_pool_used[id] += size;
+
+#ifdef DEBUG_CUDA_MALLOC
+    printf("cuda pool[%d]: allocated %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr, call);
+#endif
+
+    return ptr;
+
+    GGML_UNUSED(call);
 }

+#define ggml_cuda_pool_free(ptr, size) ggml_cuda_pool_free_(ptr, size, #ptr " " #size)
+static void ggml_cuda_pool_free_(void * ptr, size_t size, const char * call) {
+    scoped_spin_lock lock(g_cuda_pool_lock);
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+
+#ifdef DEBUG_CUDA_MALLOC
+    printf("cuda pool[%d]: free %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr, call);
+#endif
+
+    g_cuda_pool_used[id] -= size;
+
+    // all deallocations must be in reverse order of the allocations
+    GGML_ASSERT(ptr == (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id]));
+
+    GGML_UNUSED(call);
+}
+
+#endif
+
 static bool g_cublas_loaded = false;

 bool ggml_cublas_loaded(void) {
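The new allocator grows in whole granularity units; CUDA_POOL_MAX_SIZE caps only the reserved address range (2^36 bytes, i.e. 64 GiB), while physical memory is committed chunk by chunk. A worked example of the rounding, assuming the common but device-dependent 2 MiB granularity:

// pool growth rounding, assuming a 2 MiB granularity (device-dependent)
size_t granularity  = 2u << 20;  // 2097152 bytes
size_t reserve_size = 3000000;   // bytes the pool is short for this request
reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
// -> 4194304: two 2 MiB chunks are cuMemCreate'd and cuMemMap'ed at the tail;
//    g_cuda_pool_addr[id] and every pointer already handed out stay put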
@@ -7437,13 +7560,13 @@ inline void ggml_cuda_op_mul_mat_cublas(

         ggml_cuda_pool_free(dst_f16, dst_as);

-        if (src0_as != 0) {
-            ggml_cuda_pool_free(src0_as_f16, src0_as);
-        }
-
         if (src1_as != 0) {
             ggml_cuda_pool_free(src1_as_f16, src1_as);
         }
+
+        if (src0_as != 0) {
+            ggml_cuda_pool_free(src0_as_f16, src0_as);
+        }
     }
     else {
         float * src0_ddq_as_f32 = nullptr;
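This hunk and the three that follow all make the same correction: the new pool is a stack, and ggml_cuda_pool_free_ asserts that the block being freed is the most recently allocated one still live, so every call site must release buffers in reverse allocation order. In miniature, with illustrative sizes:

// the pool is LIFO: frees must mirror allocation order (illustrative sizes)
size_t a_actual = 0, b_actual = 0;
void * a = ggml_cuda_pool_malloc(1024, &a_actual); // allocated first
void * b = ggml_cuda_pool_malloc(2048, &b_actual); // allocated second

ggml_cuda_pool_free(b, b_actual); // the most recent allocation goes back first
ggml_cuda_pool_free(a, a_actual); // then the older one; the opposite order
                                  // would trip the GGML_ASSERT above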
@@ -7800,14 +7923,14 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
         CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
     }

-    if (src0_asf > 0) {
-        ggml_cuda_pool_free(src0_ddf, src0_asf);
+    if (dst_asf > 0) {
+        ggml_cuda_pool_free(dst_ddf, dst_asf);
     }
     if (src1_asf > 0) {
         ggml_cuda_pool_free(src1_ddf, src1_asf);
     }
-    if (dst_asf > 0) {
-        ggml_cuda_pool_free(dst_ddf, dst_asf);
+    if (src0_asf > 0) {
+        ggml_cuda_pool_free(src0_ddf, src0_asf);
     }

     if (dst->backend == GGML_BACKEND_CPU) {
@@ -8119,17 +8242,17 @@ static void ggml_cuda_op_mul_mat(
         CUDA_CHECK(ggml_cuda_set_device(id));

         // free buffers again when done
-        if (src0_as[id] > 0) {
-            ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
-        }
-        if (src1_asf[id] > 0) {
-            ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
+        if (dst_as[id] > 0) {
+            ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
         }
         if (src1_asq[id] > 0) {
             ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
         }
-        if (dst_as[id] > 0) {
-            ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
+        if (src1_asf[id] > 0) {
+            ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
+        }
+        if (src0_as[id] > 0) {
+            ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
         }
     }

@@ -8497,12 +8620,12 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
                 cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));

-    if (ptrs_src_s != 0) {
-        ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
-    }
     if (ptrs_dst_s != 0) {
         ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
     }
+    if (ptrs_src_s != 0) {
+        ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
+    }
 }
 #endif
