 #define __trap abort
 #else
 #include <cuda_runtime.h>
+#include <cuda.h>
 #include <cublas_v2.h>
 #include <cuda_fp16.h>
 // CUDA 10.2 does not have these macro definitions.
@@ -213,6 +214,24 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
     } \
 } while (0)
 
+// driver API
+#define CU_CHECK(err) \
+    do { \
+        CUresult err_ = (err); \
+        if (err_ != CUDA_SUCCESS) { \
+            int id; \
+            cuDeviceGet(&id, 0); \
+            const char * err_str; \
+            cuGetErrorString(err_, &err_str); \
+            fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
+                err_str); \
+            fprintf(stderr, "%s\n", #err); \
+            fprintf(stderr, "current device: %d\n", id); \
+            GGML_ASSERT(!"CUDA error"); \
+        } \
+    } while (0)
+
+
 #if CUDART_VERSION >= 12000
 #define CUBLAS_CHECK(err) \
     do { \
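Note: CU_CHECK mirrors the existing CUDA_CHECK macro, but for CUDA driver API calls, which return CUresult instead of cudaError_t and are translated to a message via cuGetErrorString. A minimal usage sketch; cuMemGetInfo is a standard driver call chosen purely for illustration, it is not part of this change:

    size_t free_mem = 0, total_mem = 0;
    CU_CHECK(cuMemGetInfo(&free_mem, &total_mem)); // aborts via GGML_ASSERT on failure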
@@ -6543,21 +6562,26 @@ struct scoped_spin_lock {
     scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
 };
 
+static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
+
+#if 0
+#define DEBUG_CUDA_MALLOC
 struct cuda_buffer {
     void * ptr = nullptr;
     size_t size = 0;
 };
 
 static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
-static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
+
+static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
 
 static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
 #ifdef DEBUG_CUDA_MALLOC
     int nnz = 0;
-    size_t max_size = 0, tot_size = 0;
+    size_t max_size = 0;
 #endif
     size_t best_diff = 1ull << 36;
     int ibest = -1;
@@ -6566,7 +6590,6 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
         if (b.ptr != nullptr) {
 #ifdef DEBUG_CUDA_MALLOC
             ++nnz;
-            tot_size += b.size;
             if (b.size > max_size) max_size = b.size;
 #endif
             if (b.size >= size) {
@@ -6593,15 +6616,16 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
             b.size = 0;
             return ptr;
         }
-#ifdef DEBUG_CUDA_MALLOC
-    fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
-            (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
-#endif
     void * ptr;
     size_t look_ahead_size = (size_t) (1.05 * size);
     look_ahead_size = 256 * ((look_ahead_size + 255)/256);
     CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
     *actual_size = look_ahead_size;
+    g_cuda_pool_size[id] += look_ahead_size;
+#ifdef DEBUG_CUDA_MALLOC
+    fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz,
+            (uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024));
+#endif
     return ptr;
 }
 
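In this legacy path, cudaMalloc over-allocates by 5% and rounds up to a multiple of 256 bytes so that slightly larger follow-up requests can reuse a pooled buffer. As a worked example, a request of 1,000,000 bytes becomes 1.05 * 1000000 = 1050000, which rounds up to 256 * ((1050000 + 255)/256) = 256 * 4102 = 1,050,112 bytes returned in *actual_size.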
@@ -6620,8 +6644,107 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
     }
     fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
     CUDA_CHECK(cudaFree(ptr));
+    g_cuda_pool_size[id] -= size;
+}
+#else
+
+static std::vector<CUmemGenericAllocationHandle> g_cuda_pool_handles[GGML_CUDA_MAX_DEVICES];
+static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
+static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
+static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
+
+static const size_t CUDA_POOL_MAX_SIZE = 1ull << 36; // 64 GB
+
+// #define DEBUG_CUDA_MALLOC
+
+#define ggml_cuda_pool_malloc(size, actual_size) ggml_cuda_pool_malloc_(size, actual_size, #size " " #actual_size)
+static void * ggml_cuda_pool_malloc_(size_t size, size_t * actual_size, const char * call) {
+    scoped_spin_lock lock(g_cuda_pool_lock);
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+
+    size_t avail = g_cuda_pool_size[id] - g_cuda_pool_used[id];
+
+    if (size > avail) {
+        size_t reserve_size = size - avail;
+
+        // allocate more physical memory
+        CUmemAllocationProp prop = {};
+        prop.type          = CU_MEM_ALLOCATION_TYPE_PINNED;
+        prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+        prop.location.id   = id;
+
+        // get the minimum allocation granularity for this device
+        size_t granularity = 0;
+        CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
+
+        // round up to the nearest granularity
+        reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
+
+        GGML_ASSERT(g_cuda_pool_size[id] + reserve_size <= CUDA_POOL_MAX_SIZE);
+
+        CUmemGenericAllocationHandle handle;
+        CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
+
+        // reserve virtual address space (if not already reserved)
+        if (g_cuda_pool_addr[id] == 0) {
+            CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[id], CUDA_POOL_MAX_SIZE, 0, 0, 0));
+        }
+
+        // map at the end of the pool
+        CU_CHECK(cuMemMap(g_cuda_pool_addr[id] + g_cuda_pool_size[id], reserve_size, 0, handle, 0));
+
+        // set access
+        CUmemAccessDesc access = {};
+        access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+        access.location.id   = id;
+        access.flags         = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+        CU_CHECK(cuMemSetAccess(g_cuda_pool_addr[id] + g_cuda_pool_size[id], reserve_size, &access, 1));
+
+        // add to the pool
+        g_cuda_pool_handles[id].push_back(handle);
+        g_cuda_pool_size[id] += reserve_size;
+
+        printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB) [%s]\n",
+            id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
+            (unsigned long long) (reserve_size/1024/1024), call);
+    }
+
+    GGML_ASSERT(g_cuda_pool_addr[id] != 0);
+
+    void * ptr = (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id]);
+    *actual_size = size;
+    g_cuda_pool_used[id] += size;
+
+#ifdef DEBUG_CUDA_MALLOC
+    printf("cuda pool[%d]: allocated %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr, call);
+#endif
+
+    return ptr;
+
+    GGML_UNUSED(call);
 }
 
+#define ggml_cuda_pool_free(ptr, size) ggml_cuda_pool_free_(ptr, size, #ptr " " #size)
+static void ggml_cuda_pool_free_(void * ptr, size_t size, const char * call) {
+    scoped_spin_lock lock(g_cuda_pool_lock);
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+
+#ifdef DEBUG_CUDA_MALLOC
+    printf("cuda pool[%d]: free %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr, call);
+#endif
+
+    g_cuda_pool_used[id] -= size;
+
+    // all deallocations must be in reverse order of the allocations
+    GGML_ASSERT(ptr == (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id]));
+
+    GGML_UNUSED(call);
+}
+
+#endif
+
 static bool g_cublas_loaded = false;
 
 bool ggml_cublas_loaded(void) {
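The new allocator replaces many small cudaMalloc/cudaFree calls with one virtual address range per device that only grows: physical memory is committed in granularity-sized chunks and mapped at the end of the range, so previously returned pointers never move and the pool cannot fragment. Below is a condensed, standalone sketch of the same driver-API pattern; error checks are elided and the helper name vmm_demo is illustrative only, not code from this commit:

    #include <cuda.h>

    // illustrative sketch of the reserve/commit pattern ggml_cuda_pool_malloc_ builds on
    static void vmm_demo(int device_id, size_t request) {
        CUmemAllocationProp prop = {};
        prop.type          = CU_MEM_ALLOCATION_TYPE_PINNED;
        prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        prop.location.id   = device_id;

        // physical allocations must be a multiple of the device granularity
        size_t gran = 0;
        cuMemGetAllocationGranularity(&gran, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
        size_t chunk = gran * ((request + gran - 1) / gran);

        // 1. reserve a large virtual address range; no memory is committed yet
        CUdeviceptr base = 0;
        cuMemAddressReserve(&base, 1ull << 36, 0, 0, 0);

        // 2. create a physical allocation and map it into the reserved range
        CUmemGenericAllocationHandle handle;
        cuMemCreate(&handle, chunk, &prop, 0);
        cuMemMap(base, chunk, 0, handle, 0);

        // 3. make the mapped range readable and writable from the device
        CUmemAccessDesc access = {};
        access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        access.location.id   = device_id;
        access.flags         = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
        cuMemSetAccess(base, chunk, &access, 1);

        // base .. base + chunk is now usable device memory; growing the pool
        // later maps more handles at base + pool_size, so pointers stay stable

        // teardown, in reverse order (the pool in this commit never shrinks)
        cuMemUnmap(base, chunk);
        cuMemRelease(handle);
        cuMemAddressFree(base, 1ull << 36);
    }

In real code each of these calls would be wrapped in CU_CHECK, as the hunk above does.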
@@ -7437,13 +7560,13 @@ inline void ggml_cuda_op_mul_mat_cublas(
 
         ggml_cuda_pool_free(dst_f16, dst_as);
 
-        if (src0_as != 0) {
-            ggml_cuda_pool_free(src0_as_f16, src0_as);
-        }
-
         if (src1_as != 0) {
             ggml_cuda_pool_free(src1_as_f16, src1_as);
         }
+
+        if (src0_as != 0) {
+            ggml_cuda_pool_free(src0_as_f16, src0_as);
+        }
     }
     else {
         float * src0_ddq_as_f32 = nullptr;
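This hunk, and the three matching hunks below, only reorder the frees: buffers are now released in reverse order of allocation, which the GGML_ASSERT in ggml_cuda_pool_free_ enforces for the new stack-like pool. A hypothetical sketch of the required discipline:

    size_t a_size = 0, b_size = 0;
    void * a = ggml_cuda_pool_malloc(1024, &a_size); // allocated first
    void * b = ggml_cuda_pool_malloc(2048, &b_size); // allocated second
    ggml_cuda_pool_free(b, b_size); // the later allocation must be freed first
    ggml_cuda_pool_free(a, a_size); // the earlier allocation is freed last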
@@ -7800,14 +7923,14 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
         CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
     }
 
-    if (src0_asf > 0) {
-        ggml_cuda_pool_free(src0_ddf, src0_asf);
+    if (dst_asf > 0) {
+        ggml_cuda_pool_free(dst_ddf, dst_asf);
     }
     if (src1_asf > 0) {
         ggml_cuda_pool_free(src1_ddf, src1_asf);
     }
-    if (dst_asf > 0) {
-        ggml_cuda_pool_free(dst_ddf, dst_asf);
+    if (src0_asf > 0) {
+        ggml_cuda_pool_free(src0_ddf, src0_asf);
     }
 
     if (dst->backend == GGML_BACKEND_CPU) {
@@ -8119,17 +8242,17 @@ static void ggml_cuda_op_mul_mat(
         CUDA_CHECK(ggml_cuda_set_device(id));
 
         // free buffers again when done
-        if (src0_as[id] > 0) {
-            ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
-        }
-        if (src1_asf[id] > 0) {
-            ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
+        if (dst_as[id] > 0) {
+            ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
         }
         if (src1_asq[id] > 0) {
             ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
         }
-        if (dst_as[id] > 0) {
-            ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
+        if (src1_asf[id] > 0) {
+            ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
+        }
+        if (src0_as[id] > 0) {
+            ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
         }
     }
 
@@ -8497,12 +8620,12 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
                 cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
-        if (ptrs_src_s != 0) {
-            ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
-        }
         if (ptrs_dst_s != 0) {
             ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
         }
+        if (ptrs_src_s != 0) {
+            ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
+        }
     }
 #endif
 