 #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
 #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
 #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
-#define cudaDeviceGetMemPool hipDeviceGetMemPool
-#define cudaMemPoolAttrReleaseThreshold hipMemPoolAttrReleaseThreshold
-#define cudaMemPoolSetAttribute hipMemPoolSetAttribute
-#define cudaMemPool_t hipMemPool_t
 #define cudaDeviceProp hipDeviceProp_t
 #define cudaDeviceSynchronize hipDeviceSynchronize
 #define cudaError_t hipError_t
 #define cudaEvent_t hipEvent_t
 #define cudaEventDestroy hipEventDestroy
 #define cudaFree hipFree
-#define cudaFreeAsync hipFreeAsync
 #define cudaFreeHost hipHostFree
 #define cudaGetDevice hipGetDevice
 #define cudaGetDeviceCount hipGetDeviceCount
 #define cudaGetDeviceProperties hipGetDeviceProperties
 #define cudaGetErrorString hipGetErrorString
 #define cudaGetLastError hipGetLastError
 #define cudaMalloc hipMalloc
-#define cudaMallocFromPoolAsync hipMallocFromPoolAsync
 #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
 #define cudaMemcpy hipMemcpy
 #define cudaMemcpy2DAsync hipMemcpy2DAsync
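For context: this block is the hipify shim. When the ROCm build flag is set, every cudaX symbol resolves to its hipX equivalent, so the rest of the file can be written against the CUDA API alone. A minimal self-contained sketch of the same pattern, assuming the GGML_USE_HIPBLAS flag this file keys off:

#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
#define cudaError_t hipError_t
#define cudaSuccess hipSuccess
#define cudaMalloc  hipMalloc
#define cudaFree    hipFree
#else
#include <cuda_runtime.h>
#endif

// allocates and frees 16 floats on whichever runtime the build selected
int main() {
    float * d = nullptr;
    if (cudaMalloc((void **) &d, 16 * sizeof(float)) != cudaSuccess) {
        return 1;
    }
    return cudaFree(d) == cudaSuccess ? 0 : 1;
}

The four pool-related aliases (cudaDeviceGetMemPool, cudaMemPoolAttrReleaseThreshold, cudaMemPoolSetAttribute, cudaMemPool_t) and the async pair (cudaFreeAsync, cudaMallocFromPoolAsync) are dropped here because nothing references them once the async pool path is removed below.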
@@ -187,11 +181,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
     do {                                                                                \
         cudaError_t err_ = (err);                                                       \
         if (err_ != cudaSuccess) {                                                      \
-            int dev_id;                                                                 \
-            cudaGetDevice(&dev_id);                                                     \
+            int id;                                                                     \
+            cudaGetDevice(&id);                                                         \
             fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
                 cudaGetErrorString(err_));                                              \
-            fprintf(stderr, "current device: %d\n", dev_id);                            \
+            fprintf(stderr, "current device: %d\n", id);                                \
             exit(1);                                                                    \
         }                                                                               \
     } while (0)
@@ -201,11 +195,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
     do {                                                                        \
         cublasStatus_t err_ = (err);                                            \
         if (err_ != CUBLAS_STATUS_SUCCESS) {                                    \
-            int dev_id;                                                         \
-            cudaGetDevice(&dev_id);                                             \
+            int id;                                                             \
+            cudaGetDevice(&id);                                                 \
             fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n",                 \
                     err_, __FILE__, __LINE__, cublasGetStatusString(err_));     \
-            fprintf(stderr, "current device: %d\n", dev_id);                    \
+            fprintf(stderr, "current device: %d\n", id);                        \
             exit(1);                                                            \
         }                                                                       \
     } while (0)
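Both macros use the do { ... } while (0) idiom so an invocation behaves as a single statement even inside an unbraced if/else; the dev_id-to-id rename in these hunks is purely cosmetic. A minimal usage sketch (the helper name is hypothetical):

// wrap every runtime call so a failure reports file, line, and the
// device that was current when the error surfaced, then aborts
static float * device_alloc_f32(int device, size_t n) {
    float * ptr = nullptr;
    CUDA_CHECK(cudaSetDevice(device));
    CUDA_CHECK(cudaMalloc((void **) &ptr, n * sizeof(float)));
    return ptr;
}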
@@ -471,7 +465,6 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
 
 #define MAX_STREAMS 8
 static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
-static cudaMemPool_t g_cudaMemPools[GGML_CUDA_MAX_DEVICES] = { nullptr };
 
 struct ggml_tensor_extra_gpu {
     void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
@@ -5780,16 +5773,6 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     return ptr;
 }
 
-static void * ggml_cuda_pool_malloc_async(size_t size, size_t * actual_size, int id, cudaStream_t stream) {
-    if (g_cudaMemPools[id] == nullptr) {
-        return ggml_cuda_pool_malloc(size, actual_size);
-    }
-    void * ptr;
-    CUDA_CHECK(cudaMallocFromPoolAsync(&ptr, size, g_cudaMemPools[id], stream));
-    *actual_size = size;
-    return ptr;
-}
-
 static void ggml_cuda_pool_free(void * ptr, size_t size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
@@ -5808,13 +5791,6 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
 }
 
 
-static void ggml_cuda_pool_free_async(void * ptr, size_t actual_size, int id, cudaStream_t stream) {
-    if (g_cudaMemPools[id] == nullptr) {
-        return ggml_cuda_pool_free(ptr, actual_size);
-    }
-    CUDA_CHECK(cudaFreeAsync(ptr, stream));
-}
-
 void ggml_init_cublas() {
     static bool initialized = false;
 
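The two helpers deleted above wrapped CUDA's stream-ordered allocator (cudaMallocFromPoolAsync/cudaFreeAsync, available since CUDA 11.2), which orders allocation and deallocation on a stream instead of blocking the device. For reference, a sketch of that pattern, assuming a valid cudaMemPool_t; the real helpers additionally fell back to the synchronous pool when no pool was available:

// orders the allocation on `stream`; the memory is usable by work
// enqueued on that stream after this point
static void * pool_malloc_async(cudaMemPool_t pool, size_t size, cudaStream_t stream) {
    void * ptr = nullptr;
    CUDA_CHECK(cudaMallocFromPoolAsync(&ptr, size, pool, stream));
    return ptr;
}

// orders the free on `stream`; the block returns to its pool once
// prior work on the stream completes
static void pool_free_async(void * ptr, cudaStream_t stream) {
    CUDA_CHECK(cudaFreeAsync(ptr, stream));
}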
@@ -5869,13 +5845,6 @@ void ggml_init_cublas() {
         // create cublas handle
         CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
         CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH));
-
-        // configure memory pool
-        cudaError_t err = cudaDeviceGetMemPool(&g_cudaMemPools[id], id);
-        if (err == cudaSuccess) {
-            size_t treshold = UINT64_MAX;
-            CUDA_CHECK(cudaMemPoolSetAttribute(g_cudaMemPools[id], cudaMemPoolAttrReleaseThreshold, &treshold));
-        }
     }
 
     // configure logging to stdout
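The setup deleted above fetched each device's default memory pool and raised its release threshold to UINT64_MAX, which keeps freed blocks cached in the pool instead of returning them to the OS at synchronization points. A standalone sketch of that configuration (device_id is a hypothetical stand-in for the loop variable id):

cudaMemPool_t pool = nullptr;
if (cudaDeviceGetMemPool(&pool, device_id) == cudaSuccess) {
    // never trim the pool back to the OS on synchronization
    uint64_t threshold = UINT64_MAX;
    CUDA_CHECK(cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold));
}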
@@ -6469,7 +6438,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
             const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
             GGML_ASSERT(to_fp16_cuda != nullptr);
             size_t ne = row_diff*ne00;
-            src0_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &src0_as, id, stream);
+            src0_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src0_as);
             to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
         }
         const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16;
@@ -6480,12 +6449,13 @@ inline void ggml_cuda_op_mul_mat_cublas(
             const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
             GGML_ASSERT(to_fp16_cuda != nullptr);
             size_t ne = src1_ncols*ne10;
-            src1_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &src1_as, id, stream);
+            src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
             to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
         }
         const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddq_i : src1_as_f16;
-        size_t dst_f16_as = 0;
-        half * dst_f16 = (half *) ggml_cuda_pool_malloc_async(row_diff*src1_ncols * sizeof(half), &dst_f16_as, id, stream);
+
+        size_t dst_as = 0;
+        half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
 
         const half alpha_f16 = 1.0f;
         const half beta_f16 = 0.0f;
@@ -6503,15 +6473,14 @@ inline void ggml_cuda_op_mul_mat_cublas(
         const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
         to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream);
 
-        if (dst_f16_as != 0) {
-            ggml_cuda_pool_free_async(dst_f16, dst_f16_as, id, stream);
-        }
+        ggml_cuda_pool_free(dst_f16, dst_as);
 
         if (src0_as != 0) {
-            ggml_cuda_pool_free_async(src0_as_f16, src0_as, id, stream);
+            ggml_cuda_pool_free(src0_as_f16, src0_as);
         }
+
         if (src1_as != 0) {
-            ggml_cuda_pool_free_async(src1_as_f16, src1_as, id, stream);
+            ggml_cuda_pool_free(src1_as_f16, src1_as);
         }
     }
     else {
@@ -6521,7 +6490,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
         if (src0->type != GGML_TYPE_F32) {
             const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
             GGML_ASSERT(to_fp32_cuda != nullptr);
-            src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc_async(row_diff*ne00 * sizeof(float), &src0_as, id, stream); // NOLINT
+            src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
             to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
         }
         const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
@@ -6538,7 +6507,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
                 &beta, dst_dd_i, ldc));
 
         if (src0_as != 0) {
-            ggml_cuda_pool_free_async(src0_ddq_as_f32, src0_as, id, stream);
+            ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
         }
     }
 
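These hunks only swap the allocator; the compute path is unchanged: both operands are converted to fp16, multiplied with cublasGemmEx entirely in half precision, and the result is converted back to fp32. A minimal sketch of the core call, with handle, m/n/k, the leading dimensions, and the device buffers A, B, C as hypothetical stand-ins for the tensor fields used above:

// C = alpha * op(A) * op(B) + beta * C, all operands and math in fp16
const half alpha_f16 = 1.0f;
const half beta_f16  = 0.0f;
CUBLAS_CHECK(
    cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                 m, n, k,
                 &alpha_f16, A, CUDA_R_16F, lda,
                             B, CUDA_R_16F, ldb,
                 &beta_f16,  C, CUDA_R_16F, ldc,
                 CUBLAS_COMPUTE_16F,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));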
@@ -6961,30 +6930,29 @@ static void ggml_cuda_op_mul_mat(
             src0_dd[id] = (char *) src0_extra->data_device[id];
         } else {
             const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
-            src0_dd[id] = (char *) ggml_cuda_pool_malloc_async(ggml_nbytes(src0), &src0_as[id], id, stream);
+            src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
         }
 
         if (src1_on_device && src1_is_contiguous) {
             src1_ddf[id] = (float *) src1_extra->data_device[id];
         } else {
-            src1_ddf[id] = (float *) ggml_cuda_pool_malloc_async(ggml_nbytes(src1), &src1_asf[id], id, stream);
+            src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]);
         }
 
         if (convert_src1_to_q8_1) {
-            const size_t size_dst_ddq = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs;
-            src1_ddq[id] = (char *) ggml_cuda_pool_malloc_async(size_dst_ddq, &src1_asq[id], id, stream);
+            src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
 
             if (src1_on_device && src1_is_contiguous) {
                 quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
-                // CUDA_CHECK(cudaGetLastError());
+                CUDA_CHECK(cudaGetLastError());
             }
         }
 
         if (dst_on_device) {
             dst_dd[id] = (float *) dst_extra->data_device[id];
         } else {
             const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst);
-            dst_dd[id] = (float *) ggml_cuda_pool_malloc_async(size_dst_ddf, &dst_as[id], id, stream);
+            dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]);
         }
     }
 
@@ -7110,6 +7078,24 @@ static void ggml_cuda_op_mul_mat(
         }
     }
 
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        CUDA_CHECK(ggml_cuda_set_device(id));
+
+        // free buffers again when done
+        if (src0_as[id] > 0) {
+            ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
+        }
+        if (src1_asf[id] > 0) {
+            ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
+        }
+        if (src1_asq[id] > 0) {
+            ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
+        }
+        if (dst_as[id] > 0) {
+            ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
+        }
+    }
+
     // main device waits for all other devices to be finished
     if (split && g_device_count > 1) {
         int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
@@ -7127,21 +7113,6 @@ static void ggml_cuda_op_mul_mat(
         CUDA_CHECK(ggml_cuda_set_device(g_main_device));
         CUDA_CHECK(cudaDeviceSynchronize());
     }
-
-    for (int64_t id = 0; id < g_device_count; ++id) {
-        if (src0_as[id] > 0) {
-            ggml_cuda_pool_free_async(src0_dd[id], src0_as[id], id, g_cudaStreams[id][0]);
-        }
-        if (src1_asf[id] > 0) {
-            ggml_cuda_pool_free_async(src1_ddf[id], src1_asf[id], id, g_cudaStreams[id][0]);
-        }
-        if (src1_asq[id] > 0) {
-            ggml_cuda_pool_free_async(src1_ddq[id], src1_asq[id], id, g_cudaStreams[id][0]);
-        }
-        if (dst_as[id] > 0) {
-            ggml_cuda_pool_free_async(dst_dd[id], dst_as[id], id, g_cudaStreams[id][0]);
-        }
-    }
 }
 
 static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -7328,11 +7299,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     GGML_ASSERT(to_fp16_cuda != nullptr);
 
     size_t src1_as = 0;
-    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne1 * sizeof(half), &src1_as, id, main_stream);
+    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
     to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
 
     size_t dst_as = 0;
-    half * dst_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &dst_as, id, main_stream);
+    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
 
     GGML_ASSERT(ne12 % ne02 == 0);
     GGML_ASSERT(ne13 % ne03 == 0);
@@ -7386,8 +7357,8 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
         size_t ptrs_src_s = 0;
         size_t ptrs_dst_s = 0;
 
-        ptrs_src = (const void **) ggml_cuda_pool_malloc_async(2*ne23*sizeof(void *), &ptrs_src_s, id, main_stream);
-        ptrs_dst = (      void **) ggml_cuda_pool_malloc_async(1*ne23*sizeof(void *), &ptrs_dst_s, id, main_stream);
+        ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
+        ptrs_dst = (      void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
 
         dim3 block_dims(ne13, ne12);
         k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
@@ -7400,6 +7371,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
             dst->nb[2], dst->nb[3],
             r2, r3);
         CUDA_CHECK(cudaGetLastError());
+
         CUBLAS_CHECK(
         cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
             ne01, ne11, ne10,
@@ -7411,22 +7383,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
             CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
         if (ptrs_src_s != 0) {
-            ggml_cuda_pool_free_async(ptrs_src, ptrs_src_s, id, main_stream);
+            ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
         }
         if (ptrs_dst_s != 0) {
-            ggml_cuda_pool_free_async(ptrs_dst, ptrs_dst_s, id, main_stream);
+            ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
         }
     }
 #endif
 
     const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
     to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
-    if (src1_as != 0) {
-        ggml_cuda_pool_free_async(src1_as_f16, src1_as, id, main_stream);
-    }
-    if (dst_as != 0) {
-        ggml_cuda_pool_free_async(dst_f16, dst_as, id, main_stream);
-    }
+
+    ggml_cuda_pool_free(src1_as_f16, src1_as);
+    ggml_cuda_pool_free(dst_f16, dst_as);
 }
 
 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
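The non-contiguous batched path above works in two steps: a small kernel (k_compute_batched_ptrs) writes one A/B/C pointer per batch element into device-side arrays, and a single cublasGemmBatchedEx call then consumes those arrays. A simplified sketch of the pointer-filling step, using uniform strides instead of the broadcast-aware indexing the real kernel performs (all names hypothetical):

// ptrs_src[0 .. ne23)      = per-batch A pointers
// ptrs_src[ne23 .. 2*ne23) = per-batch B pointers
// ptrs_dst[0 .. ne23)      = per-batch C pointers
static __global__ void k_fill_batch_ptrs(
        const half * a, const half * b, half * c,
        const void ** ptrs_src, void ** ptrs_dst,
        size_t stride_a, size_t stride_b, size_t stride_c, int ne23) {
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i >= ne23) {
        return;
    }
    ptrs_src[i]        = a + i*stride_a;
    ptrs_src[ne23 + i] = b + i*stride_b;
    ptrs_dst[i]        = c + i*stride_c;
}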