@@ -194,6 +194,15 @@ static __global__ void add_f32(const float * x, const float * y, float * dst, co
     dst[i] = x[i] + y[i];
 }
 
+static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] + __float2half(y[i]);
+}
+
 static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
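The new kernel assigns one thread per element, bounds-checks against k, and converts the f32 addend to half before the add, so mixed f16/f32 inputs produce an f16 result. A quick way to sanity-check it outside the tree is a standalone harness like the sketch below; the kernel body is copied verbatim, while the 256-thread block size, test sizes, and build flags are assumptions (native half arithmetic needs compute capability 5.3 or newer, e.g. `nvcc -arch=sm_60 add_test.cu`).

// --- Standalone sanity-check sketch (not part of the diff) ---------------------
// Copies the new kernel so it can be built on its own; 256 stands in for
// CUDA_ADD_BLOCK_SIZE (an assumption).
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdio>
#include <vector>

static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }
    dst[i] = x[i] + __float2half(y[i]);
}

int main() {
    const int k          = 1000;
    const int block_size = 256; // assumed value of CUDA_ADD_BLOCK_SIZE

    std::vector<half>  hx(k);
    std::vector<float> hy(k);
    for (int i = 0; i < k; ++i) {
        hx[i] = __float2half(0.50f*i);
        hy[i] = 0.25f*i;
    }

    half * dx; float * dy; half * ddst;
    cudaMalloc(&dx,   k*sizeof(half));
    cudaMalloc(&dy,   k*sizeof(float));
    cudaMalloc(&ddst, k*sizeof(half));
    cudaMemcpy(dx, hx.data(), k*sizeof(half),  cudaMemcpyHostToDevice);
    cudaMemcpy(dy, hy.data(), k*sizeof(float), cudaMemcpyHostToDevice);

    // Same ceiling division as the launcher added further down in this diff.
    const int num_blocks = (k + block_size - 1) / block_size;
    add_f16_f32_f16<<<num_blocks, block_size>>>(dx, dy, ddst, k);
    cudaDeviceSynchronize();

    std::vector<half> hdst(k);
    cudaMemcpy(hdst.data(), ddst, k*sizeof(half), cudaMemcpyDeviceToHost);
    printf("dst[10] = %f (expected 7.5)\n", __half2float(hdst[10])); // 5.0 + 2.5

    cudaFree(dx); cudaFree(dy); cudaFree(ddst);
    return 0;
}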
@@ -1209,6 +1218,11 @@ static void add_f32_cuda(const float * x, const float * y, float * dst, const in
     add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
 }
 
+static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+    add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+}
+
 static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
     const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
     mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
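Like the other launchers, the new one rounds the grid size up with a ceiling division so that a partially filled final block is still launched; the kernel's bounds check then idles the surplus threads. A worked example, assuming CUDA_ADD_BLOCK_SIZE is 256:

// Grid-size arithmetic with concrete numbers; 256 is an assumed value for
// CUDA_ADD_BLOCK_SIZE, which is defined elsewhere in ggml-cuda.cu.
#include <cassert>

int main() {
    const int CUDA_ADD_BLOCK_SIZE = 256; // assumption for illustration
    const int k = 1000;                  // elements to add

    // Ceiling division: round up so the last, partially filled block is launched.
    const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;

    assert(num_blocks == 4);             // 4*256 = 1024 threads; the `i >= k` check idles the extra 24
    return 0;
}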
@@ -1675,15 +1689,21 @@ inline void ggml_cuda_op_add(
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
     cudaStream_t & cudaStream_main){
 
-    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
     GGML_ASSERT(dst_ddf_i != nullptr);
 
     const int64_t ne0 = src0->ne[0];
     const int64_t i01_diff = i01_high - i01_low;
 
     // compute
-    add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
+    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne0*i01_diff, cudaStream_main);
+    } else {
+        GGML_ASSERT(false);
+    }
     CUDA_CHECK(cudaGetLastError());
 
     (void) src1;
@@ -2281,8 +2301,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
 }
 
 void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true, true);
+    // ggml_cuda_add permits f16 dst even though this could in theory cause problems with the pointer arithmetic in ggml_cuda_op.
+    // Due to flatten_rows == true this does not make a difference in practice, however.
+    // A better solution would be nice but right now that would require disproportionate changes.
+    GGML_ASSERT(
+        (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
+        src1->type == GGML_TYPE_F32 &&
+        (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16));
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, false, true);
 }
 
 void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
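Taken together with the dispatch above, the accepted combinations are f32 + f32 -> f32 (the old path) and f16 + f32 -> f16 (the new one); anything else still trips GGML_ASSERT(false) inside ggml_cuda_op_add. A small sketch mirroring that decision, using a hypothetical enum and helper that are not part of ggml:

// --- Dispatch summary sketch (hypothetical helper, not part of ggml) -----------
#include <cstdio>

enum class Type { F32, F16 };

static const char * pick_add_launcher(Type src0, Type src1, Type dst) {
    if (src1 != Type::F32) {
        return nullptr;                       // src1 must always be f32
    }
    if (src0 == Type::F32 && dst == Type::F32) {
        return "add_f32_cuda";                // original path
    }
    if (src0 == Type::F16 && dst == Type::F16) {
        return "add_f16_f32_f16_cuda";        // new mixed-precision path
    }
    return nullptr;                           // real code hits GGML_ASSERT(false)
}

int main() {
    printf("%s\n", pick_add_launcher(Type::F32, Type::F32, Type::F32)); // add_f32_cuda
    printf("%s\n", pick_add_launcher(Type::F16, Type::F32, Type::F16)); // add_f16_f32_f16_cuda
    return 0;
}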
@@ -2535,7 +2561,7 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
     delete extra;
 }
 
-void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
+void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
     if (scratch && g_scratch_size == 0) {
         return;
     }
@@ -2544,22 +2570,23 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
     if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
         const ggml_op src0_op = tensor->src0->op;
         if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
-            ggml_cuda_assign_buffers_impl(tensor->src0, scratch);
+            ggml_cuda_assign_buffers_impl(tensor->src0, scratch, force_inplace);
         }
     }
     if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
-        ggml_cuda_assign_buffers_impl(tensor->src1, scratch);
+        ggml_cuda_assign_buffers_impl(tensor->src1, scratch, force_inplace);
     }
 
     tensor->backend = GGML_BACKEND_GPU;
     struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
 
     const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
-        tensor->op == GGML_OP_VIEW;
+        tensor->op == GGML_OP_VIEW ||
+        force_inplace;
     const size_t size = ggml_nbytes(tensor);
 
     CUDA_CHECK(cudaSetDevice(g_main_device));
-    if (inplace && tensor->src0->backend == GGML_BACKEND_GPU) {
+    if (inplace && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) {
        struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
        char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
        size_t offset = 0;
@@ -2598,11 +2625,15 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
 }
 
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, true);
+    ggml_cuda_assign_buffers_impl(tensor, true, false);
 }
 
 void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, false);
+    ggml_cuda_assign_buffers_impl(tensor, false, false);
+}
+
+void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
+    ggml_cuda_assign_buffers_impl(tensor, false, true);
 }
 
 void ggml_cuda_set_main_device(int main_device) {
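The new entry point is for tensors whose result should alias an existing device buffer (via the GGML_BACKEND_GPU/GPU_SPLIT branch above) instead of receiving its own scratch allocation. A hypothetical usage sketch, not taken from llama.cpp, assuming a persistent f16 state tensor that has already been placed on the GPU with one of the ggml_cuda_assign_buffers* calls:

// --- Hypothetical usage sketch (not taken from llama.cpp) ----------------------
// The result of an op reuses its input's VRAM buffer rather than getting a
// separate allocation; names and the scenario are assumptions for illustration.
#include "ggml.h"
#include "ggml-cuda.h"

static struct ggml_tensor * accumulate_on_gpu(struct ggml_context * ctx,
                                              struct ggml_tensor  * state,   // f16, resident on the GPU
                                              struct ggml_tensor  * delta) { // f32, same shape
    // f16 + f32 -> f16, the combination enabled by this diff.
    struct ggml_tensor * updated = ggml_add(ctx, state, delta);

    // force_inplace makes `updated` alias state's device buffer, so the
    // addition effectively happens in place in VRAM.
    ggml_cuda_assign_buffers_force_inplace(updated);

    return updated; // caller expands this node into the graph
}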