 #include <cstddef>
 #include <cstdint>
+#include <cstring>
 #include <limits>
 #include <stdint.h>
 #include <stdio.h>
@@ -194,6 +195,15 @@ static __global__ void add_f32(const float * x, const float * y, float * dst, co
     dst[i] = x[i] + y[i];
 }

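+// adds an f32 tensor to an f16 tensor element-wise, writing an f16 result (one thread per element)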
+static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] + __float2half(y[i]);
+}
+
 static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -1209,6 +1219,11 @@ static void add_f32_cuda(const float * x, const float * y, float * dst, const in
     add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
 }

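+// host-side launcher: rounds k up to whole blocks of CUDA_ADD_BLOCK_SIZE threads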
+static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+    add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+}
+
 static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
     const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
     mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
@@ -1675,15 +1690,26 @@ inline void ggml_cuda_op_add(
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
     cudaStream_t & cudaStream_main){

-    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
     GGML_ASSERT(dst_ddf_i != nullptr);

     const int64_t ne0 = src0->ne[0];
     const int64_t i01_diff = i01_high - i01_low;

     // compute
-    add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
+    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
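+        // f16 path: the add must be in place, i.e. src0 and dst alias the same device buffer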
+        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+        // ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+        ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+        GGML_ASSERT(src0_extra->data_device[g_main_device] == dst_extra->data_device[g_main_device]);
+        GGML_ASSERT(src0_ddq_i == (char *) dst_ddf_i);
+        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne0*i01_diff, cudaStream_main);
+    } else {
+        GGML_ASSERT(false);
+    }
     CUDA_CHECK(cudaGetLastError());

     (void) src1;
@@ -2281,8 +2307,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
 }

 void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true, true);
+    GGML_ASSERT(
+        (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
+        src1->type == GGML_TYPE_F32 &&
+        (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16));
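+    // note: src0 is passed to ggml_cuda_op_add without conversion to f32, so f16 data arrives as src0_ddq_i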
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, false, true);
 }

 void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -2555,11 +2584,12 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
     struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;

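+    // a tensor is treated as in-place if it shares data with src0, is a view, or is explicitly named "r_add_inplace"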
     const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
-        tensor->op == GGML_OP_VIEW;
+        tensor->op == GGML_OP_VIEW ||
+        strcmp(tensor->name, "r_add_inplace") == 0;
     const size_t size = ggml_nbytes(tensor);

     CUDA_CHECK(cudaSetDevice(g_main_device));
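+    // also reuse src0's device buffer when src0 uses the split (multi-GPU) backend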
-    if (inplace && tensor->src0->backend == GGML_BACKEND_GPU) {
+    if (inplace && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) {
         struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) tensor->src0->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t offset = 0;
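
As a reading aid only (not part of the commit), here is a minimal standalone sketch of how a kernel with the same shape as add_f16_f32_f16 can be exercised in isolation. The demo kernel, the BLOCK_SIZE constant, and the host harness are illustrative stand-ins rather than ggml code, and the half arithmetic assumes a CUDA toolkit and target architecture where the cuda_fp16.h operators are available (sm_53 or newer).

// add_f16_f32_f16_demo.cu - illustrative stand-in, independent of ggml-cuda.cu
#include <cstdio>
#include <vector>
#include <cuda_fp16.h>

#define BLOCK_SIZE 256 // stand-in for CUDA_ADD_BLOCK_SIZE

// same shape as add_f16_f32_f16 above: one thread per element, bounds-checked
static __global__ void add_f16_f32_f16_demo(const half * x, const float * y, half * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = x[i] + __float2half(y[i]);
}

int main() {
    const int k = 1024;
    std::vector<half>  hx(k, __float2half(1.0f));
    std::vector<float> hy(k, 0.5f);

    // device buffers (error checking omitted for brevity)
    half * dx; float * dy; half * ddst;
    cudaMalloc((void **) &dx,   k*sizeof(half));
    cudaMalloc((void **) &dy,   k*sizeof(float));
    cudaMalloc((void **) &ddst, k*sizeof(half));
    cudaMemcpy(dx, hx.data(), k*sizeof(half),  cudaMemcpyHostToDevice);
    cudaMemcpy(dy, hy.data(), k*sizeof(float), cudaMemcpyHostToDevice);

    // same launch arithmetic as add_f16_f32_f16_cuda: round k up to whole blocks
    const int num_blocks = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
    add_f16_f32_f16_demo<<<num_blocks, BLOCK_SIZE>>>(dx, dy, ddst, k);
    cudaDeviceSynchronize();

    std::vector<half> hdst(k);
    cudaMemcpy(hdst.data(), ddst, k*sizeof(half), cudaMemcpyDeviceToHost);
    printf("dst[0] = %f (expected 1.5)\n", __half2float(hdst[0]));

    cudaFree(dx); cudaFree(dy); cudaFree(ddst);
    return 0;
}

Built on its own (for example with nvcc -arch=sm_70 add_f16_f32_f16_demo.cu), the sketch prints 1.5 per element, the same per-element result the in-place f16 add path above produces.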