
Commit 0f91b54

hacky and bad but it works
1 parent 7487137 commit 0f91b54

3 files changed: +65 -14 lines changed

examples/common.cpp

Lines changed: 0 additions & 7 deletions

@@ -414,13 +414,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         exit(1);
     }
 
-#ifdef GGML_USE_CUBLAS
-    if (!params.lora_adapter.empty() && params.n_gpu_layers > 0) {
-        fprintf(stderr, "%s: error: the simultaneous use of LoRAs and GPU acceleration is not supported", __func__);
-        exit(1);
-    }
-#endif // GGML_USE_CUBLAS
-
     if (escape_prompt) {
         process_escapes(params.prompt);
     }
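
Removing this guard is what re-enables using a LoRA adapter together with n_gpu_layers > 0; the GPU-side support that makes the removal safe is added in ggml-cuda.cu below.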

ggml-cuda.cu

Lines changed: 36 additions & 6 deletions

@@ -1,5 +1,6 @@
 #include <cstddef>
 #include <cstdint>
+#include <cstring>
 #include <limits>
 #include <stdint.h>
 #include <stdio.h>
@@ -194,6 +195,15 @@ static __global__ void add_f32(const float * x, const float * y, float * dst, co
     dst[i] = x[i] + y[i];
 }
 
+static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] + __float2half(y[i]);
+}
+
 static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -1209,6 +1219,11 @@ static void add_f32_cuda(const float * x, const float * y, float * dst, const in
     add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
 }
 
+static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+    add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+}
+
 static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
     const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
     mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
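
As a minimal sketch of how the new kernel and wrapper behave, the following standalone harness is not part of the commit; it assumes CUDA_ADD_BLOCK_SIZE is 256 and duplicates the kernel so it compiles on its own:

// standalone_add_f16_test.cu -- hypothetical harness, not part of the patch.
// Assumes CUDA_ADD_BLOCK_SIZE == 256; the wrapper's launch math is the same
// either way (ceiling division of k by the block size).
#include <cuda_fp16.h>
#include <cstdio>

#define CUDA_ADD_BLOCK_SIZE 256

static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return; // guard: k need not be a multiple of the block size
    }
    dst[i] = x[i] + __float2half(y[i]);
}

int main() {
    const int k = 1000; // deliberately not a multiple of 256
    half  * x;
    float * y;
    cudaMallocManaged((void **) &x, k*sizeof(half));
    cudaMallocManaged((void **) &y, k*sizeof(float));
    for (int i = 0; i < k; ++i) {
        x[i] = __float2half(1.0f);
        y[i] = 0.5f;
    }
    // same ceiling-division launch as add_f16_f32_f16_cuda; dst aliases x,
    // mirroring the in-place F16 path asserted in ggml_cuda_op_add
    const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
    add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, 0>>>(x, y, x, k);
    cudaDeviceSynchronize();
    printf("x[0] = %.3f, x[%d] = %.3f (expect 1.500)\n",
           __half2float(x[0]), k - 1, __half2float(x[k - 1]));
    cudaFree(x);
    cudaFree(y);
    return 0;
}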
@@ -1675,15 +1690,26 @@ inline void ggml_cuda_op_add(
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
     cudaStream_t & cudaStream_main){
 
-    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
     GGML_ASSERT(dst_ddf_i != nullptr);
 
     const int64_t ne0 = src0->ne[0];
     const int64_t i01_diff = i01_high - i01_low;
 
     // compute
-    add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
+    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) src0->extra;
+        // ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) src1->extra;
+        ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu * ) dst->extra;
+        GGML_ASSERT(src0_extra->data_device[g_main_device] == dst_extra->data_device[g_main_device]);
+        GGML_ASSERT(src0_ddq_i == (char *) dst_ddf_i);
+        add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne0*i01_diff, cudaStream_main);
+    } else {
+        GGML_ASSERT(false);
+    }
     CUDA_CHECK(cudaGetLastError());
 
     (void) src1;
@@ -2281,8 +2307,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
 }
 
 void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true, true);
+    GGML_ASSERT(
+        (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
+        src1->type == GGML_TYPE_F32 &&
+        (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16));
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, false, true);
 }
 
 void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
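
The flipped boolean in the ggml_cuda_op call (true → false) appears to disable the automatic src0 → F32 conversion, which is why the new F16 branch in ggml_cuda_op_add reads src0 through the raw src0_ddq_i pointer instead of src0_ddf_i.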
@@ -2555,11 +2584,12 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
     struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
 
     const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
-        tensor->op == GGML_OP_VIEW;
+        tensor->op == GGML_OP_VIEW ||
+        strcmp(tensor->name, "r_add_inplace") == 0;
     const size_t size = ggml_nbytes(tensor);
 
     CUDA_CHECK(cudaSetDevice(g_main_device));
-    if (inplace && tensor->src0->backend == GGML_BACKEND_GPU) {
+    if (inplace && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) {
         struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t offset = 0;
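
The strcmp against the literal tensor name "r_add_inplace" is presumably the "hacky" part the commit message owns up to: ggml_cuda_assign_buffers_impl otherwise has no way to tell that the LoRA add built in llama.cpp (which names its in-place result "r_add_inplace", see below) must reuse its src0 buffer rather than receive a fresh device allocation. The new <cstring> include at the top of the file exists for this strcmp.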

llama.cpp

Lines changed: 29 additions & 1 deletion

@@ -2907,14 +2907,15 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
                 return false;
             }
         }
-        ggml_tensor* lora_tensor;
+        ggml_tensor * lora_tensor;
         if (n_dims == 2) {
             lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
         }
         else {
             fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
             return 1;
         }
+        ggml_set_name(lora_tensor, "lora_tensor");
 
         // load tensor data
         size_t offset = fin.tellg();
@@ -2930,6 +2931,15 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
             lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
 
             ggml_tensor * dest_t = model_tensors[base_name];
+
+            offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+            if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
+                offload_func = ggml_cuda_assign_buffers;
+            }
+#endif // GGML_USE_CUBLAS
+
             ggml_tensor * base_t;
             if (model_loader) {
                 // load from base model
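
The effect: when the weight being patched already lives on the GPU (either backend), every node of the LoRA graph built below is routed through ggml_cuda_assign_buffers and gets a device buffer; in CPU-only builds offload_func stays llama_nop and behavior is unchanged.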
@@ -2957,7 +2967,12 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
             }
 
             ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
+            GGML_ASSERT(loraA->type == GGML_TYPE_F32);
+            ggml_set_name(loraA, "loraA");
+
             ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
+            GGML_ASSERT(loraB->type == GGML_TYPE_F32);
+            ggml_set_name(loraB, "loraB");
 
             if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
                 fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
@@ -2967,19 +2982,32 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 
             // w = w + BA*s
             ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+            offload_func(BA);
+            ggml_set_name(BA, "BA");
 
             if (scaling != 1.0f) {
                 ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
+                ggml_set_name(scale_tensor, "scale_tensor");
+
                 BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
+                offload_func(BA);
+                ggml_set_name(BA, "BA_scaled");
             }
 
             ggml_tensor * r;
             if (base_t == dest_t) {
                 r = ggml_add_inplace(lora_ctx, dest_t, BA);
+                ggml_set_name(r, "r_add_inplace");
+                offload_func(r);
             }
             else {
                 r = ggml_add(lora_ctx, base_t, BA);
+                offload_func(r);
+                ggml_set_name(r, "r_add");
+
                 r = ggml_cpy(lora_ctx, r, dest_t);
+                offload_func(r);
+                ggml_set_name(r, "r_cpy");
             }
 
             struct ggml_cgraph gf = ggml_build_forward(r);
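
For context, the graph assembled here is the usual LoRA merge, w = w + scaling * (BA), built as ggml operations so that offload_func can keep the whole chain on the GPU. When the destination weight is an offloaded F16 tensor while BA is computed in F32, the final add hits exactly the src0 F16 / src1 F32 / dst F16 case that the new add_f16_f32_f16 kernel in ggml-cuda.cu was added to handle.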
