Commit c9c0c8d

Merge branch 'ggml-org:master' into master
2 parents: bc82723 + 19a5a3e

File tree: 2 files changed (+97, -9)

ggml/src/ggml-cpu/vec.h

Lines changed: 91 additions & 5 deletions
@@ -77,16 +77,85 @@ inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp
         z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i]));
     }
 }
-inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
-inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
-inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; }
+inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) {
+    int i = 0;
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F32_STEP - 1));
+
+    GGML_F32_VEC vv = GGML_F32_VEC_SET1(v);
+
+    for (; i < np; i += GGML_F32_STEP) {
+        for (int j = 0; j < GGML_F32_ARR; ++j) {
+            GGML_F32_VEC ax = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
+            GGML_F32_VEC az = GGML_F32_VEC_ADD(ax, vv);
+            GGML_F32_VEC_STORE(z + i + j*GGML_F32_EPR, az);
+        }
+    }
+#endif
+    for (; i < n; ++i) {
+        z[i] = x[i] + v;
+    }
+}
+inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) {
+    int i = 0;
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F32_STEP - 1));
+
+    for (; i < np; i += GGML_F32_STEP) {
+        for (int j = 0; j < GGML_F32_ARR; ++j) {
+            GGML_F32_VEC ay = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+            GGML_F32_VEC ax = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
+            ay = GGML_F32_VEC_ADD(ay, ax);
+            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay);
+        }
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] += x[i];
+    }
+}
+inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) {
+    int i = 0;
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F32_STEP - 1));
+
+    GGML_F32_VEC vv = GGML_F32_VEC_SET1(v);
+
+    for (; i < np; i += GGML_F32_STEP) {
+        for (int j = 0; j < GGML_F32_ARR; ++j) {
+            GGML_F32_VEC ay = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+            ay = GGML_F32_VEC_ADD(ay, vv);
+            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay);
+        }
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] += v;
+    }
+}
 inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
 inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
         z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) - GGML_CPU_FP16_TO_FP32(y[i]));
     }
 }
-inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
+inline static void ggml_vec_set_f32 (const int n, float * x, const float v) {
+    int i = 0;
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F32_STEP - 1));
+
+    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
+
+    for (; i < np; i += GGML_F32_STEP) {
+        for (int j = 0; j < GGML_F32_ARR; ++j) {
+            GGML_F32_VEC_STORE(x + i + j*GGML_F32_EPR, vx);
+        }
+    }
+#endif
+    for (; i < n; ++i) {
+        x[i] = v;
+    }
+}
 inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
 inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
 inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
@@ -95,7 +164,24 @@ inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp
     }
 }
 
-inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
+inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) {
+    int i = 0;
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F32_STEP - 1));
+
+    for (; i < np; i += GGML_F32_STEP) {
+        for (int j = 0; j < GGML_F32_ARR; ++j) {
+            GGML_F32_VEC ax = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
+            GGML_F32_VEC ay = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+            GGML_F32_VEC az = GGML_F32_VEC_MUL(ax, ay);
+            GGML_F32_VEC_STORE(z + i + j*GGML_F32_EPR, az);
+        }
+    }
+#endif
+    for (; i < n; ++i) {
+        z[i] = x[i]*y[i];
+    }
+}
 inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
     for (int i = 0; i < n; ++i) {
         z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) * GGML_CPU_FP16_TO_FP32(y[i]));
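
All five new SIMD bodies follow the same blocking pattern: np = n & ~(GGML_F32_STEP - 1) rounds n down to a multiple of GGML_F32_STEP (valid because the step is a power of two), the main loop then processes GGML_F32_ARR registers of GGML_F32_EPR floats per iteration, and a scalar loop sweeps the remaining n - np tail elements. The following is a minimal standalone sketch of that pattern written directly against AVX intrinsics; the EPR/ARR/STEP values mirror what the GGML_F32_* macros typically expand to for an AVX build, but they are assumptions here, not values taken from this diff.

// Blocking sketch of ggml_vec_add1_f32 (assumes x86-64 with AVX; compile with -mavx).
// F32_ARR = 4 is an assumed value; the real GGML_F32_ARR depends on the build target.
#include <immintrin.h>
#include <cstdio>

#define F32_EPR  8                    // floats per 256-bit AVX register
#define F32_ARR  4                    // registers handled per outer iteration (assumption)
#define F32_STEP (F32_ARR * F32_EPR)  // 32 floats per outer iteration

static void vec_add1_f32(const int n, float * z, const float * x, const float v) {
    int i = 0;
    const int np = n & ~(F32_STEP - 1);  // largest multiple of F32_STEP <= n
    const __m256 vv = _mm256_set1_ps(v); // broadcast the scalar addend once
    for (; i < np; i += F32_STEP) {
        for (int j = 0; j < F32_ARR; ++j) {
            __m256 ax = _mm256_loadu_ps(x + i + j*F32_EPR);
            _mm256_storeu_ps(z + i + j*F32_EPR, _mm256_add_ps(ax, vv));
        }
    }
    for (; i < n; ++i) { // scalar tail: the n - np leftover elements
        z[i] = x[i] + v;
    }
}

int main() {
    float x[37], z[37];
    for (int i = 0; i < 37; ++i) x[i] = (float) i;
    vec_add1_f32(37, z, x, 1.0f);               // 37 = one full 32-wide step + 5 tail elements
    std::printf("z[0]=%g z[36]=%g\n", z[0], z[36]); // expect 1 and 37
    return 0;
}

Unrolling over F32_ARR independent registers per step gives the compiler separate load/add/store chains to schedule, which is presumably why the macros distinguish EPR (one register) from STEP (one unrolled iteration).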

tests/test-thread-safety.cpp

Lines changed: 6 additions & 4 deletions
@@ -3,6 +3,7 @@
 // - Creates n_parallel (--parallel) contexts per model
 // - Runs inference in parallel on each context
 
+#include <array>
 #include <thread>
 #include <vector>
 #include <atomic>
@@ -38,13 +39,14 @@ int main(int argc, char ** argv) {
     cparams.n_seq_max = 1;
 
     int dev_count = ggml_backend_dev_count();
-    int gpu_dev_count = 0;
+    std::vector<std::array<ggml_backend_dev_t, 2>> gpus;
     for (int i = 0; i < dev_count; ++i) {
         auto * dev = ggml_backend_dev_get(i);
         if (dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
-            gpu_dev_count++;
+            gpus.push_back({dev, nullptr});
         }
     }
+    const int gpu_dev_count = (int)gpus.size();
     const int num_models = gpu_dev_count + 1 + 1; // GPUs + 1 CPU model + 1 layer split
     //const int num_models = std::max(1, gpu_dev_count);
     const int num_contexts = std::max(1, params.n_parallel);
@@ -58,12 +60,12 @@ int main(int argc, char ** argv) {
 
     if (m < gpu_dev_count) {
         mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
-        mparams.main_gpu = m;
+        mparams.devices = gpus[m].data();
    } else if (m == gpu_dev_count) {
         mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
         mparams.main_gpu = -1; // CPU model
     } else {
-        mparams.split_mode = LLAMA_SPLIT_MODE_LAYER;;
+        mparams.split_mode = LLAMA_SPLIT_MODE_LAYER;
     }
 
     llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
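
A note on the std::array<ggml_backend_dev_t, 2> entries: llama_model_params::devices takes a pointer to a NULL-terminated device list (unlike the main_gpu index it replaces here), so each GPU gets a two-slot array whose second slot is the nullptr terminator, and that storage must outlive llama_model_load_from_file. Below is a hedged sketch of the same idiom in isolation; pick_first_gpu and load_on_one_gpu are illustrative helpers, not llama.cpp API.

// Pin a model to one GPU via the NULL-terminated devices list.
// Sketch against llama.cpp's public headers; error handling omitted.
#include <array>
#include "llama.h"
#include "ggml-backend.h"

// Illustrative helper (not llama.cpp API): first GPU device, or nullptr.
static ggml_backend_dev_t pick_first_gpu() {
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
            return dev;
        }
    }
    return nullptr;
}

static llama_model * load_on_one_gpu(const char * path) {
    // The array must stay alive while llama_model_load_from_file runs;
    // the trailing nullptr terminates the list, as in the test above.
    static std::array<ggml_backend_dev_t, 2> devs = { pick_first_gpu(), nullptr };
    llama_model_params mparams = llama_model_default_params();
    mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
    mparams.devices    = devs.data();
    return llama_model_load_from_file(path, mparams);
}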
