Skip to content

Commit fed78a3

Browse files
committed
add cache support to ggml runner
1 parent aa5566f commit fed78a3

File tree

3 files changed

+126
-97
lines changed

3 files changed

+126
-97
lines changed

ggml_extend.hpp

Lines changed: 81 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1307,13 +1307,18 @@ struct GGMLRunner {
13071307
ggml_backend_buffer_t runtime_params_buffer = NULL;
13081308
bool params_on_runtime_backend = false;
13091309

1310+
struct ggml_context* cache_ctx = NULL;
1311+
ggml_backend_buffer_t cache_buffer = NULL;
1312+
13101313
struct ggml_context* compute_ctx = NULL;
13111314
struct ggml_gallocr* compute_allocr = NULL;
13121315

13131316
std::vector<float> one_vec = {1.f};
13141317
ggml_tensor* one_tensor = NULL;
13151318

13161319
std::map<struct ggml_tensor*, const void*> backend_tensor_data_map;
1320+
std::map<std::string, struct ggml_tensor*> cache_tensor_map; // name -> tensor
1321+
const std::string final_result_name = "ggml_runner_final_result_tensor";
13171322

13181323
void alloc_params_ctx() {
13191324
struct ggml_init_params params;
@@ -1340,6 +1345,23 @@ struct GGMLRunner {
13401345
}
13411346
}
13421347

1348+
void alloc_cache_ctx() {
1349+
struct ggml_init_params params;
1350+
params.mem_size = static_cast<size_t>(MAX_PARAMS_TENSOR_NUM * ggml_tensor_overhead());
1351+
params.mem_buffer = NULL;
1352+
params.no_alloc = true;
1353+
1354+
cache_ctx = ggml_init(params);
1355+
GGML_ASSERT(cache_ctx != NULL);
1356+
}
1357+
1358+
void free_cache_ctx() {
1359+
if (cache_ctx != NULL) {
1360+
ggml_free(cache_ctx);
1361+
cache_ctx = NULL;
1362+
}
1363+
}
1364+
13431365
void alloc_compute_ctx() {
13441366
struct ggml_init_params params;
13451367
params.mem_size = static_cast<size_t>(ggml_tensor_overhead() * MAX_GRAPH_SIZE + ggml_graph_overhead());
@@ -1370,6 +1392,8 @@ struct GGMLRunner {
13701392
// Build the compute graph via the supplied callback, wrapping it with the
// runner's built-in tensor preparation hooks.
struct ggml_cgraph* get_compute_graph(get_graph_cb_t get_graph) {
    prepare_build_in_tensor_before();
    struct ggml_cgraph* graph = get_graph();
    // Tag the last node with a well-known name so compute() can retrieve
    // the result by name instead of by graph position.
    struct ggml_tensor* final_node = ggml_graph_node(graph, -1);
    ggml_set_name(final_node, final_result_name.c_str());
    prepare_build_in_tensor_after(graph);
    return graph;
}
@@ -1399,7 +1423,43 @@ struct GGMLRunner {
13991423
return true;
14001424
}
14011425

1402-
void cpy_data_to_backend_tensor() {
1426+
void free_cache_buffer() {
1427+
if (cache_buffer != NULL) {
1428+
ggml_backend_buffer_free(cache_buffer);
1429+
cache_buffer = NULL;
1430+
}
1431+
}
1432+
1433+
void copy_cache_tensors_to_cache_buffer() {
1434+
if (cache_tensor_map.size() == 0) {
1435+
return;
1436+
}
1437+
free_cache_ctx_and_buffer();
1438+
alloc_cache_ctx();
1439+
GGML_ASSERT(cache_buffer == NULL);
1440+
std::map<ggml_tensor*, ggml_tensor*> runtime_tensor_to_cache_tensor;
1441+
for (auto kv : cache_tensor_map) {
1442+
auto cache_tensor = ggml_dup_tensor(cache_ctx, kv.second);
1443+
ggml_set_name(cache_tensor, kv.first.c_str());
1444+
runtime_tensor_to_cache_tensor[kv.second] = cache_tensor;
1445+
}
1446+
size_t num_tensors = ggml_tensor_num(cache_ctx);
1447+
cache_buffer = ggml_backend_alloc_ctx_tensors(cache_ctx, runtime_backend);
1448+
GGML_ASSERT(cache_buffer != NULL);
1449+
for (auto kv : runtime_tensor_to_cache_tensor) {
1450+
ggml_backend_tensor_copy(kv.first, kv.second);
1451+
}
1452+
ggml_backend_synchronize(runtime_backend);
1453+
cache_tensor_map.clear();
1454+
size_t cache_buffer_size = ggml_backend_buffer_get_size(cache_buffer);
1455+
LOG_DEBUG("%s cache backend buffer size = % 6.2f MB(%s) (%i tensors)",
1456+
get_desc().c_str(),
1457+
cache_buffer_size / (1024.f * 1024.f),
1458+
ggml_backend_is_cpu(runtime_backend) ? "RAM" : "VRAM",
1459+
num_tensors);
1460+
}
1461+
1462+
void copy_data_to_backend_tensor() {
14031463
for (auto& kv : backend_tensor_data_map) {
14041464
auto tensor = kv.first;
14051465
auto data = kv.second;
@@ -1510,6 +1570,7 @@ struct GGMLRunner {
15101570
if (params_backend != runtime_backend) {
15111571
ggml_backend_free(params_backend);
15121572
}
1573+
free_cache_ctx_and_buffer();
15131574
}
15141575

15151576
void reset_compute_ctx() {
@@ -1549,6 +1610,11 @@ struct GGMLRunner {
15491610
return 0;
15501611
}
15511612

1613+
// Release the whole tensor cache: the backend buffer holding the data,
// then the ggml context holding the tensor metadata. Safe to call when
// nothing is cached (both helpers are NULL-checked).
void free_cache_ctx_and_buffer() {
    free_cache_buffer();
    free_cache_ctx();
}
1617+
15521618
void free_compute_buffer() {
15531619
if (compute_allocr != NULL) {
15541620
ggml_gallocr_free(compute_allocr);
@@ -1579,6 +1645,17 @@ struct GGMLRunner {
15791645
}
15801646
}
15811647

1648+
// Register `tensor` to be persisted under `name` the next time
// copy_cache_tensors_to_cache_buffer() runs (after graph compute).
// Re-registering the same name overwrites the previous entry.
// Pass the key by const reference: the by-value const std::string
// forced a copy of the string on every call.
void cache(const std::string& name, struct ggml_tensor* tensor) {
    cache_tensor_map[name] = tensor;
}
1651+
1652+
struct ggml_tensor* get_cache_tensor_by_name(const std::string& name) {
1653+
if (cache_ctx == NULL) {
1654+
return NULL;
1655+
}
1656+
return ggml_get_tensor(cache_ctx, name.c_str());
1657+
}
1658+
15821659
void compute(get_graph_cb_t get_graph,
15831660
int n_threads,
15841661
bool free_compute_buffer_immediately = true,
@@ -1592,7 +1669,7 @@ struct GGMLRunner {
15921669
reset_compute_ctx();
15931670
struct ggml_cgraph* gf = get_compute_graph(get_graph);
15941671
GGML_ASSERT(ggml_gallocr_alloc_graph(compute_allocr, gf));
1595-
cpy_data_to_backend_tensor();
1672+
copy_data_to_backend_tensor();
15961673
if (ggml_backend_is_cpu(runtime_backend)) {
15971674
ggml_backend_cpu_set_n_threads(runtime_backend, n_threads);
15981675
}
@@ -1601,8 +1678,9 @@ struct GGMLRunner {
16011678
#ifdef GGML_PERF
16021679
ggml_graph_print(gf);
16031680
#endif
1681+
copy_cache_tensors_to_cache_buffer();
16041682
if (output != NULL) {
1605-
auto result = ggml_graph_node(gf, -1);
1683+
auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str());
16061684
if (*output == NULL && output_ctx != NULL) {
16071685
*output = ggml_dup_tensor(output_ctx, result);
16081686
}

stable-diffusion.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2384,7 +2384,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
23842384
std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps + high_noise_sample_steps);
23852385

23862386
struct ggml_init_params params;
2387-
params.mem_size = static_cast<size_t>(100 * 1024) * 1024; // 100 MB
2387+
params.mem_size = static_cast<size_t>(200 * 1024) * 1024; // 200 MB
23882388
params.mem_size += width * height * frames * 3 * sizeof(float) * 2;
23892389
params.mem_buffer = NULL;
23902390
params.no_alloc = false;

0 commit comments

Comments (0)