@@ -1307,13 +1307,18 @@ struct GGMLRunner {
     ggml_backend_buffer_t runtime_params_buffer = NULL;
     bool params_on_runtime_backend = false;

+    struct ggml_context* cache_ctx = NULL;
+    ggml_backend_buffer_t cache_buffer = NULL;
+
     struct ggml_context* compute_ctx = NULL;
     struct ggml_gallocr* compute_allocr = NULL;

     std::vector<float> one_vec = {1.f};
     ggml_tensor* one_tensor = NULL;

     std::map<struct ggml_tensor*, const void*> backend_tensor_data_map;
+    std::map<std::string, struct ggml_tensor*> cache_tensor_map;  // name -> tensor
+    const std::string final_result_name = "ggml_runner_final_result_tensor";

     void alloc_params_ctx() {
         struct ggml_init_params params;
@@ -1340,6 +1345,23 @@ struct GGMLRunner {
         }
     }

+    void alloc_cache_ctx() {
+        struct ggml_init_params params;
+        params.mem_size = static_cast<size_t>(MAX_PARAMS_TENSOR_NUM * ggml_tensor_overhead());
+        params.mem_buffer = NULL;
+        params.no_alloc = true;
+
+        cache_ctx = ggml_init(params);
+        GGML_ASSERT(cache_ctx != NULL);
+    }
+
+    void free_cache_ctx() {
+        if (cache_ctx != NULL) {
+            ggml_free(cache_ctx);
+            cache_ctx = NULL;
+        }
+    }
+
     void alloc_compute_ctx() {
         struct ggml_init_params params;
         params.mem_size = static_cast<size_t>(ggml_tensor_overhead() * MAX_GRAPH_SIZE + ggml_graph_overhead());
@@ -1370,6 +1392,8 @@ struct GGMLRunner {
     struct ggml_cgraph* get_compute_graph(get_graph_cb_t get_graph) {
         prepare_build_in_tensor_before();
         struct ggml_cgraph* gf = get_graph();
+        auto result = ggml_graph_node(gf, -1);
+        ggml_set_name(result, final_result_name.c_str());
         prepare_build_in_tensor_after(gf);
         return gf;
     }
@@ -1399,7 +1423,43 @@ struct GGMLRunner {
         return true;
     }

-    void cpy_data_to_backend_tensor() {
+    void free_cache_buffer() {
+        if (cache_buffer != NULL) {
+            ggml_backend_buffer_free(cache_buffer);
+            cache_buffer = NULL;
+        }
+    }
+
+    void copy_cache_tensors_to_cache_buffer() {
+        if (cache_tensor_map.size() == 0) {
+            return;
+        }
+        free_cache_ctx_and_buffer();
+        alloc_cache_ctx();
+        GGML_ASSERT(cache_buffer == NULL);
+        std::map<ggml_tensor*, ggml_tensor*> runtime_tensor_to_cache_tensor;
+        for (auto kv : cache_tensor_map) {
+            auto cache_tensor = ggml_dup_tensor(cache_ctx, kv.second);
+            ggml_set_name(cache_tensor, kv.first.c_str());
+            runtime_tensor_to_cache_tensor[kv.second] = cache_tensor;
+        }
+        size_t num_tensors = ggml_tensor_num(cache_ctx);
+        cache_buffer = ggml_backend_alloc_ctx_tensors(cache_ctx, runtime_backend);
+        GGML_ASSERT(cache_buffer != NULL);
+        for (auto kv : runtime_tensor_to_cache_tensor) {
+            ggml_backend_tensor_copy(kv.first, kv.second);
+        }
+        ggml_backend_synchronize(runtime_backend);
+        cache_tensor_map.clear();
+        size_t cache_buffer_size = ggml_backend_buffer_get_size(cache_buffer);
+        LOG_DEBUG("%s cache backend buffer size = %6.2f MB(%s) (%i tensors)",
+                  get_desc().c_str(),
+                  cache_buffer_size / (1024.f * 1024.f),
+                  ggml_backend_is_cpu(runtime_backend) ? "RAM" : "VRAM",
+                  num_tensors);
+    }
+
+    void copy_data_to_backend_tensor() {
         for (auto& kv : backend_tensor_data_map) {
             auto tensor = kv.first;
             auto data = kv.second;
@@ -1510,6 +1570,7 @@ struct GGMLRunner {
         if (params_backend != runtime_backend) {
             ggml_backend_free(params_backend);
         }
+        free_cache_ctx_and_buffer();
     }

     void reset_compute_ctx() {
@@ -1549,6 +1610,11 @@ struct GGMLRunner {
         return 0;
     }

+    void free_cache_ctx_and_buffer() {
+        free_cache_buffer();
+        free_cache_ctx();
+    }
+
     void free_compute_buffer() {
         if (compute_allocr != NULL) {
             ggml_gallocr_free(compute_allocr);
@@ -1579,6 +1645,17 @@ struct GGMLRunner {
         }
     }

+    void cache(const std::string name, struct ggml_tensor* tensor) {
+        cache_tensor_map[name] = tensor;
+    }
+
+    struct ggml_tensor* get_cache_tensor_by_name(const std::string& name) {
+        if (cache_ctx == NULL) {
+            return NULL;
+        }
+        return ggml_get_tensor(cache_ctx, name.c_str());
+    }
+
     void compute(get_graph_cb_t get_graph,
                  int n_threads,
                  bool free_compute_buffer_immediately = true,
@@ -1592,7 +1669,7 @@ struct GGMLRunner {
         reset_compute_ctx();
         struct ggml_cgraph* gf = get_compute_graph(get_graph);
         GGML_ASSERT(ggml_gallocr_alloc_graph(compute_allocr, gf));
-        cpy_data_to_backend_tensor();
+        copy_data_to_backend_tensor();
         if (ggml_backend_is_cpu(runtime_backend)) {
             ggml_backend_cpu_set_n_threads(runtime_backend, n_threads);
         }
@@ -1601,8 +1678,9 @@ struct GGMLRunner {
 #ifdef GGML_PERF
         ggml_graph_print(gf);
 #endif
+        copy_cache_tensors_to_cache_buffer();
         if (output != NULL) {
-            auto result = ggml_graph_node(gf, -1);
+            auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str());
             if (*output == NULL && output_ctx != NULL) {
                 *output = ggml_dup_tensor(output_ctx, result);
             }
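For readers skimming the diff, a rough usage sketch (not part of this commit) of how a GGMLRunner subclass might pair the new cache() and get_cache_tensor_by_name() calls inside its graph builder. The tensor name "residual", the expensive_block() helper, and the build_graph() signature are invented for illustration; only the cache API itself comes from the patch.

```cpp
// Hypothetical sketch, assuming build_graph() is called from within
// GGMLRunner::compute() on each step. Names marked "hypothetical" are
// not from the commit.
struct ggml_cgraph* build_graph(struct ggml_tensor* x) {
    struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);

    // Look for a value cached by a previous compute() call.
    struct ggml_tensor* prev = get_cache_tensor_by_name("residual");  // hypothetical name
    struct ggml_tensor* out;
    if (prev != NULL) {
        // Cache hit: the tensor is backed by cache_buffer on the runtime
        // backend, so it can feed the new graph directly.
        out = ggml_add(compute_ctx, x, prev);
    } else {
        out = expensive_block(compute_ctx, x);  // hypothetical helper
        // Ask the runner to copy this node into cache_buffer once
        // ggml_backend_graph_compute() has finished
        // (see copy_cache_tensors_to_cache_buffer() above).
        cache("residual", out);
    }

    ggml_build_forward_expand(gf, out);
    return gf;
}
```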