@@ -57,47 +57,67 @@ class PyQnnManager {
5757 qnn_executorch_option_ptr_.cast <std::string_view>().data ());
5858
5959 // merge multiple qcirs into one context with multiple graphs
60- std::vector<flatbuffers::Offset<qcir::Graph>> graphs;
60+
61+ // this makes it easier to do subtraction for offsets
62+ std::vector<uint32_t > offsets (1 , 0 );
63+ std::vector<const flatbuffers::Vector64<uint8_t >*> tensor_data;
64+ fb_opt_.max_size = FLATBUFFERS_MAX_64_BUFFER_SIZE;
6165 for (size_t i = 0 ; i < qcirs.size (); ++i) {
6266 py::buffer_info info (py::buffer (qcirs[i].cast <py::bytes>()).request ());
6367 flatbuffers::Verifier verifier_binary_info (
6468 static_cast <const uint8_t * const >(info.ptr ),
65- info.size * info.itemsize );
69+ info.size * info.itemsize ,
70+ fb_opt_);
6671 if (!qnn_delegate::VerifyBinaryInfoBuffer (verifier_binary_info)) {
6772 QNN_EXECUTORCH_LOG_ERROR (" Fail to verify binary info" );
6873 return ;
6974 }
7075 auto binary_info = qnn_delegate::GetBinaryInfo (info.ptr );
76+ tensor_data.push_back (binary_info->tensor_data ());
7177
7278 flatbuffers::Verifier verifier_qcir (
73- binary_info->data ()->data (), binary_info->data ()->size ());
79+ binary_info->context_data ()->Data (),
80+ binary_info->context_data ()->size ());
7481 if (!qcir::VerifyContextBuffer (verifier_qcir)) {
7582 QNN_EXECUTORCH_LOG_ERROR (" Fail to verify qcir format" );
7683 return ;
7784 }
78- auto context = qcir::GetContext (binary_info->data ()->data ());
85+ offsets.push_back (offsets.back () + binary_info->tensor_data ()->size ());
86+ }
87+
88+ std::vector<flatbuffers::Offset<qcir::Graph>> graphs;
89+ for (size_t i = 0 ; i < qcirs.size (); ++i) {
90+ py::buffer_info info (py::buffer (qcirs[i].cast <py::bytes>()).request ());
91+ auto binary_info = qnn_delegate::GetBinaryInfo (info.ptr );
92+ auto context = qcir::GetContext (binary_info->context_data ()->Data ());
7993 for (const auto & graph : *context->graphs ()) {
8094 std::vector<flatbuffers::Offset<qcir::Tensor>> tensors;
8195 for (const auto tensor : *graph->tensors ()) {
8296 // here we need to take a detour to merge multiple qcir flatbuffers
8397 // outer ToTensor
8498 // return: flatbuffers::Offset<Tensor>
85- // consume: QnnTensor, flatbuffers::FlatBufferBuilder*
99+ // consume: QnnTensor, data_offset, flatbuffers::FlatBufferBuilder*
86100 // inner ToTensor
87101 // return: QnnTensor
88- // consume: flatbuffers::Vector<::flatbuffers::Offset<qcir::Tensor>>
89- tensors.emplace_back (ToTensor (ToTensor (tensor), &builder_));
102+ // consume:
103+ // flatbuffers::Vector<::flatbuffers::Offset<qcir::Tensor>>,
104+ // data_ptr
105+ tensors.emplace_back (ToTensor (
106+ ToTensor (tensor, nullptr ),
107+ offsets[i] + tensor->offset (),
108+ &builder_));
90109 }
91110 std::vector<flatbuffers::Offset<qcir::Operator>> nodes;
92111 for (const auto & node : *graph->nodes ()) {
93- int32_t * inputs_ptr = const_cast <int32_t *>(node->inputs ()->data ());
94- int32_t * outputs_ptr = const_cast <int32_t *>(node->outputs ()->data ());
95- int32_t * params_ptr = const_cast <int32_t *>(node->params ()->data ());
96- std::vector<int32_t > inputs (
112+ uint32_t * inputs_ptr = const_cast <uint32_t *>(node->inputs ()->data ());
113+ uint32_t * outputs_ptr =
114+ const_cast <uint32_t *>(node->outputs ()->data ());
115+ uint32_t * params_ptr = const_cast <uint32_t *>(node->params ()->data ());
116+ std::vector<uint32_t > inputs (
97117 inputs_ptr, inputs_ptr + node->inputs ()->size ());
98- std::vector<int32_t > outputs (
118+ std::vector<uint32_t > outputs (
99119 outputs_ptr, outputs_ptr + node->outputs ()->size ());
100- std::vector<int32_t > params (
120+ std::vector<uint32_t > params (
101121 params_ptr, params_ptr + node->params ()->size ());
102122 nodes.emplace_back (qcir::CreateOperatorDirect (
103123 builder_,
@@ -118,7 +138,7 @@ class PyQnnManager {
118138 QnnExecuTorchContextBinary qcir_bin (
119139 {builder_.GetBufferPointer (), builder_.GetSize ()});
120140
121- qnn_executorch_context_binary_ = MakeBinaryInfo (qcir_bin);
141+ qnn_executorch_context_binary_ = MakeBinaryInfo (qcir_bin, tensor_data );
122142 qnn_manager_ = std::make_shared<QnnManager>(
123143 qnn_executorch_options, qnn_executorch_context_binary_);
124144 }
@@ -157,26 +177,37 @@ class PyQnnManager {
157177
158178 if (qnn_manager_->IsOnlinePrepare () || qnn_manager_->IsMultipleGraphs ()) {
159179 builder_.Reset ();
160- std::vector<flatbuffers::Offset<qcir::Tensor>> tensors;
180+ std::vector<uint8_t > tensor_data;
181+ std::vector<uint64_t > offsets;
161182 std::unordered_map<void *, int > tensor_map;
183+ std::vector<flatbuffers::Offset<qcir::Tensor>> fb_tensors;
184+ std::vector<flatbuffers::Offset<qcir::Operator>> fb_ops;
162185
163186 auto set_tensor = [&](const std::shared_ptr<TensorWrapper>& wrapper,
164- std::vector<int >& index) {
187+ std::vector<uint32_t >& index) {
165188 auto it = tensor_map.find (wrapper.get ());
166189 if (it != tensor_map.end ()) {
167190 index.push_back (it->second );
168191 } else {
169- int i = tensors.size ();
170- tensor_map[wrapper.get ()] = i;
171- index.push_back (i);
172- tensors.emplace_back (
173- ToTensor (wrapper->CloneTensorStruct (), &builder_));
192+ tensor_map[wrapper.get ()] = fb_tensors.size ();
193+ index.push_back (fb_tensors.size ());
194+ offsets.push_back (tensor_data.size ());
195+ Qnn_Tensor_t qnn_tensor = wrapper->CloneTensorStruct ();
196+ fb_tensors.emplace_back (
197+ ToTensor (qnn_tensor, offsets.back (), &builder_));
198+ uint8_t * data_ptr =
199+ static_cast <uint8_t *>(QNN_VER_PTR (qnn_tensor)->clientBuf .data );
200+ if (data_ptr != nullptr ) {
201+ tensor_data.insert (
202+ tensor_data.end (),
203+ data_ptr,
204+ data_ptr + QNN_VER_PTR (qnn_tensor)->clientBuf .dataSize );
205+ }
174206 }
175207 };
176208
177- std::vector<flatbuffers::Offset<qcir::Operator>> operators;
178209 for (std::shared_ptr<OpWrapper>& op_wrapper : op_wrappers) {
179- std::vector<int > inputs, outputs, params;
210+ std::vector<uint32_t > inputs, outputs, params;
180211
181212 for (const auto & tensor_wrapper : op_wrapper->GetInputTensors ()) {
182213 set_tensor (tensor_wrapper, inputs);
@@ -207,13 +238,22 @@ class PyQnnManager {
207238 static_cast <void *>(&p.scalarParam .uint8Value );
208239 QNN_VER_PTR (t)->clientBuf .dataSize =
209240 GetDataTypeSize (QNN_VER_PTR (t)->dataType );
210- params.push_back (tensors.size ());
211- tensors.emplace_back (ToTensor (t, &builder_));
241+
242+ // collect tensor data
243+ offsets.push_back (tensor_data.size ());
244+ const uint8_t * data_ptr =
245+ static_cast <uint8_t *>(QNN_VER_PTR (t)->clientBuf .data );
246+ tensor_data.insert (
247+ tensor_data.end (),
248+ data_ptr,
249+ data_ptr + QNN_VER_PTR (t)->clientBuf .dataSize );
250+ params.push_back (fb_tensors.size ());
251+ fb_tensors.emplace_back (ToTensor (t, offsets.back (), &builder_));
212252 }
213253 }
214254
215255 Qnn_OpConfig_t op_config = op_wrapper->GetOpConfig ();
216- operators .emplace_back (qcir::CreateOperatorDirect (
256+ fb_ops .emplace_back (qcir::CreateOperatorDirect (
217257 builder_,
218258 QNN_VER_PTR (op_config)->name ,
219259 QNN_VER_PTR (op_config)->packageName ,
@@ -222,14 +262,16 @@ class PyQnnManager {
222262 &outputs,
223263 ¶ms));
224264 }
225- auto graph = qcir::CreateGraphDirect (
226- builder_, graph_name.c_str (), &operators, &tensors);
227- std::vector<flatbuffers::Offset<qcir::Graph>> graphs ({graph});
228- auto context = qcir::CreateContextDirect (builder_, &graphs);
265+
266+ std::vector<flatbuffers::Offset<qcir::Graph>> fb_graphs (
267+ {qcir::CreateGraphDirect (
268+ builder_, graph_name.c_str (), &fb_ops, &fb_tensors)});
269+ auto context = qcir::CreateContextDirect (builder_, &fb_graphs);
229270 builder_.Finish (context);
271+
230272 QnnExecuTorchContextBinary qcir_binary (
231273 {builder_.GetBufferPointer (), builder_.GetSize ()});
232- binary_info = MakeBinaryInfo (qcir_binary);
274+ binary_info = MakeBinaryInfo (qcir_binary, tensor_data );
233275 } else {
234276 if (qnn_manager_->Compile (graph_name, op_wrappers) !=
235277 executorch::runtime::Error::Ok) {
@@ -300,38 +342,97 @@ class PyQnnManager {
300342 py::buffer_info info (py::buffer (ctx_bin).request ());
301343 QnnExecuTorchContextBinary binary (
302344 {info.ptr , static_cast <uint64_t >(info.size * info.itemsize )});
303- auto binary_info = MakeBinaryInfo (binary);
345+ std::vector<uint8_t > tensor_data;
346+ auto binary_info = MakeBinaryInfo (binary, tensor_data);
304347 auto result = py::array_t <char >(binary_info.nbytes );
305348 auto result_buffer = result.request ();
306349 std::memcpy (result_buffer.ptr , binary_info.buffer , binary_info.nbytes );
307350 return result;
308351 }
309352
310353 private:
// Builds the signature string that MakeBinaryInfo stores in the binary info
// ("for cache reuse in runtime"): the current tick count of
// std::chrono::high_resolution_clock, measured from that clock's epoch and
// rendered as a decimal string.
// NOTE(review): which clock high_resolution_clock aliases differs between
// standard libraries — confirm the value is unique enough for the cache.
std::string signature() {
  const auto ticks =
      std::chrono::high_resolution_clock::now().time_since_epoch().count();
  return std::to_string(ticks);
}
358+
311359 QnnExecuTorchContextBinary MakeBinaryInfo (
312- const QnnExecuTorchContextBinary& ctx_bin) {
313- auto signature = []() {
314- return std::to_string (
315- std::chrono::high_resolution_clock::now ().time_since_epoch ().count ());
316- };
317- const uint8_t * base = static_cast <uint8_t *>(ctx_bin.buffer );
318- std::vector<uint8_t > data (base, base + ctx_bin.nbytes );
360+ const QnnExecuTorchContextBinary& ctx_bin,
361+ const std::vector<const flatbuffers::Vector64<uint8_t >*>& tensor_data) {
362+ // the build order matters, 64 bit data is required to be shipped first
363+ // add context data
364+ builder64_.Reset ();
365+ auto offset_context = builder64_.CreateVector <
366+ uint8_t ,
367+ flatbuffers::Offset64,
368+ flatbuffers::Vector64>(
369+ static_cast <const uint8_t *>(ctx_bin.buffer ), ctx_bin.nbytes );
370+ // add tensor data
371+ // this is a little bit tricky but have smallest memory footprint in AoT
372+ size_t buffer_size = 0 ;
373+ for (auto & td : tensor_data) {
374+ buffer_size += td->size ();
375+ }
376+ builder64_.StartVector <
377+ uint8_t ,
378+ flatbuffers::Offset64,
379+ flatbuffers::Vector64<uint8_t >::size_type>(buffer_size);
380+ for (int i = tensor_data.size () - 1 ; i >= 0 ; --i) {
381+ builder64_.PushBytes (tensor_data[i]->Data (), tensor_data[i]->size ());
382+ }
383+ auto offset_tensor = flatbuffers::Offset64<flatbuffers::Vector64<uint8_t >>(
384+ builder64_.EndVector <
385+ flatbuffers::Vector64<uint8_t >::size_type,
386+ flatbuffers::Offset64<flatbuffers::Vector64<uint8_t >>::offset_type>(
387+ buffer_size));
319388 // add signature to binary for cache reuse in runtime
320- builder_.Reset ();
321- auto binary_info = qnn_delegate::CreateBinaryInfoDirect (
322- builder_, signature ().c_str (), &data);
323- builder_.Finish (binary_info);
389+ auto offset_signature = builder64_.CreateString (signature ().c_str ());
390+ // build binary info
391+ auto binary_info = qnn_delegate::CreateBinaryInfo (
392+ builder64_, offset_signature, offset_context, offset_tensor);
393+ builder64_.Finish (binary_info);
324394
325395 return QnnExecuTorchContextBinary (
326- {builder_.GetBufferPointer (), builder_.GetSize ()});
396+ {builder64_.GetBufferPointer (), builder64_.GetSize ()});
397+ }
398+
399+ QnnExecuTorchContextBinary MakeBinaryInfo (
400+ const QnnExecuTorchContextBinary& ctx_bin,
401+ const std::vector<uint8_t >& tensor_data) {
402+ // the build order matters, 64 bit data is required to be shipped first
403+ // add context data
404+ builder64_.Reset ();
405+
406+ auto offset_context = builder64_.CreateVector <
407+ uint8_t ,
408+ flatbuffers::Offset64,
409+ flatbuffers::Vector64>(
410+ static_cast <const uint8_t *>(ctx_bin.buffer ), ctx_bin.nbytes );
411+ // add tensor data
412+ auto offset_tensor = builder64_.CreateVector <
413+ uint8_t ,
414+ flatbuffers::Offset64,
415+ flatbuffers::Vector64>(
416+ static_cast <const uint8_t *>(tensor_data.data ()), tensor_data.size ());
417+ // add signature to binary for cache reuse in runtime
418+ auto offset_signature = builder64_.CreateString (signature ().c_str ());
419+ // build binary info
420+ auto binary_info = qnn_delegate::CreateBinaryInfo (
421+ builder64_, offset_signature, offset_context, offset_tensor);
422+ builder64_.Finish (binary_info);
423+
424+ return QnnExecuTorchContextBinary (
425+ {builder64_.GetBufferPointer (), builder64_.GetSize ()});
327426 }
328427
329428 // Store the bytes object instead of a raw pointer so that this module will
330429 // keep the bytes alive.
331430 const py::bytes qnn_executorch_option_ptr_;
332431 QnnExecuTorchContextBinary qnn_executorch_context_binary_;
333432 std::shared_ptr<QnnManager> qnn_manager_;
433+ flatbuffers::FlatBufferBuilder64 builder64_;
334434 flatbuffers::FlatBufferBuilder builder_;
435+ flatbuffers::Verifier::Options fb_opt_;
335436};
336437} // namespace qnn
337438} // namespace backends
0 commit comments