Skip to content

Commit 0521a3a

Browse files
authored
Merge pull request #24476 from fengyuentau:attention_layer
dnn: add attention layer #24476

Resolves #24609

Merge with: opencv/opencv_extra#1128.

Attention operator spec from onnxruntime: https://github.com/microsoft/onnxruntime/blob/v1.16.1/docs/ContribOperators.md#com.microsoft.Attention.

TODO:

- [x] Benchmark (before this PR vs. with this PR vs. ORT).
- [x] Layer fusion: take care of Slice with end=INT64_MAX.
- [x] Layer fusion: match more potential attention (VIT) patterns.
- [x] Single-head attention is supported.
- [x] Test AttentionSubgraph fusion.
- [x] Add acc tests for VIT_B_32 and VitTrack.
- [x] Add perf tests for VIT_B_32 and VitTrack.

## Benchmarks

Platform: MacBook Air M1.

### Attention Subgraph

Input scale: [1, 197, 768].

| | mean (ms) | median (ms) | min (ms) |
| ---------------------- | --------- | ----------- | -------- |
| w/ Attention (this PR) | 3.75 | 3.68 | 3.22 |
| w/o Attention | 9.06 | 9.01 | 8.24 |
| ORT (python) | 4.32 | 2.63 | 2.50 |

### ViTs

All data in milliseconds (ms).

| ViTs | With Attention | Without Attention | ORT |
| -------- | -------------- | ----------------- | ------ |
| vit_b_16 | 302.77 | 365.35 | 109.70 |
| vit_b_32 | 89.92 | 116.22 | 30.36 |
| vit_l_16 | 1593.32 | 1730.74 | 419.92 |
| vit_l_32 | 468.11 | 577.41 | 134.12 |
| VitTrack | 3.80 | 3.87 | 2.25 |

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV.
- [x] The PR is proposed to the proper branch.
- [x] There is a reference to the original bug report and related work.
- [x] There is an accuracy test, performance test and test data in the opencv_extra repository, if applicable. The patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake.
1 parent e64c5dc commit 0521a3a

File tree

13 files changed

+891
-66
lines changed

13 files changed

+891
-66
lines changed

modules/dnn/include/opencv2/dnn/all_layers.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1178,6 +1178,11 @@ CV__DNN_INLINE_NS_BEGIN
11781178
static Ptr<InstanceNormLayer> create(const LayerParams &params);
11791179
};
11801180

1181+
/** @brief Attention layer.
 *
 * Per the commit description, the operator follows the onnxruntime
 * com.microsoft.Attention contrib-operator spec.
 */
class CV_EXPORTS AttentionLayer : public Layer {
1182+
public:
1183+
/** @brief Factory: returns an AttentionLayer built from @p params. */
static Ptr<AttentionLayer> create(const LayerParams &params);
1184+
};
1185+
11811186
//! @}
11821187
//! @}
11831188
CV__DNN_INLINE_NS_END

modules/dnn/perf/perf_layer.cpp

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -739,6 +739,62 @@ PERF_TEST_P_(Layer_InstanceNorm, InstanceNorm)
739739
test_layer({N, C, H, W});
740740
}
741741

742+
// Perf fixture for the "Attention" layer, parameterized by (Backend, Target).
struct Layer_Attention : public TestBaseWithParam<tuple<Backend, Target>> {
743+
// Builds a one-layer net around an "Attention" layer fed with random x / weight / bias
// inputs and times net.forward().
//   x_shape          - shape of the input x; element [2] is used as the input hidden size,
//                      so at least three dimensions are required.
//   qkv_hidden_sizes - hidden sizes; only elements [0] and [2] are read here.
//   num_heads        - forwarded to the layer as the "num_heads" parameter.
void test_layer(const std::vector<int> x_shape, const std::vector<int> qkv_hidden_sizes, const int num_heads) {
744+
int backendId = get<0>(GetParam());
745+
int targetId = get<1>(GetParam());
746+
747+
// qkv_hidden_sizes[1] is never read: element [0] is counted twice below
// (presumably Q and K are expected to share a hidden size - TODO confirm).
auto qk_hidden_size = qkv_hidden_sizes[0];
748+
auto v_hidden_size = qkv_hidden_sizes[2];
749+
750+
auto input_hidden_size = x_shape[2];
751+
// Total width of the weight/bias: two qk-sized parts plus one v-sized part.
auto hidden_size = qk_hidden_size + qk_hidden_size + v_hidden_size;
752+
753+
Mat x(x_shape, CV_32F);
754+
Mat weight(std::vector<int>{input_hidden_size, hidden_size}, CV_32F);
755+
Mat bias(std::vector<int>{hidden_size}, CV_32F);
756+
757+
// Fill all three inputs with uniform random values in [0, 1).
randu(x, 0.f, 1.f);
758+
randu(weight, 0.f, 1.f);
759+
randu(bias, 0.f, 1.f);
760+
761+
LayerParams lp;
762+
lp.type = "Attention";
763+
lp.name = "testLayer";
764+
lp.set("num_heads", num_heads);
765+
lp.set("qkv_hidden_sizes", DictValue::arrayInt(qkv_hidden_sizes.data(), qkv_hidden_sizes.size()));
766+
767+
Net net;
768+
int id = net.addLayerToPrev(lp.name, lp.type, lp);
769+
// Wire outputs 0..2 of layer 0 (presumably the net's input layer - TODO confirm)
// to inputs 0..2 of the attention layer: x, weight, bias.
net.connect(0, 0, id, 0);
770+
net.connect(0, 1, id, 1);
771+
net.connect(0, 2, id, 2);
772+
773+
// Setup plus one forward pass outside the timed loop (presumably a warm-up).
{
774+
std::vector<std::string> input_names{"x", "weight", "bias"};
775+
net.setInputsNames(input_names);
776+
net.setInput(x, input_names[0]);
777+
net.setInput(weight, input_names[1]);
778+
net.setInput(bias, input_names[2]);
779+
780+
net.setPreferableBackend(backendId);
781+
net.setPreferableTarget(targetId);
782+
Mat out = net.forward();
783+
}
784+
785+
// Timed section: one forward pass per cycle.
TEST_CYCLE()
786+
{
787+
Mat out = net.forward();
788+
}
789+
790+
// The output values are not validated here.
SANITY_CHECK_NOTHING();
791+
}
792+
};
793+
794+
// Attention perf case with input shape [1, 197, 768], qkv hidden sizes of 768 each and 12 heads.
PERF_TEST_P_(Layer_Attention, VisionTransformer) {
795+
test_layer({1, 197, 768}, {768, 768, 768}, 12);
796+
}
797+
742798
INSTANTIATE_TEST_CASE_P(/**/, Layer_Slice, dnnBackendsAndTargets(false, false));
743799
INSTANTIATE_TEST_CASE_P(/**/, Layer_NaryEltwise, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
744800
#ifdef HAVE_CUDA
@@ -750,6 +806,7 @@ INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNorm, testing::Values(std::make_tuple(D
750806
INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNormExpanded, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
751807
INSTANTIATE_TEST_CASE_P(/**/, Layer_GatherElements, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
752808
INSTANTIATE_TEST_CASE_P(/**/, Layer_InstanceNorm, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
809+
INSTANTIATE_TEST_CASE_P(/**/, Layer_Attention, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
753810

754811

755812
typedef TestBaseWithParam<tuple<Vec4i, int, bool, tuple<Backend, Target> > > Layer_FullyConnected;

modules/dnn/perf/perf_net.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,6 @@ class DNNTestNetwork : public ::perf::TestBaseWithParam< tuple<Backend, Target>
9393
}
9494
};
9595

96-
9796
PERF_TEST_P_(DNNTestNetwork, AlexNet)
9897
{
9998
processNet("dnn/bvlc_alexnet.caffemodel", "dnn/bvlc_alexnet.prototxt",
@@ -391,17 +390,16 @@ PERF_TEST_P_(DNNTestNetwork, CRNN) {
391390
processNet("", "dnn/text_recognition_CRNN_EN_2021sep.onnx", "", inp);
392391
}
393392

394-
PERF_TEST_P_(DNNTestNetwork, ViTTrack) {
393+
PERF_TEST_P_(DNNTestNetwork, VitTrack) {
395394
Mat inp1(cv::Size(128, 128), CV_32FC3);
396395
Mat inp2(cv::Size(256, 256), CV_32FC3);
397396
randu(inp1, 0.0f, 1.0f);
398397
randu(inp2, 0.0f, 1.0f);
399398
inp1 = blobFromImage(inp1, 1.0, Size(), Scalar(), false);
400399
inp2 = blobFromImage(inp2, 1.0, Size(), Scalar(), false);
401-
processNet("", "dnn/onnx/models/vitTracker.onnx", "", {std::make_tuple(inp1, "template"), std::make_tuple(inp2, "search")});
400+
processNet("", "dnn/onnx/models/object_tracking_vittrack_2023sep.onnx", "", {std::make_tuple(inp1, "template"), std::make_tuple(inp2, "search")});
402401
}
403402

404-
405403
PERF_TEST_P_(DNNTestNetwork, EfficientDet_int8)
406404
{
407405
if (target != DNN_TARGET_CPU || (backend != DNN_BACKEND_OPENCV &&
@@ -413,6 +411,10 @@ PERF_TEST_P_(DNNTestNetwork, EfficientDet_int8)
413411
processNet("", "dnn/tflite/coco_efficientdet_lite0_v1_1.0_quant_2021_09_06.tflite", "", inp);
414412
}
415413

414+
// Whole-network perf case for the vit_b_32 ONNX model; cv::Size(224, 224) is passed as the input argument.
PERF_TEST_P_(DNNTestNetwork, VIT_B_32) {
415+
processNet("", "dnn/onnx/models/vit_b_32.onnx", "", cv::Size(224, 224));
416+
}
417+
416418
INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, dnnBackendsAndTargets());
417419

418420
} // namespace

modules/dnn/src/init.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,7 @@ void initializeLayerFactory()
162162
CV_DNN_REGISTER_LAYER_CLASS(LayerNormalization, LayerNormLayer);
163163
CV_DNN_REGISTER_LAYER_CLASS(Expand, ExpandLayer);
164164
CV_DNN_REGISTER_LAYER_CLASS(InstanceNormalization, InstanceNormLayer);
165+
CV_DNN_REGISTER_LAYER_CLASS(Attention, AttentionLayer);
165166

166167
CV_DNN_REGISTER_LAYER_CLASS(Crop, CropLayer);
167168
CV_DNN_REGISTER_LAYER_CLASS(Eltwise, EltwiseLayer);

0 commit comments

Comments
 (0)