1010
1111#include " webrtc/modules/audio_processing/aec3/adaptive_fir_filter.h"
1212
13+ #if defined(WEBRTC_HAS_NEON)
14+ #include < arm_neon.h>
15+ #endif
1316#include " webrtc/typedefs.h"
1417#if defined(WEBRTC_ARCH_X86_FAMILY)
1518#include < emmintrin.h>
@@ -52,6 +55,26 @@ void UpdateFrequencyResponse(
5255 }
5356}
5457
58+ #if defined(WEBRTC_HAS_NEON)
59+ // Computes and stores the frequency response of the filter.
60+ void UpdateFrequencyResponse_NEON (
61+ rtc::ArrayView<const FftData> H,
62+ std::vector<std::array<float , kFftLengthBy2Plus1 >>* H2) {
63+ RTC_DCHECK_EQ (H.size (), H2->size ());
64+ for (size_t k = 0 ; k < H.size (); ++k) {
65+ for (size_t j = 0 ; j < kFftLengthBy2 ; j += 4 ) {
66+ const float32x4_t re = vld1q_f32 (&H[k].re [j]);
67+ const float32x4_t im = vld1q_f32 (&H[k].im [j]);
68+ float32x4_t H2_k_j = vmulq_f32 (re, re);
69+ H2_k_j = vmlaq_f32 (H2_k_j, im, im);
70+ vst1q_f32 (&(*H2)[k][j], H2_k_j);
71+ }
72+ (*H2)[k][kFftLengthBy2 ] = H[k].re [kFftLengthBy2 ] * H[k].re [kFftLengthBy2 ] +
73+ H[k].im [kFftLengthBy2 ] * H[k].im [kFftLengthBy2 ];
74+ }
75+ }
76+ #endif
77+
5578#if defined(WEBRTC_ARCH_X86_FAMILY)
5679// Computes and stores the frequency response of the filter.
5780void UpdateFrequencyResponse_SSE2 (
@@ -85,6 +108,25 @@ void UpdateErlEstimator(
85108 }
86109}
87110
111+ #if defined(WEBRTC_HAS_NEON)
112+ // Computes and stores the echo return loss estimate of the filter, which is the
113+ // sum of the partition frequency responses.
114+ void UpdateErlEstimator_NEON (
115+ const std::vector<std::array<float , kFftLengthBy2Plus1 >>& H2,
116+ std::array<float , kFftLengthBy2Plus1 >* erl) {
117+ erl->fill (0 .f );
118+ for (auto & H2_j : H2) {
119+ for (size_t k = 0 ; k < kFftLengthBy2 ; k += 4 ) {
120+ const float32x4_t H2_j_k = vld1q_f32 (&H2_j[k]);
121+ float32x4_t erl_k = vld1q_f32 (&(*erl)[k]);
122+ erl_k = vaddq_f32 (erl_k, H2_j_k);
123+ vst1q_f32 (&(*erl)[k], erl_k);
124+ }
125+ (*erl)[kFftLengthBy2 ] += H2_j[kFftLengthBy2 ];
126+ }
127+ }
128+ #endif
129+
88130#if defined(WEBRTC_ARCH_X86_FAMILY)
89131// Computes and stores the echo return loss estimate of the filter, which is the
90132// sum of the partition frequency responses.
@@ -121,6 +163,63 @@ void AdaptPartitions(const RenderBuffer& render_buffer,
121163 }
122164}
123165
166+ #if defined(WEBRTC_HAS_NEON)
167+ // Adapts the filter partitions. (NEON variant)
168+ void AdaptPartitions_NEON (const RenderBuffer& render_buffer,
169+ const FftData& G,
170+ rtc::ArrayView<FftData> H) {
171+ rtc::ArrayView<const FftData> render_buffer_data = render_buffer.Buffer ();
172+ const int lim1 =
173+ std::min (render_buffer_data.size () - render_buffer.Position (), H.size ());
174+ const int lim2 = H.size ();
175+ constexpr int kNumFourBinBands = kFftLengthBy2 / 4 ;
176+ FftData* H_j = &H[0 ];
177+ const FftData* X = &render_buffer_data[render_buffer.Position ()];
178+ int limit = lim1;
179+ int j = 0 ;
180+ do {
181+ for (; j < limit; ++j, ++H_j, ++X) {
182+ for (int k = 0 , n = 0 ; n < kNumFourBinBands ; ++n, k += 4 ) {
183+ const float32x4_t G_re = vld1q_f32 (&G.re [k]);
184+ const float32x4_t G_im = vld1q_f32 (&G.im [k]);
185+ const float32x4_t X_re = vld1q_f32 (&X->re [k]);
186+ const float32x4_t X_im = vld1q_f32 (&X->im [k]);
187+ const float32x4_t H_re = vld1q_f32 (&H_j->re [k]);
188+ const float32x4_t H_im = vld1q_f32 (&H_j->im [k]);
189+ const float32x4_t a = vmulq_f32 (X_re, G_re);
190+ const float32x4_t e = vmlaq_f32 (a, X_im, G_im);
191+ const float32x4_t c = vmulq_f32 (X_re, G_im);
192+ const float32x4_t f = vmlsq_f32 (c, X_im, G_re);
193+ const float32x4_t g = vaddq_f32 (H_re, e);
194+ const float32x4_t h = vaddq_f32 (H_im, f);
195+
196+ vst1q_f32 (&H_j->re [k], g);
197+ vst1q_f32 (&H_j->im [k], h);
198+ }
199+ }
200+
201+ X = &render_buffer_data[0 ];
202+ limit = lim2;
203+ } while (j < lim2);
204+
205+ H_j = &H[0 ];
206+ X = &render_buffer_data[render_buffer.Position ()];
207+ limit = lim1;
208+ j = 0 ;
209+ do {
210+ for (; j < limit; ++j, ++H_j, ++X) {
211+ H_j->re [kFftLengthBy2 ] += X->re [kFftLengthBy2 ] * G.re [kFftLengthBy2 ] +
212+ X->im [kFftLengthBy2 ] * G.im [kFftLengthBy2 ];
213+ H_j->im [kFftLengthBy2 ] += X->re [kFftLengthBy2 ] * G.im [kFftLengthBy2 ] -
214+ X->im [kFftLengthBy2 ] * G.re [kFftLengthBy2 ];
215+ }
216+
217+ X = &render_buffer_data[0 ];
218+ limit = lim2;
219+ } while (j < lim2);
220+ }
221+ #endif
222+
124223#if defined(WEBRTC_ARCH_X86_FAMILY)
125224// Adapts the filter partitions. (SSE2 variant)
126225void AdaptPartitions_SSE2 (const RenderBuffer& render_buffer,
@@ -203,6 +302,65 @@ void ApplyFilter(const RenderBuffer& render_buffer,
203302 }
204303}
205304
305+ #if defined(WEBRTC_HAS_NEON)
306+ // Produces the filter output (NEON variant).
307+ void ApplyFilter_NEON (const RenderBuffer& render_buffer,
308+ rtc::ArrayView<const FftData> H,
309+ FftData* S) {
310+ RTC_DCHECK_GE (H.size (), H.size () - 1 );
311+ S->re .fill (0 .f );
312+ S->im .fill (0 .f );
313+
314+ rtc::ArrayView<const FftData> render_buffer_data = render_buffer.Buffer ();
315+ const int lim1 =
316+ std::min (render_buffer_data.size () - render_buffer.Position (), H.size ());
317+ const int lim2 = H.size ();
318+ constexpr int kNumFourBinBands = kFftLengthBy2 / 4 ;
319+ const FftData* H_j = &H[0 ];
320+ const FftData* X = &render_buffer_data[render_buffer.Position ()];
321+
322+ int j = 0 ;
323+ int limit = lim1;
324+ do {
325+ for (; j < limit; ++j, ++H_j, ++X) {
326+ for (int k = 0 , n = 0 ; n < kNumFourBinBands ; ++n, k += 4 ) {
327+ const float32x4_t X_re = vld1q_f32 (&X->re [k]);
328+ const float32x4_t X_im = vld1q_f32 (&X->im [k]);
329+ const float32x4_t H_re = vld1q_f32 (&H_j->re [k]);
330+ const float32x4_t H_im = vld1q_f32 (&H_j->im [k]);
331+ const float32x4_t S_re = vld1q_f32 (&S->re [k]);
332+ const float32x4_t S_im = vld1q_f32 (&S->im [k]);
333+ const float32x4_t a = vmulq_f32 (X_re, H_re);
334+ const float32x4_t e = vmlsq_f32 (a, X_im, H_im);
335+ const float32x4_t c = vmulq_f32 (X_re, H_im);
336+ const float32x4_t f = vmlaq_f32 (c, X_im, H_re);
337+ const float32x4_t g = vaddq_f32 (S_re, e);
338+ const float32x4_t h = vaddq_f32 (S_im, f);
339+ vst1q_f32 (&S->re [k], g);
340+ vst1q_f32 (&S->im [k], h);
341+ }
342+ }
343+ limit = lim2;
344+ X = &render_buffer_data[0 ];
345+ } while (j < lim2);
346+
347+ H_j = &H[0 ];
348+ X = &render_buffer_data[render_buffer.Position ()];
349+ j = 0 ;
350+ limit = lim1;
351+ do {
352+ for (; j < limit; ++j, ++H_j, ++X) {
353+ S->re [kFftLengthBy2 ] += X->re [kFftLengthBy2 ] * H_j->re [kFftLengthBy2 ] -
354+ X->im [kFftLengthBy2 ] * H_j->im [kFftLengthBy2 ];
355+ S->im [kFftLengthBy2 ] += X->re [kFftLengthBy2 ] * H_j->im [kFftLengthBy2 ] +
356+ X->im [kFftLengthBy2 ] * H_j->re [kFftLengthBy2 ];
357+ }
358+ limit = lim2;
359+ X = &render_buffer_data[0 ];
360+ } while (j < lim2);
361+ }
362+ #endif
363+
206364#if defined(WEBRTC_ARCH_X86_FAMILY)
207365// Produces the filter output (SSE2 variant).
208366void ApplyFilter_SSE2 (const RenderBuffer& render_buffer,
@@ -305,6 +463,11 @@ void AdaptiveFirFilter::Filter(const RenderBuffer& render_buffer,
305463 case Aec3Optimization::kSse2 :
306464 aec3::ApplyFilter_SSE2 (render_buffer, H_, S);
307465 break ;
466+ #endif
467+ #if defined(WEBRTC_HAS_NEON)
468+ case Aec3Optimization::kNeon :
469+ aec3::ApplyFilter_NEON (render_buffer, H_, S);
470+ break ;
308471#endif
309472 default :
310473 aec3::ApplyFilter (render_buffer, H_, S);
@@ -319,6 +482,11 @@ void AdaptiveFirFilter::Adapt(const RenderBuffer& render_buffer,
319482 case Aec3Optimization::kSse2 :
320483 aec3::AdaptPartitions_SSE2 (render_buffer, G, H_);
321484 break ;
485+ #endif
486+ #if defined(WEBRTC_HAS_NEON)
487+ case Aec3Optimization::kNeon :
488+ aec3::AdaptPartitions_NEON (render_buffer, G, H_);
489+ break ;
322490#endif
323491 default :
324492 aec3::AdaptPartitions (render_buffer, G, H_);
@@ -337,6 +505,12 @@ void AdaptiveFirFilter::Adapt(const RenderBuffer& render_buffer,
337505 aec3::UpdateFrequencyResponse_SSE2 (H_, &H2_);
338506 aec3::UpdateErlEstimator_SSE2 (H2_, &erl_);
339507 break ;
508+ #endif
509+ #if defined(WEBRTC_HAS_NEON)
510+ case Aec3Optimization::kNeon :
511+ aec3::UpdateFrequencyResponse_NEON (H_, &H2_);
512+ aec3::UpdateErlEstimator_NEON (H2_, &erl_);
513+ break ;
340514#endif
341515 default :
342516 aec3::UpdateFrequencyResponse (H_, &H2_);
0 commit comments