Skip to content

Commit 5d153c7

Browse files
peahCommit bot
authored andcommitted
Reland of Added ARM Neon SIMD optimizations for AEC3 (patchset #1 id:1 of https://codereview.webrtc.org/2856113003/ )
Reason for revert: The original patch set was correct, but the Chromium bug number needed to be corrected. Original issue's description: > Revert of Added ARM Neon SIMD optimizations for AEC3 (patchset #2 id:970001 of https://codereview.webrtc.org/2834073005/ ) > > Reason for revert: > The bug number for the chromium bug was wrong. > > Original issue's description: > > Added ARM Neon optimizations for AEC3 > > > > This CL adds Neon SIMD optimizations for AEC3 on ARM, resulting > > in an 8 times complexity reduction. The optimizations are basically > > identical to what was already in place for SSE2. > > > > BUG=chromium:14993, webrtc:6018 > > > > Review-Url: https://codereview.webrtc.org/2834073005 > > Cr-Commit-Position: refs/heads/master@{#17993} > > Committed: https://chromium.googlesource.com/external/webrtc/+/f246b91eba0e8d95bd3fee4634887fb6d3017811 > > [email protected] > # Skipping CQ checks because original CL landed less than 1 days ago. > NOPRESUBMIT=true > NOTREECHECKS=true > NOTRY=true > BUG=chromium:14993, webrtc:6018 > > Review-Url: https://codereview.webrtc.org/2856113003 > Cr-Commit-Position: refs/heads/master@{#17994} > Committed: https://chromium.googlesource.com/external/webrtc/+/b70f8cfd4d5cf6fe31a8089df9955b7e7b7ebd32 [email protected] # Skipping CQ checks because original CL landed less than 1 days ago. NOPRESUBMIT=true NOTREECHECKS=true NOTRY=true BUG=chromium:714993, webrtc:6018 Review-Url: https://codereview.webrtc.org/2862573002 Cr-Commit-Position: refs/heads/master@{#17997}
1 parent 3dae705 commit 5d153c7

File tree

10 files changed

+621
-6
lines changed

10 files changed

+621
-6
lines changed

webrtc/modules/audio_processing/aec3/adaptive_fir_filter.cc

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010

1111
#include "webrtc/modules/audio_processing/aec3/adaptive_fir_filter.h"
1212

13+
#if defined(WEBRTC_HAS_NEON)
14+
#include <arm_neon.h>
15+
#endif
1316
#include "webrtc/typedefs.h"
1417
#if defined(WEBRTC_ARCH_X86_FAMILY)
1518
#include <emmintrin.h>
@@ -52,6 +55,26 @@ void UpdateFrequencyResponse(
5255
}
5356
}
5457

58+
#if defined(WEBRTC_HAS_NEON)
59+
// Computes and stores the frequency response of the filter.
60+
void UpdateFrequencyResponse_NEON(
61+
rtc::ArrayView<const FftData> H,
62+
std::vector<std::array<float, kFftLengthBy2Plus1>>* H2) {
63+
RTC_DCHECK_EQ(H.size(), H2->size());
64+
for (size_t k = 0; k < H.size(); ++k) {
65+
for (size_t j = 0; j < kFftLengthBy2; j += 4) {
66+
const float32x4_t re = vld1q_f32(&H[k].re[j]);
67+
const float32x4_t im = vld1q_f32(&H[k].im[j]);
68+
float32x4_t H2_k_j = vmulq_f32(re, re);
69+
H2_k_j = vmlaq_f32(H2_k_j, im, im);
70+
vst1q_f32(&(*H2)[k][j], H2_k_j);
71+
}
72+
(*H2)[k][kFftLengthBy2] = H[k].re[kFftLengthBy2] * H[k].re[kFftLengthBy2] +
73+
H[k].im[kFftLengthBy2] * H[k].im[kFftLengthBy2];
74+
}
75+
}
76+
#endif
77+
5578
#if defined(WEBRTC_ARCH_X86_FAMILY)
5679
// Computes and stores the frequency response of the filter.
5780
void UpdateFrequencyResponse_SSE2(
@@ -85,6 +108,25 @@ void UpdateErlEstimator(
85108
}
86109
}
87110

111+
#if defined(WEBRTC_HAS_NEON)
112+
// Computes and stores the echo return loss estimate of the filter, which is the
113+
// sum of the partition frequency responses.
114+
void UpdateErlEstimator_NEON(
115+
const std::vector<std::array<float, kFftLengthBy2Plus1>>& H2,
116+
std::array<float, kFftLengthBy2Plus1>* erl) {
117+
erl->fill(0.f);
118+
for (auto& H2_j : H2) {
119+
for (size_t k = 0; k < kFftLengthBy2; k += 4) {
120+
const float32x4_t H2_j_k = vld1q_f32(&H2_j[k]);
121+
float32x4_t erl_k = vld1q_f32(&(*erl)[k]);
122+
erl_k = vaddq_f32(erl_k, H2_j_k);
123+
vst1q_f32(&(*erl)[k], erl_k);
124+
}
125+
(*erl)[kFftLengthBy2] += H2_j[kFftLengthBy2];
126+
}
127+
}
128+
#endif
129+
88130
#if defined(WEBRTC_ARCH_X86_FAMILY)
89131
// Computes and stores the echo return loss estimate of the filter, which is the
90132
// sum of the partition frequency responses.
@@ -121,6 +163,63 @@ void AdaptPartitions(const RenderBuffer& render_buffer,
121163
}
122164
}
123165

166+
#if defined(WEBRTC_HAS_NEON)
167+
// Adapts the filter partitions. (NEON variant)
168+
void AdaptPartitions_NEON(const RenderBuffer& render_buffer,
169+
const FftData& G,
170+
rtc::ArrayView<FftData> H) {
171+
rtc::ArrayView<const FftData> render_buffer_data = render_buffer.Buffer();
172+
const int lim1 =
173+
std::min(render_buffer_data.size() - render_buffer.Position(), H.size());
174+
const int lim2 = H.size();
175+
constexpr int kNumFourBinBands = kFftLengthBy2 / 4;
176+
FftData* H_j = &H[0];
177+
const FftData* X = &render_buffer_data[render_buffer.Position()];
178+
int limit = lim1;
179+
int j = 0;
180+
do {
181+
for (; j < limit; ++j, ++H_j, ++X) {
182+
for (int k = 0, n = 0; n < kNumFourBinBands; ++n, k += 4) {
183+
const float32x4_t G_re = vld1q_f32(&G.re[k]);
184+
const float32x4_t G_im = vld1q_f32(&G.im[k]);
185+
const float32x4_t X_re = vld1q_f32(&X->re[k]);
186+
const float32x4_t X_im = vld1q_f32(&X->im[k]);
187+
const float32x4_t H_re = vld1q_f32(&H_j->re[k]);
188+
const float32x4_t H_im = vld1q_f32(&H_j->im[k]);
189+
const float32x4_t a = vmulq_f32(X_re, G_re);
190+
const float32x4_t e = vmlaq_f32(a, X_im, G_im);
191+
const float32x4_t c = vmulq_f32(X_re, G_im);
192+
const float32x4_t f = vmlsq_f32(c, X_im, G_re);
193+
const float32x4_t g = vaddq_f32(H_re, e);
194+
const float32x4_t h = vaddq_f32(H_im, f);
195+
196+
vst1q_f32(&H_j->re[k], g);
197+
vst1q_f32(&H_j->im[k], h);
198+
}
199+
}
200+
201+
X = &render_buffer_data[0];
202+
limit = lim2;
203+
} while (j < lim2);
204+
205+
H_j = &H[0];
206+
X = &render_buffer_data[render_buffer.Position()];
207+
limit = lim1;
208+
j = 0;
209+
do {
210+
for (; j < limit; ++j, ++H_j, ++X) {
211+
H_j->re[kFftLengthBy2] += X->re[kFftLengthBy2] * G.re[kFftLengthBy2] +
212+
X->im[kFftLengthBy2] * G.im[kFftLengthBy2];
213+
H_j->im[kFftLengthBy2] += X->re[kFftLengthBy2] * G.im[kFftLengthBy2] -
214+
X->im[kFftLengthBy2] * G.re[kFftLengthBy2];
215+
}
216+
217+
X = &render_buffer_data[0];
218+
limit = lim2;
219+
} while (j < lim2);
220+
}
221+
#endif
222+
124223
#if defined(WEBRTC_ARCH_X86_FAMILY)
125224
// Adapts the filter partitions. (SSE2 variant)
126225
void AdaptPartitions_SSE2(const RenderBuffer& render_buffer,
@@ -203,6 +302,65 @@ void ApplyFilter(const RenderBuffer& render_buffer,
203302
}
204303
}
205304

305+
#if defined(WEBRTC_HAS_NEON)
306+
// Produces the filter output (NEON variant).
307+
void ApplyFilter_NEON(const RenderBuffer& render_buffer,
308+
rtc::ArrayView<const FftData> H,
309+
FftData* S) {
310+
RTC_DCHECK_GE(H.size(), H.size() - 1);
311+
S->re.fill(0.f);
312+
S->im.fill(0.f);
313+
314+
rtc::ArrayView<const FftData> render_buffer_data = render_buffer.Buffer();
315+
const int lim1 =
316+
std::min(render_buffer_data.size() - render_buffer.Position(), H.size());
317+
const int lim2 = H.size();
318+
constexpr int kNumFourBinBands = kFftLengthBy2 / 4;
319+
const FftData* H_j = &H[0];
320+
const FftData* X = &render_buffer_data[render_buffer.Position()];
321+
322+
int j = 0;
323+
int limit = lim1;
324+
do {
325+
for (; j < limit; ++j, ++H_j, ++X) {
326+
for (int k = 0, n = 0; n < kNumFourBinBands; ++n, k += 4) {
327+
const float32x4_t X_re = vld1q_f32(&X->re[k]);
328+
const float32x4_t X_im = vld1q_f32(&X->im[k]);
329+
const float32x4_t H_re = vld1q_f32(&H_j->re[k]);
330+
const float32x4_t H_im = vld1q_f32(&H_j->im[k]);
331+
const float32x4_t S_re = vld1q_f32(&S->re[k]);
332+
const float32x4_t S_im = vld1q_f32(&S->im[k]);
333+
const float32x4_t a = vmulq_f32(X_re, H_re);
334+
const float32x4_t e = vmlsq_f32(a, X_im, H_im);
335+
const float32x4_t c = vmulq_f32(X_re, H_im);
336+
const float32x4_t f = vmlaq_f32(c, X_im, H_re);
337+
const float32x4_t g = vaddq_f32(S_re, e);
338+
const float32x4_t h = vaddq_f32(S_im, f);
339+
vst1q_f32(&S->re[k], g);
340+
vst1q_f32(&S->im[k], h);
341+
}
342+
}
343+
limit = lim2;
344+
X = &render_buffer_data[0];
345+
} while (j < lim2);
346+
347+
H_j = &H[0];
348+
X = &render_buffer_data[render_buffer.Position()];
349+
j = 0;
350+
limit = lim1;
351+
do {
352+
for (; j < limit; ++j, ++H_j, ++X) {
353+
S->re[kFftLengthBy2] += X->re[kFftLengthBy2] * H_j->re[kFftLengthBy2] -
354+
X->im[kFftLengthBy2] * H_j->im[kFftLengthBy2];
355+
S->im[kFftLengthBy2] += X->re[kFftLengthBy2] * H_j->im[kFftLengthBy2] +
356+
X->im[kFftLengthBy2] * H_j->re[kFftLengthBy2];
357+
}
358+
limit = lim2;
359+
X = &render_buffer_data[0];
360+
} while (j < lim2);
361+
}
362+
#endif
363+
206364
#if defined(WEBRTC_ARCH_X86_FAMILY)
207365
// Produces the filter output (SSE2 variant).
208366
void ApplyFilter_SSE2(const RenderBuffer& render_buffer,
@@ -305,6 +463,11 @@ void AdaptiveFirFilter::Filter(const RenderBuffer& render_buffer,
305463
case Aec3Optimization::kSse2:
306464
aec3::ApplyFilter_SSE2(render_buffer, H_, S);
307465
break;
466+
#endif
467+
#if defined(WEBRTC_HAS_NEON)
468+
case Aec3Optimization::kNeon:
469+
aec3::ApplyFilter_NEON(render_buffer, H_, S);
470+
break;
308471
#endif
309472
default:
310473
aec3::ApplyFilter(render_buffer, H_, S);
@@ -319,6 +482,11 @@ void AdaptiveFirFilter::Adapt(const RenderBuffer& render_buffer,
319482
case Aec3Optimization::kSse2:
320483
aec3::AdaptPartitions_SSE2(render_buffer, G, H_);
321484
break;
485+
#endif
486+
#if defined(WEBRTC_HAS_NEON)
487+
case Aec3Optimization::kNeon:
488+
aec3::AdaptPartitions_NEON(render_buffer, G, H_);
489+
break;
322490
#endif
323491
default:
324492
aec3::AdaptPartitions(render_buffer, G, H_);
@@ -337,6 +505,12 @@ void AdaptiveFirFilter::Adapt(const RenderBuffer& render_buffer,
337505
aec3::UpdateFrequencyResponse_SSE2(H_, &H2_);
338506
aec3::UpdateErlEstimator_SSE2(H2_, &erl_);
339507
break;
508+
#endif
509+
#if defined(WEBRTC_HAS_NEON)
510+
case Aec3Optimization::kNeon:
511+
aec3::UpdateFrequencyResponse_NEON(H_, &H2_);
512+
aec3::UpdateErlEstimator_NEON(H2_, &erl_);
513+
break;
340514
#endif
341515
default:
342516
aec3::UpdateFrequencyResponse(H_, &H2_);

webrtc/modules/audio_processing/aec3/adaptive_fir_filter.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,11 @@ namespace aec3 {
2929
void UpdateFrequencyResponse(
3030
rtc::ArrayView<const FftData> H,
3131
std::vector<std::array<float, kFftLengthBy2Plus1>>* H2);
32+
#if defined(WEBRTC_HAS_NEON)
33+
void UpdateFrequencyResponse_NEON(
34+
rtc::ArrayView<const FftData> H,
35+
std::vector<std::array<float, kFftLengthBy2Plus1>>* H2);
36+
#endif
3237
#if defined(WEBRTC_ARCH_X86_FAMILY)
3338
void UpdateFrequencyResponse_SSE2(
3439
rtc::ArrayView<const FftData> H,
@@ -40,6 +45,11 @@ void UpdateFrequencyResponse_SSE2(
4045
void UpdateErlEstimator(
4146
const std::vector<std::array<float, kFftLengthBy2Plus1>>& H2,
4247
std::array<float, kFftLengthBy2Plus1>* erl);
48+
#if defined(WEBRTC_HAS_NEON)
49+
void UpdateErlEstimator_NEON(
50+
const std::vector<std::array<float, kFftLengthBy2Plus1>>& H2,
51+
std::array<float, kFftLengthBy2Plus1>* erl);
52+
#endif
4353
#if defined(WEBRTC_ARCH_X86_FAMILY)
4454
void UpdateErlEstimator_SSE2(
4555
const std::vector<std::array<float, kFftLengthBy2Plus1>>& H2,
@@ -50,6 +60,11 @@ void UpdateErlEstimator_SSE2(
5060
void AdaptPartitions(const RenderBuffer& render_buffer,
5161
const FftData& G,
5262
rtc::ArrayView<FftData> H);
63+
#if defined(WEBRTC_HAS_NEON)
64+
void AdaptPartitions_NEON(const RenderBuffer& render_buffer,
65+
const FftData& G,
66+
rtc::ArrayView<FftData> H);
67+
#endif
5368
#if defined(WEBRTC_ARCH_X86_FAMILY)
5469
void AdaptPartitions_SSE2(const RenderBuffer& render_buffer,
5570
const FftData& G,
@@ -60,6 +75,11 @@ void AdaptPartitions_SSE2(const RenderBuffer& render_buffer,
6075
void ApplyFilter(const RenderBuffer& render_buffer,
6176
rtc::ArrayView<const FftData> H,
6277
FftData* S);
78+
#if defined(WEBRTC_HAS_NEON)
79+
void ApplyFilter_NEON(const RenderBuffer& render_buffer,
80+
rtc::ArrayView<const FftData> H,
81+
FftData* S);
82+
#endif
6383
#if defined(WEBRTC_ARCH_X86_FAMILY)
6484
void ApplyFilter_SSE2(const RenderBuffer& render_buffer,
6585
rtc::ArrayView<const FftData> H,

0 commit comments

Comments
 (0)