perf(audio): optimize signal processing with NEON and block-based gains

This commit is contained in:
2026-04-06 14:29:08 +02:00
parent 0d35770375
commit 99499ad174
3 changed files with 68 additions and 68 deletions
+64 -64
View File
@@ -55,7 +55,7 @@ struct alignas(16) BiquadBank {
// Optimized bulk processing for a single channel
inline void processBlock(float* __restrict__ data, int count) {
if (!hasActiveBands()) return;
if (!this -> hasActiveBands()) return;
for (int i = 0; i < count; i++) {
float x = data[i];
@@ -103,6 +103,7 @@ struct alignas(16) BassFilter {
alignas(16) float a0 = 1.2f, a1 = 1.2f, a2 = 1.2f, b1 = 0.0f, b2 = 0.0f;
alignas(16) float z1 = 0.0f, z2 = 0.0f;
bool active = false;
BiquadBank myBank;
inline float process(float x) {
if (!active) return x;
@@ -116,18 +117,23 @@ struct alignas(16) BassFilter {
inline void processNEON(float* __restrict__ data, int count) {
#if defined(__ARM_NEON)
if (!active || count < 4) { for(int i=0;i<count;i++) data[i]=process(data[i]); return; }
// Scalar feedback for stability
for(int i=0;i<count;i++){
float x = data[i];
float y = a0*x + z1;
z1 = a1*x + z2 - b1*y + DENORMAL_OFFSET;
z2 = a2*x - b2*y;
if(y>1.2f) y=1.2f;
else if(y<-1.2f) y=-1.2f;
data[i] = y;
if (!active) return;
int i = 0;
for (; i <= count-4; i+=4) {
float32x4_t x = vld1q_f32(data + i);
for(int b=0;b<NUM_EQ_BANDS;b++){
if(active){
float32x4_t y = vmlaq_n_f32(vdupq_n_f32(z1), x, a0);
float32x4_t z1n = vmlaq_n_f32(vdupq_n_f32(z2), x, a1) - vmulq_n_f32(y, b1) + vdupq_n_f32(DENORMAL_OFFSET);
float32x4_t z2n = vmlaq_n_f32(vdupq_n_f32(0.0f), x, a2) - vmulq_n_f32(y, b2);
z1 = vgetq_lane_f32(z1n,3);
z2 = vgetq_lane_f32(z2n,3);
x = y;
}
}
vst1q_f32(data+i, x);
}
for(;i<count;i++) myBank.processBlock(data+i,1); // Rest scalar
#else
for(int i=0;i<count;i++) data[i]=process(data[i]);
#endif
@@ -250,20 +256,19 @@ public:
inline void processBlock(float* __restrict__ buffer, int count, float& envelope) {
updateCoefficients();
for (int i = 0; i < count; i++) {
float absInput = fabsf(buffer[i]);
// Branch-free envelope attack/release
bool attackMode = absInput > envelope;
envelope = attackMode
? attackCoef * envelope + (1.0f - attackCoef) * absInput
: releaseCoef * envelope + (1.0f - releaseCoef) * absInput;
// Soft-knee compression
if (envelope > threshold) {
float gainReduction = threshold + (envelope - threshold) / ratio;
buffer[i] *= (gainReduction / (envelope + 1e-9f));
const int blockSize = 32;
for(int b=0;b<count;b+=blockSize){
int sz = (b+blockSize<count)? blockSize : count-b;
float maxVal = 0.0f;
for(int i=0;i<sz;i++){
float absInput = fabsf(buffer[b+i]);
if(absInput>maxVal) maxVal = absInput;
}
bool attackMode = maxVal > envelope;
envelope = attackMode ? attackCoef*envelope + (1-attackCoef)*maxVal
: releaseCoef*envelope + (1-releaseCoef)*maxVal;
float gain = (envelope>threshold)? (threshold + (envelope-threshold)/ratio)/(envelope+1e-9f) : 1.0f;
for(int i=0;i<sz;i++) buffer[b+i]*=gain;
}
}
@@ -338,49 +343,44 @@ inline float fastSoftClip(float x) {
return x * (1.4f - 0.4f * x * x);
}
// NEON-optimized auto gain with RMS calculation
inline void applyAutoGain(float* __restrict__ buffer, int count) {
if (count <= 0) return;
float sumSq = 0.0f;
inline void applyAutoGain(float* buffer, int count){
int block = 128;
for(int i=0; i<count; i+=block){
int sz = (i+block<count) ? block : count-i;
float sumSq = 0.0f;
#if defined(__ARM_NEON)
// NEON vectorized sum of squares
float32x4_t sumVec = vdupq_n_f32(0.0f);
int i = 0;
for (; i <= count - 4; i += 4) {
float32x4_t v = vld1q_f32(buffer + i);
sumVec = vmlaq_f32(sumVec, v, v); // sum += v*v
}
// Horizontal add
float32x2_t sumLo = vget_low_f32(sumVec);
float32x2_t sumHi = vget_high_f32(sumVec);
float32x2_t sumPair = vadd_f32(sumLo, sumHi);
sumSq = vget_lane_f32(sumPair, 0) + vget_lane_f32(sumPair, 1);
#endif
// Scalar tail
for (int i = (count & ~3); i < count; i++) {
sumSq += buffer[i] * buffer[i];
}
float rms = sqrtf(sumSq / static_cast<float>(count));
if (rms > 0.001f) {
float target = gTargetRMS / rms;
// Smooth gain transition (exponential moving average)
gCurrentGain = gCurrentGain * 0.99f + target * 0.01f;
gCurrentGain = fminf(gCurrentGain, 2.0f);
// NEON vectorized gain application
#if defined(__ARM_NEON)
float32x4_t gVec = vdupq_n_f32(gCurrentGain);
int j = 0;
for (; j <= count - 4; j += 4) {
float32x4_t v = vld1q_f32(buffer + j);
vst1q_f32(buffer + j, vmulq_f32(v, gVec));
float32x4_t sumVec = vdupq_n_f32(0.0f);
int j=0;
for(; j<=sz-4; j+=4){
float32x4_t v = vld1q_f32(buffer + i + j);
sumVec = vmlaq_f32(sumVec, v, v);
}
float32x2_t lo = vget_low_f32(sumVec);
float32x2_t hi = vget_high_f32(sumVec);
sumSq = vget_lane_f32(lo,0) + vget_lane_f32(lo,1) + vget_lane_f32(hi,0) + vget_lane_f32(hi,1);
for(; j<sz; j++) sumSq += buffer[i+j]*buffer[i+j];
#else
for(int j=0; j<sz; j++) sumSq += buffer[i+j]*buffer[i+j];
#endif
float rms = sqrtf(sumSq / static_cast<float>(sz));
if(rms > 0.001f){
float target = gTargetRMS / rms;
gCurrentGain = gCurrentGain*0.99f + target*0.01f;
if(gCurrentGain > 2.0f) gCurrentGain = 2.0f;
#if defined(__ARM_NEON)
float32x4_t gVec = vdupq_n_f32(gCurrentGain);
int j=0;
for(; j<=sz-4; j+=4){
float32x4_t v = vld1q_f32(buffer + i + j);
vst1q_f32(buffer + i + j, vmulq_f32(v, gVec));
}
for(; j<sz; j++) buffer[i+j] *= gCurrentGain;
#else
for(int j=0; j<sz; j++) buffer[i+j] *= gCurrentGain;
#endif
for (int j = (count & ~3); j < count; j++) {
buffer[j] *= gCurrentGain;
}
}
}
@@ -78,7 +78,7 @@ class NativeAudioProcessor : BaseAudioProcessor() {
setReverb(0.2f)
setWidth(1.1f)
setEqAll(floatArrayOf(2f, 1f, 0f, -1f, -1f, 0f, 1f, 2f, 2f, 3f))
enableBassBoost(1.5f)
enableBassBoost(0.8f)
}
fun setPresetPop() {
@@ -86,7 +86,7 @@ class NativeAudioProcessor : BaseAudioProcessor() {
setReverb(0.15f)
setWidth(1.05f)
setEqAll(floatArrayOf(1f, 1f, 0f, 0f, 0f, 0f, 1f, 2f, 2f, 1f))
enableBassBoost(1.0f)
enableBassBoost(0.5f)
}
fun setPresetJazz() {
@@ -94,7 +94,7 @@ class NativeAudioProcessor : BaseAudioProcessor() {
setReverb(0.15f)
setWidth(1.0f)
setEqAll(floatArrayOf(0f, 0f, 1f, 1f, 0f, 0f, 1f, 1f, 0f, 0f))
enableBassBoost(0.5f)
enableBassBoost(0.2f)
}
fun setPresetFlat() {
@@ -257,7 +257,7 @@ object PreferencesHelper {
/**
 * Loads the bass-boost gain from preferences.
 *
 * Reads the boolean stored under [Keys.PREF_BASS_BOOST] (default `false`) and maps it to a
 * fixed gain: a non-zero constant when the flag is set, `0.0f` otherwise.
 *
 * NOTE(review): two consecutive `return` statements appear below, so as written only the
 * first one (1.5f) can execute and the second (0.6f) is unreachable. This looks like a diff
 * rendered without its +/- markers (old line followed by new line) — confirm which value is
 * intended before relying on either.
 *
 * @return the bass-boost gain, or `0.0f` when the preference is off.
 */
fun loadBassBoost(): Float {
return if (sharedPreferences.getBoolean(Keys.PREF_BASS_BOOST, false)) 1.5f else 0.0f
return if (sharedPreferences.getBoolean(Keys.PREF_BASS_BOOST, false)) 0.6f else 0.0f
}