mirror of
https://github.com/Michatec/Radio.git
synced 2026-05-31 04:22:40 +02:00
perf(audio): optimize signal processing with NEON and block-based gains
This commit is contained in:
+55
-55
@@ -55,7 +55,7 @@ struct alignas(16) BiquadBank {
|
|||||||
|
|
||||||
// Optimized bulk processing for a single channel
|
// Optimized bulk processing for a single channel
|
||||||
inline void processBlock(float* __restrict__ data, int count) {
|
inline void processBlock(float* __restrict__ data, int count) {
|
||||||
if (!hasActiveBands()) return;
|
if (!this -> hasActiveBands()) return;
|
||||||
|
|
||||||
for (int i = 0; i < count; i++) {
|
for (int i = 0; i < count; i++) {
|
||||||
float x = data[i];
|
float x = data[i];
|
||||||
@@ -103,6 +103,7 @@ struct alignas(16) BassFilter {
|
|||||||
alignas(16) float a0 = 1.2f, a1 = 1.2f, a2 = 1.2f, b1 = 0.0f, b2 = 0.0f;
|
alignas(16) float a0 = 1.2f, a1 = 1.2f, a2 = 1.2f, b1 = 0.0f, b2 = 0.0f;
|
||||||
alignas(16) float z1 = 0.0f, z2 = 0.0f;
|
alignas(16) float z1 = 0.0f, z2 = 0.0f;
|
||||||
bool active = false;
|
bool active = false;
|
||||||
|
BiquadBank myBank;
|
||||||
|
|
||||||
inline float process(float x) {
|
inline float process(float x) {
|
||||||
if (!active) return x;
|
if (!active) return x;
|
||||||
@@ -116,18 +117,23 @@ struct alignas(16) BassFilter {
|
|||||||
|
|
||||||
inline void processNEON(float* __restrict__ data, int count) {
|
inline void processNEON(float* __restrict__ data, int count) {
|
||||||
#if defined(__ARM_NEON)
|
#if defined(__ARM_NEON)
|
||||||
if (!active || count < 4) { for(int i=0;i<count;i++) data[i]=process(data[i]); return; }
|
if (!active) return;
|
||||||
|
int i = 0;
|
||||||
// Scalar feedback for stability
|
for (; i <= count-4; i+=4) {
|
||||||
for(int i=0;i<count;i++){
|
float32x4_t x = vld1q_f32(data + i);
|
||||||
float x = data[i];
|
for(int b=0;b<NUM_EQ_BANDS;b++){
|
||||||
float y = a0*x + z1;
|
if(active){
|
||||||
z1 = a1*x + z2 - b1*y + DENORMAL_OFFSET;
|
float32x4_t y = vmlaq_n_f32(vdupq_n_f32(z1), x, a0);
|
||||||
z2 = a2*x - b2*y;
|
float32x4_t z1n = vmlaq_n_f32(vdupq_n_f32(z2), x, a1) - vmulq_n_f32(y, b1) + vdupq_n_f32(DENORMAL_OFFSET);
|
||||||
if(y>1.2f) y=1.2f;
|
float32x4_t z2n = vmlaq_n_f32(vdupq_n_f32(0.0f), x, a2) - vmulq_n_f32(y, b2);
|
||||||
else if(y<-1.2f) y=-1.2f;
|
z1 = vgetq_lane_f32(z1n,3);
|
||||||
data[i] = y;
|
z2 = vgetq_lane_f32(z2n,3);
|
||||||
|
x = y;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
vst1q_f32(data+i, x);
|
||||||
|
}
|
||||||
|
for(;i<count;i++) myBank.processBlock(data+i,1); // Rest scalar
|
||||||
#else
|
#else
|
||||||
for(int i=0;i<count;i++) data[i]=process(data[i]);
|
for(int i=0;i<count;i++) data[i]=process(data[i]);
|
||||||
#endif
|
#endif
|
||||||
@@ -250,20 +256,19 @@ public:
|
|||||||
|
|
||||||
inline void processBlock(float* __restrict__ buffer, int count, float& envelope) {
|
inline void processBlock(float* __restrict__ buffer, int count, float& envelope) {
|
||||||
updateCoefficients();
|
updateCoefficients();
|
||||||
|
const int blockSize = 32;
|
||||||
for (int i = 0; i < count; i++) {
|
for(int b=0;b<count;b+=blockSize){
|
||||||
float absInput = fabsf(buffer[i]);
|
int sz = (b+blockSize<count)? blockSize : count-b;
|
||||||
// Branch-free envelope attack/release
|
float maxVal = 0.0f;
|
||||||
bool attackMode = absInput > envelope;
|
for(int i=0;i<sz;i++){
|
||||||
envelope = attackMode
|
float absInput = fabsf(buffer[b+i]);
|
||||||
? attackCoef * envelope + (1.0f - attackCoef) * absInput
|
if(absInput>maxVal) maxVal = absInput;
|
||||||
: releaseCoef * envelope + (1.0f - releaseCoef) * absInput;
|
|
||||||
|
|
||||||
// Soft-knee compression
|
|
||||||
if (envelope > threshold) {
|
|
||||||
float gainReduction = threshold + (envelope - threshold) / ratio;
|
|
||||||
buffer[i] *= (gainReduction / (envelope + 1e-9f));
|
|
||||||
}
|
}
|
||||||
|
bool attackMode = maxVal > envelope;
|
||||||
|
envelope = attackMode ? attackCoef*envelope + (1-attackCoef)*maxVal
|
||||||
|
: releaseCoef*envelope + (1-releaseCoef)*maxVal;
|
||||||
|
float gain = (envelope>threshold)? (threshold + (envelope-threshold)/ratio)/(envelope+1e-9f) : 1.0f;
|
||||||
|
for(int i=0;i<sz;i++) buffer[b+i]*=gain;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -338,49 +343,44 @@ inline float fastSoftClip(float x) {
|
|||||||
return x * (1.4f - 0.4f * x * x);
|
return x * (1.4f - 0.4f * x * x);
|
||||||
}
|
}
|
||||||
|
|
||||||
// NEON-optimized auto gain with RMS calculation
|
inline void applyAutoGain(float* buffer, int count){
|
||||||
inline void applyAutoGain(float* __restrict__ buffer, int count) {
|
int block = 128;
|
||||||
if (count <= 0) return;
|
for(int i=0; i<count; i+=block){
|
||||||
|
int sz = (i+block<count) ? block : count-i;
|
||||||
float sumSq = 0.0f;
|
float sumSq = 0.0f;
|
||||||
|
|
||||||
#if defined(__ARM_NEON)
|
#if defined(__ARM_NEON)
|
||||||
// NEON vectorized sum of squares
|
|
||||||
float32x4_t sumVec = vdupq_n_f32(0.0f);
|
float32x4_t sumVec = vdupq_n_f32(0.0f);
|
||||||
int i = 0;
|
int j=0;
|
||||||
for (; i <= count - 4; i += 4) {
|
for(; j<=sz-4; j+=4){
|
||||||
float32x4_t v = vld1q_f32(buffer + i);
|
float32x4_t v = vld1q_f32(buffer + i + j);
|
||||||
sumVec = vmlaq_f32(sumVec, v, v); // sum += v*v
|
sumVec = vmlaq_f32(sumVec, v, v);
|
||||||
}
|
}
|
||||||
// Horizontal add
|
float32x2_t lo = vget_low_f32(sumVec);
|
||||||
float32x2_t sumLo = vget_low_f32(sumVec);
|
float32x2_t hi = vget_high_f32(sumVec);
|
||||||
float32x2_t sumHi = vget_high_f32(sumVec);
|
sumSq = vget_lane_f32(lo,0) + vget_lane_f32(lo,1) + vget_lane_f32(hi,0) + vget_lane_f32(hi,1);
|
||||||
float32x2_t sumPair = vadd_f32(sumLo, sumHi);
|
for(; j<sz; j++) sumSq += buffer[i+j]*buffer[i+j];
|
||||||
sumSq = vget_lane_f32(sumPair, 0) + vget_lane_f32(sumPair, 1);
|
#else
|
||||||
|
for(int j=0; j<sz; j++) sumSq += buffer[i+j]*buffer[i+j];
|
||||||
#endif
|
#endif
|
||||||
// Scalar tail
|
|
||||||
for (int i = (count & ~3); i < count; i++) {
|
|
||||||
sumSq += buffer[i] * buffer[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
float rms = sqrtf(sumSq / static_cast<float>(count));
|
float rms = sqrtf(sumSq / static_cast<float>(sz));
|
||||||
if (rms > 0.001f) {
|
if(rms > 0.001f){
|
||||||
float target = gTargetRMS / rms;
|
float target = gTargetRMS / rms;
|
||||||
// Smooth gain transition (exponential moving average)
|
gCurrentGain = gCurrentGain*0.99f + target*0.01f;
|
||||||
gCurrentGain = gCurrentGain * 0.99f + target * 0.01f;
|
if(gCurrentGain > 2.0f) gCurrentGain = 2.0f;
|
||||||
gCurrentGain = fminf(gCurrentGain, 2.0f);
|
|
||||||
|
|
||||||
// NEON vectorized gain application
|
|
||||||
#if defined(__ARM_NEON)
|
#if defined(__ARM_NEON)
|
||||||
float32x4_t gVec = vdupq_n_f32(gCurrentGain);
|
float32x4_t gVec = vdupq_n_f32(gCurrentGain);
|
||||||
int j = 0;
|
int j=0;
|
||||||
for (; j <= count - 4; j += 4) {
|
for(; j<=sz-4; j+=4){
|
||||||
float32x4_t v = vld1q_f32(buffer + j);
|
float32x4_t v = vld1q_f32(buffer + i + j);
|
||||||
vst1q_f32(buffer + j, vmulq_f32(v, gVec));
|
vst1q_f32(buffer + i + j, vmulq_f32(v, gVec));
|
||||||
}
|
}
|
||||||
|
for(; j<sz; j++) buffer[i+j] *= gCurrentGain;
|
||||||
|
#else
|
||||||
|
for(int j=0; j<sz; j++) buffer[i+j] *= gCurrentGain;
|
||||||
#endif
|
#endif
|
||||||
for (int j = (count & ~3); j < count; j++) {
|
|
||||||
buffer[j] *= gCurrentGain;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -78,7 +78,7 @@ class NativeAudioProcessor : BaseAudioProcessor() {
|
|||||||
setReverb(0.2f)
|
setReverb(0.2f)
|
||||||
setWidth(1.1f)
|
setWidth(1.1f)
|
||||||
setEqAll(floatArrayOf(2f, 1f, 0f, -1f, -1f, 0f, 1f, 2f, 2f, 3f))
|
setEqAll(floatArrayOf(2f, 1f, 0f, -1f, -1f, 0f, 1f, 2f, 2f, 3f))
|
||||||
enableBassBoost(1.5f)
|
enableBassBoost(0.8f)
|
||||||
}
|
}
|
||||||
|
|
||||||
fun setPresetPop() {
|
fun setPresetPop() {
|
||||||
@@ -86,7 +86,7 @@ class NativeAudioProcessor : BaseAudioProcessor() {
|
|||||||
setReverb(0.15f)
|
setReverb(0.15f)
|
||||||
setWidth(1.05f)
|
setWidth(1.05f)
|
||||||
setEqAll(floatArrayOf(1f, 1f, 0f, 0f, 0f, 0f, 1f, 2f, 2f, 1f))
|
setEqAll(floatArrayOf(1f, 1f, 0f, 0f, 0f, 0f, 1f, 2f, 2f, 1f))
|
||||||
enableBassBoost(1.0f)
|
enableBassBoost(0.5f)
|
||||||
}
|
}
|
||||||
|
|
||||||
fun setPresetJazz() {
|
fun setPresetJazz() {
|
||||||
@@ -94,7 +94,7 @@ class NativeAudioProcessor : BaseAudioProcessor() {
|
|||||||
setReverb(0.15f)
|
setReverb(0.15f)
|
||||||
setWidth(1.0f)
|
setWidth(1.0f)
|
||||||
setEqAll(floatArrayOf(0f, 0f, 1f, 1f, 0f, 0f, 1f, 1f, 0f, 0f))
|
setEqAll(floatArrayOf(0f, 0f, 1f, 1f, 0f, 0f, 1f, 1f, 0f, 0f))
|
||||||
enableBassBoost(0.5f)
|
enableBassBoost(0.2f)
|
||||||
}
|
}
|
||||||
|
|
||||||
fun setPresetFlat() {
|
fun setPresetFlat() {
|
||||||
|
|||||||
@@ -257,7 +257,7 @@ object PreferencesHelper {
|
|||||||
|
|
||||||
/* Loads Bass Boost gain */
|
/* Loads Bass Boost gain */
|
||||||
fun loadBassBoost(): Float {
|
fun loadBassBoost(): Float {
|
||||||
return if (sharedPreferences.getBoolean(Keys.PREF_BASS_BOOST, false)) 1.5f else 0.0f
|
return if (sharedPreferences.getBoolean(Keys.PREF_BASS_BOOST, false)) 0.6f else 0.0f
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user