build: rename native audio library from radio to dsp

2026-05-31 05:12:41 +02:00 · 2026-04-06 15:16:41 +02:00
parent bc38742eae
commit 12445a3918
3 changed files with 2 additions and 2 deletions
@@ -0,0 +1,541 @@
+#include <jni.h>
+
+#include <array>
+#include <cmath>
+#include <complex>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+#if defined(__ARM_NEON)
+#include <arm_neon.h>
+#endif
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
+// =============================================================================
+// OPTIMIZED CONFIGURATION
+// =============================================================================
+
+// Length of the complex work buffer handed to fastFFT for the visualisation spectrum.
+static constexpr int FFT_SIZE{512};
+// Number of peaking-EQ bands stored in each BiquadBank.
+static constexpr int NUM_EQ_BANDS{10};
+
+// Scale factor applied to signed 16-bit PCM samples when converting them to float.
+static constexpr float INV_32768{1.0f / 32768.0f};
+// 1/sqrt(2); passed as the `q` argument when the bass low shelf is designed.
+static constexpr float SQRT_2_INV{0.70710678f};
+
+// Denormal protection: tiny constant added to recursive filter / delay-line state.
+static constexpr float DENORMAL_OFFSET{1e-18f};
+
+// Centre frequency of each EQ band, lowest band first (used with a 44100 sample rate).
+static constexpr std::array<float, NUM_EQ_BANDS> EQ_FREQUENCIES{{
+        31.25f, 62.5f, 125.0f, 250.0f, 500.0f,
+        1000.0f, 2000.0f, 4000.0f, 8000.0f, 16000.0f,
+}};
+
+// =============================================================================
+// OPTIMIZED DSP CLASSES - Structure of Arrays (SoA) for cache efficiency
+// =============================================================================
+
+// Bank of NUM_EQ_BANDS cascaded peaking-EQ biquads for a single audio channel.
+// Coefficients and state are kept as parallel per-band arrays: a0..a2 are the
+// feed-forward terms, b1/b2 the feedback terms (already divided by the leading
+// feedback coefficient). Bands whose bit is clear in activeMask are bypassed.
+struct alignas(16) BiquadBank {
+    static_assert(NUM_EQ_BANDS > 0 && NUM_EQ_BANDS <= 16,
+                  "activeMask holds one bit per band and is 16 bits wide");
+
+    // Per-band coefficients.
+    alignas(16) std::array<float, NUM_EQ_BANDS> a0{}, a1{}, a2{}, b1{}, b2{};
+    // Per-band delay state of the transposed direct-form II recursion below.
+    alignas(16) std::array<float, NUM_EQ_BANDS> z1{}, z2{};
+    // Bit b set => band b is applied by processBlock().
+    uint16_t activeMask = 0;
+
+    // True when at least one band is enabled.
+    [[nodiscard]] inline bool hasActiveBands() const { return activeMask != 0; }
+
+    // Enables or disables one band. Out-of-range indices are ignored so the
+    // shift below can never leave the mask. Disabling a band also clears its
+    // delay state, so a later re-enable starts from silence instead of
+    // resuming from whatever the filter last held.
+    inline void setBandActive(int band, bool active) {
+        if (band < 0 || band >= NUM_EQ_BANDS) return;
+        const unsigned bit = 1u << band;
+        if (active) {
+            activeMask = static_cast<uint16_t>(activeMask | bit);
+        } else {
+            activeMask = static_cast<uint16_t>(activeMask & ~bit);
+            z1[band] = 0.0f;
+            z2[band] = 0.0f;
+        }
+    }
+
+    // Filters `count` samples in place, running every enabled band in series
+    // on each sample. Does nothing for a null buffer, a non-positive count,
+    // or when no band is enabled.
+    inline void processBlock(float* __restrict__ data, int count) {
+        if (data == nullptr || count <= 0 || !hasActiveBands()) return;
+
+        const unsigned mask = activeMask;  // read once; the mask does not change mid-block
+        for (int i = 0; i < count; i++) {
+            float x = data[i];
+#pragma GCC unroll 10
+            for (int b = 0; b < NUM_EQ_BANDS; b++) {
+                if (mask & (1u << b)) {
+                    const float y = x * a0[b] + z1[b];
+                    // DENORMAL_OFFSET keeps the recursive state away from denormals.
+                    z1[b] = x * a1[b] + z2[b] - b1[b] * y + DENORMAL_OFFSET;
+                    z2[b] = x * a2[b] - b2[b] * y;
+                    x = y;  // output of this band feeds the next one
+                }
+            }
+            data[i] = x;
+        }
+    }
+
+    // Designs band `band` as a peaking EQ (formulas match the RBJ Audio EQ
+    // Cookbook peaking filter with bandwidth given in octaves).
+    //   band : band index, 0 .. NUM_EQ_BANDS-1 (other values are ignored)
+    //   sr   : sample rate
+    //   f    : centre frequency, same unit as sr; must lie in (0, sr/2)
+    //   g    : gain in dB; |g| <= 0.1 disables the band
+    //   bw   : bandwidth, must be > 0
+    // Parameters that would yield non-finite coefficients (NaN/inf input,
+    // f at or above Nyquist, non-positive sr/f/bw) disable the band instead of
+    // storing NaNs that would contaminate the filter state.
+    void setPeakingEQ(int band, float sr, float f, float g, float bw) {
+        if (band < 0 || band >= NUM_EQ_BANDS) return;
+
+        const bool wanted = std::isfinite(g) && std::abs(g) > 0.1f;
+        const bool designable = std::isfinite(sr) && sr > 0.0f
+                && f > 0.0f && f < 0.5f * sr && bw > 0.0f;
+        if (!wanted || !designable) {
+            setBandActive(band, false);
+            return;
+        }
+
+        const float A = powf(10.0f, g / 40.0f);
+        const float w = 2.0f * static_cast<float>(M_PI) * f / sr;
+        const float sinW = sinf(w);
+        const float alpha = sinW * sinhf(logf(2.0f) / 2.0f * bw * w / sinW);
+        const float c = cosf(w);
+
+        const float a0_raw = 1.0f + alpha / A;
+        const float invA0 = 1.0f / a0_raw;
+        if (!std::isfinite(alpha) || !std::isfinite(invA0)) {
+            // e.g. sinW underflowed to 0 or bw made sinh overflow
+            setBandActive(band, false);
+            return;
+        }
+
+        a0[band] = (1.0f + alpha * A) * invA0;
+        a1[band] = (-2.0f * c) * invA0;
+        a2[band] = (1.0f - alpha * A) * invA0;
+        b1[band] = (-2.0f * c) * invA0;
+        b2[band] = (1.0f - alpha / A) * invA0;
+
+        // Enable only after the coefficients are in place.
+        setBandActive(band, true);
+    }
+};
+
+// =============================================================================
+// BASS BOOST
+// =============================================================================
+// Single low-shelf biquad used for the bass boost, followed by a soft limiter.
+// a0..a2 are feed-forward terms, b1/b2 feedback terms, z1/z2 the delay state.
+struct alignas(16) BassFilter {
+    // Coefficients; process() ignores them while `active` is false.
+    alignas(16) float a0 = 1.2f, a1 = 1.2f, a2 = 1.2f, b1 = 0.0f, b2 = 0.0f;
+    alignas(16) float z1 = 0.0f, z2 = 0.0f;
+    bool active = false;
+    // Not used by this filter; kept so the struct's public members are unchanged.
+    BiquadBank myBank;
+
+    // Filters one sample. Returns `x` untouched while the filter is inactive.
+    // The result is passed through bassSafeClip() and then hard-limited to
+    // [-1.2, 1.2]; the limiting affects the output only, not the filter state.
+    inline float process(float x) {
+        if (!active) return x;
+        float y = x * a0 + z1;
+        z1 = x * a1 + z2 - b1 * y + DENORMAL_OFFSET;
+        z2 = x * a2 - b2 * y;
+        y = bassSafeClip(y);
+        if(y > 1.2f) y = 1.2f;
+        else if(y < -1.2f) y = -1.2f;
+        return y;
+    }
+
+    // Filters `count` samples in place through process().
+    //
+    // Despite the name, samples are handled one at a time on every target:
+    // each output depends on the state left behind by the previous sample, so
+    // four consecutive samples cannot be filtered as independent SIMD lanes.
+    // The earlier NEON path did exactly that, re-applied the filter
+    // NUM_EQ_BANDS times per sample, skipped the clipping, and left the tail
+    // samples unfiltered; ARM builds now produce the same output as all other
+    // builds.
+    inline void processNEON(float* __restrict__ data, int count) {
+        if (!active || data == nullptr) return;
+        for (int i = 0; i < count; i++) data[i] = process(data[i]);
+    }
+
+    // Halves whatever part of `x` lies beyond +/-1.0; values inside pass through.
+    static inline float bassSafeClip(float x) {
+        const float maxGain = 1.0f;
+        if (x > maxGain) return maxGain - (maxGain - x) * 0.5f;
+        if (x < -maxGain) return -maxGain - (-maxGain - x) * 0.5f;
+        return x;
+    }
+
+    // Designs the filter as a low shelf (formulas match the RBJ Audio EQ
+    // Cookbook low-shelf filter, with `q` in the role of the shelf slope).
+    //   sr : sample rate
+    //   f  : shelf frequency, same unit as sr; must lie in (0, sr/2)
+    //   g  : gain in dB; |g| <= 0.01 switches the filter off
+    //   q  : slope parameter, must be > 0
+    // Unusable parameters (NaN/inf, non-positive sr/f/q, f at or above
+    // Nyquist) switch the filter off rather than storing NaN coefficients.
+    // Switching off also clears the delay state so a later re-enable does not
+    // resume from stale values.
+    void setLowShelf(float sr,float f,float g,float q){
+        const bool usable = std::isfinite(g) && std::abs(g) > 0.01f
+                && std::isfinite(sr) && sr > 0.0f
+                && f > 0.0f && f < 0.5f * sr
+                && q > 0.0f;
+        if (!usable) {
+            active = false;
+            z1 = 0.0f;
+            z2 = 0.0f;
+            return;
+        }
+
+        const float A = powf(10.0f, g / 40.0f);
+        const float w = 2.0f * static_cast<float>(M_PI) * f / sr;
+        // The radicand turns negative for steep slopes at high gain; clamp it
+        // so sqrtf cannot return NaN.
+        const float radicand = fmaxf((A + 1.0f / A) * (1.0f / q - 1.0f) + 2.0f, 0.0f);
+        const float alpha = sinf(w) / 2.0f * sqrtf(radicand);
+        const float c = cosf(w);
+        const float sqrtA = sqrtf(A);
+        const float a0_raw = (A + 1.0f) + (A - 1.0f) * c + 2.0f * sqrtA * alpha;
+        const float invA0 = 1.0f / a0_raw;
+
+        a0 = A * ((A + 1.0f) - (A - 1.0f) * c + 2.0f * sqrtA * alpha) * invA0;
+        a1 = 2.0f * A * ((A - 1.0f) - (A + 1.0f) * c) * invA0;
+        a2 = A * ((A + 1.0f) - (A - 1.0f) * c - 2.0f * sqrtA * alpha) * invA0;
+        b1 = -2.0f * ((A - 1.0f) + (A + 1.0f) * c) * invA0;
+        b2 = ((A + 1.0f) + (A - 1.0f) * c - 2.0f * sqrtA * alpha) * invA0;
+
+        // Enable only once the coefficients are in place.
+        active = true;
+    }
+};
+
+// =============================================================================
+// LOCK-FREE REVERB - Fixed-size circular buffers (no heap allocation)
+// =============================================================================
+
+// Fixed-size ring of SIZE floats with a single cursor used for both reading
+// and writing.
+template<int SIZE>
+struct CircularBuffer {
+    alignas(16) std::array<float, SIZE> data = {};
+    int pos = 0;
+
+    // Value stored in the slot under the cursor.
+    [[nodiscard]] float read() const { return data[pos]; }
+
+    // Overwrites the slot under the cursor; the cursor itself does not move.
+    void write(float sample) { data[pos] = sample; }
+
+    // Moves the cursor one slot forward, wrapping back to 0 after SIZE - 1.
+    void advance() {
+        ++pos;
+        pos %= SIZE;
+    }
+
+};
+
+// Schroeder-style reverb: four parallel feedback comb filters averaged
+// together, followed by two allpass filters in series. All delay lines are
+// fixed-size members, so processing never allocates.
+//
+// NOTE(review): all four combs share one delay length (1116 samples) and both
+// allpasses share another (556); only the comb feedback gains differ. A
+// classic Schroeder design uses distinct delay lengths - confirm whether this
+// is intentional before changing it, since it alters the sound.
+class ReverbOptimized {
+    std::array<CircularBuffer<1116>, 4> combs;
+    std::array<CircularBuffer<556>, 2> allpasses;
+    std::array<float, 4> combFeedback = {0.841f, 0.815f, 0.796f, 0.771f};
+
+    // Wet/dry balance in [0, 1]; below 0.01 the reverb is bypassed.
+    float mix = 0.0f;
+
+public:
+    ReverbOptimized() = default;
+
+    // Sets the wet/dry balance. The value is clamped to [0, 1] and NaN is
+    // treated as 0: a mix above 1 would invert the dry signal in process(),
+    // and NaN would turn every output sample into NaN.
+    inline void setMix(float m) {
+        if (!(m > 0.0f)) {  // also catches NaN
+            mix = 0.0f;
+        } else {
+            mix = (m < 1.0f) ? m : 1.0f;
+        }
+    }
+
+    // Processes one sample and returns the dry/wet blend. While the mix is
+    // below 0.01 the input is returned unchanged and the delay lines are not
+    // advanced.
+    inline float process(float x) {
+        if (mix < 0.01f) return x;
+
+        // Parallel combs: read the delayed sample, feed input plus scaled
+        // feedback back into the line, then step the line.
+        float out = 0.0f;
+#pragma GCC unroll 4
+        for (int i = 0; i < 4; i++) {
+            const float delayed = combs[i].read();
+            out += delayed;
+            combs[i].write(x + delayed * combFeedback[i] + DENORMAL_OFFSET);
+            combs[i].advance();
+        }
+        out *= 0.25f;  // average of the four combs
+
+        // Series allpasses with gain 0.5.
+        for (int i = 0; i < 2; i++) {
+            const float bufOut = allpasses[i].read();
+            const float xOut = -0.5f * out + bufOut;
+            allpasses[i].write(out + 0.5f * bufOut);
+            allpasses[i].advance();
+            out = xOut;
+        }
+
+        return x * (1.0f - mix) + out * mix;
+    }
+
+    // Processes `count` frames in place. Both channels are pushed through this
+    // one instance, alternating left then right for every frame, so they share
+    // the same delay lines. Callers that need independent channels should call
+    // process() on one instance per channel instead.
+    inline void processBlock(float* __restrict__ left, float* __restrict__ right, int count) {
+        if (left == nullptr || right == nullptr || count <= 0) return;
+        if (mix < 0.01f) return;
+
+        for (int i = 0; i < count; i++) {
+            left[i] = process(left[i]);
+            right[i] = process(right[i]);
+        }
+    }
+};
+
+// =============================================================================
+// OPTIMIZED COMPRESSOR - Per-channel state, branch-free envelope
+// =============================================================================
+
+// Block-based dynamic range compressor with one envelope follower per channel.
+// The signal peak is measured over blocks of kBlockSize samples, smoothed with
+// separate attack/release time constants, and one gain is applied per block.
+class CompressorOptimized {
+public:
+    float threshold = 0.3f;       // envelope level above which gain reduction starts
+    float ratio = 4.0f;           // compression ratio applied above the threshold
+    float attack = 0.08f;         // attack time constant; attack * sampleRate is in samples
+    float release = 0.8f;         // release time constant; release * sampleRate is in samples
+    float sampleRate = 44100.0f;
+
+private:
+    // Number of samples sharing one peak measurement and one gain value.
+    static constexpr int kBlockSize = 32;
+
+    // Per-channel envelope state.
+    float envelopeL = 0.0f;
+    float envelopeR = 0.0f;
+
+    // Smoothing coefficients for one full block of kBlockSize samples.
+    float attackCoef = 0.0f;
+    float releaseCoef = 0.0f;
+    bool coefficientsValid = false;
+
+    // Parameter values the cached coefficients were computed from.
+    float cachedAttack = 0.0f;
+    float cachedRelease = 0.0f;
+    float cachedSampleRate = 0.0f;
+
+    // One-pole smoothing coefficient for a step of `samples` samples with time
+    // constant `seconds` at rate `rate`. A non-positive or NaN time constant
+    // yields 0, i.e. the envelope follows the input immediately.
+    static float smoothingCoef(float seconds, float rate, int samples) {
+        const float tau = seconds * rate;
+        if (!(tau > 0.0f)) return 0.0f;
+        return expf(-static_cast<float>(samples) / tau);
+    }
+
+public:
+    // Recomputes the cached coefficients when attack, release or sampleRate
+    // differ from the values they were last derived from. Previously they were
+    // computed once and never refreshed, so later parameter changes had no
+    // effect.
+    inline void updateCoefficients() {
+        if (coefficientsValid && attack == cachedAttack && release == cachedRelease
+                && sampleRate == cachedSampleRate) {
+            return;
+        }
+        attackCoef = smoothingCoef(attack, sampleRate, kBlockSize);
+        releaseCoef = smoothingCoef(release, sampleRate, kBlockSize);
+        cachedAttack = attack;
+        cachedRelease = release;
+        cachedSampleRate = sampleRate;
+        coefficientsValid = true;
+    }
+
+    // Compresses `count` samples of one channel in place.
+    //   buffer   : samples to process (null or empty input is a no-op)
+    //   envelope : that channel's envelope state, updated across calls
+    //
+    // The envelope is updated once per block, so the smoothing coefficient
+    // covers the whole block (exp(-blockLen / tau)). Using the per-sample
+    // coefficient here made attack and release about kBlockSize times slower
+    // than configured.
+    inline void processBlock(float* __restrict__ buffer, int count, float& envelope) {
+        if (buffer == nullptr || count <= 0) return;
+        updateCoefficients();
+
+        for (int b = 0; b < count; b += kBlockSize) {
+            const int sz = (count - b > kBlockSize) ? kBlockSize : count - b;
+
+            // Peak magnitude of the block (NaN samples are skipped by the comparison).
+            float maxVal = 0.0f;
+            for (int i = 0; i < sz; i++) {
+                const float absInput = fabsf(buffer[b + i]);
+                if (absInput > maxVal) maxVal = absInput;
+            }
+
+            const bool attackMode = maxVal > envelope;
+            float coef;
+            if (sz == kBlockSize) {
+                coef = attackMode ? attackCoef : releaseCoef;
+            } else {
+                // Short final block: derive the coefficient for its actual length.
+                coef = smoothingCoef(attackMode ? attack : release, sampleRate, sz);
+            }
+            envelope = coef * envelope + (1.0f - coef) * maxVal;
+
+            // Above the threshold the excess is divided by `ratio`. A
+            // non-positive ratio would divide by zero, so it is treated as
+            // "no compression".
+            float gain = 1.0f;
+            if (envelope > threshold && ratio > 0.0f) {
+                gain = (threshold + (envelope - threshold) / ratio) / (envelope + 1e-9f);
+            }
+            for (int i = 0; i < sz; i++) buffer[b + i] *= gain;
+        }
+    }
+
+    // Compresses both channels in place, each with its own envelope.
+    inline void process(float* __restrict__ left, float* __restrict__ right, int count) {
+        processBlock(left, count, envelopeL);
+        processBlock(right, count, envelopeR);
+    }
+};
+
+// =============================================================================
+// GLOBAL ENGINE STATE - process-wide objects shared by the JNI entry points
+// =============================================================================
+
+// DSP stages; the L/R pairs hold separate state for the left and right channel.
+CompressorOptimized gCompressor;
+ReverbOptimized gReverbL;
+ReverbOptimized gReverbR;
+BiquadBank gEqL;
+BiquadBank gEqR;
+BassFilter gBassL;
+BassFilter gBassR;
+
+// Switches and scalar parameters written by the JNI setters.
+bool gDrcEnabled{false};        // compressor stage on/off
+bool gEqEnabled{false};         // mirrors gEqL.hasActiveBands(), refreshed when an EQ band is set
+bool gBassBoostEnabled{false};  // bass shelf stage on/off
+float gStereoWidth{1.0f};       // 1.0 leaves the stereo image untouched
+float gTargetRMS{0.20f};        // RMS level applyAutoGain steers towards
+float gCurrentGain{1.0f};       // smoothed gain maintained by applyAutoGain
+
+// Statically allocated work buffers, so processing never touches the heap.
+alignas(16) std::array<float, 4096> gLeftBuf;
+alignas(16) std::array<float, 4096> gRightBuf;
+alignas(16) std::array<float, 256> gFFTData;
+alignas(16) std::array<std::complex<float>, FFT_SIZE> gFFTWork;
+
+// In-place iterative radix-2 Cooley-Tukey FFT (forward transform, unscaled).
+//   data : n complex samples, overwritten with the spectrum
+//   n    : transform length; must be a power of two
+// The call is a no-op when `data` is null or `n` is not a power of two >= 2:
+// for any other length the butterfly loops below would index past the end of
+// the buffer. (A length of 1 is its own transform.)
+inline void fastFFT(std::complex<float>* __restrict__ data, int n) {
+    if (data == nullptr || n < 2 || (n & (n - 1)) != 0) return;
+
+    constexpr float kPi = 3.14159265358979323846f;
+
+    // Bit-reversal permutation: j tracks the bit-reversed value of i.
+    for (int i = 1, j = 0; i < n; i++) {
+        int bit = n >> 1;
+        for (; j & bit; bit >>= 1) j ^= bit;
+        j ^= bit;
+        if (i < j) std::swap(data[i], data[j]);
+    }
+
+    // Butterfly stages, doubling the sub-transform length each pass.
+    for (int len = 2; len <= n; len <<= 1) {
+        const int half = len / 2;
+        const float ang = -2.0f * kPi / static_cast<float>(len);
+        // Twiddle step for this stage; w below is advanced by repeated multiplication.
+        const std::complex<float> wlen(cosf(ang), sinf(ang));
+
+        for (int i = 0; i < n; i += len) {
+            std::complex<float> w(1.0f);
+            for (int j = 0; j < half; j++) {
+                const std::complex<float> u = data[i + j];
+                const std::complex<float> v = data[i + j + half] * w;
+                data[i + j] = u + v;
+                data[i + j + half] = u - v;
+                w *= wlen;
+            }
+        }
+    }
+}
+
+// =============================================================================
+// HIGH-PERFORMANCE AUDIO PROCESSING
+// =============================================================================
+
+// Soft clipper built on a cubic polynomial.
+// Inputs beyond +/-1 saturate to exactly +/-1; inputs inside that range are
+// shaped by x * (1.4 - 0.4 * x^2), which reaches +/-1 at the boundaries.
+// NaN is mapped to 0 (silence): callers convert the result to a 16-bit
+// integer, and converting NaN to an integer is undefined behaviour.
+inline float fastSoftClip(float x) {
+    if (std::isnan(x)) return 0.0f;
+    if (x > 1.0f) return 1.0f;
+    if (x < -1.0f) return -1.0f;
+    return x * (1.4f - 0.4f * x * x);
+}
+
+// Sum of squares of `len` samples starting at `p` (NEON-accelerated where available).
+static inline float autoGainEnergy(const float* p, int len) {
+    float total = 0.0f;
+    int k = 0;
+#if defined(__ARM_NEON)
+    float32x4_t acc = vdupq_n_f32(0.0f);
+    while (k + 4 <= len) {
+        const float32x4_t s = vld1q_f32(p + k);
+        acc = vmlaq_f32(acc, s, s);
+        k += 4;
+    }
+    const float32x2_t lower = vget_low_f32(acc);
+    const float32x2_t upper = vget_high_f32(acc);
+    total = vget_lane_f32(lower, 0) + vget_lane_f32(lower, 1) + vget_lane_f32(upper, 0) + vget_lane_f32(upper, 1);
+#endif
+    // Scalar path, or the tail left over by the vector loop.
+    for (; k < len; ++k) total += p[k] * p[k];
+    return total;
+}
+
+// Multiplies `len` samples starting at `p` by `gain` in place.
+static inline void autoGainScale(float* p, int len, float gain) {
+    int k = 0;
+#if defined(__ARM_NEON)
+    const float32x4_t g4 = vdupq_n_f32(gain);
+    while (k + 4 <= len) {
+        vst1q_f32(p + k, vmulq_f32(vld1q_f32(p + k), g4));
+        k += 4;
+    }
+#endif
+    for (; k < len; ++k) p[k] *= gain;
+}
+
+// Automatic gain control over windows of 128 samples.
+// For each window whose RMS exceeds 0.001, the shared gCurrentGain is moved 1%
+// of the way towards gTargetRMS / rms, capped at 2.0, and the window is scaled
+// by it. Quieter windows are left untouched and do not update the gain.
+inline void applyAutoGain(float* buffer, int count){
+    constexpr int kWindow = 128;
+    for (int start = 0; start < count; start += kWindow) {
+        const int remaining = count - start;
+        const int len = (remaining > kWindow) ? kWindow : remaining;
+        float* const window = buffer + start;
+
+        const float rms = sqrtf(autoGainEnergy(window, len) / static_cast<float>(len));
+        if (!(rms > 0.001f)) continue;
+
+        const float target = gTargetRMS / rms;
+        gCurrentGain = gCurrentGain*0.99f + target*0.01f;
+        if (gCurrentGain > 2.0f) gCurrentGain = 2.0f;
+
+        autoGainScale(window, len, gCurrentGain);
+    }
+}
+
+// Scales the whole buffer down so its RMS does not exceed 0.8.
+// Buffers at or below that level are left untouched.
+inline void applyRMSLimit(float* buffer, int count){
+    constexpr float kCeiling = 0.8f;
+
+    float energy = 0.0f;
+    int idx = 0;
+    while (idx < count) {
+        energy += buffer[idx] * buffer[idx];
+        ++idx;
+    }
+
+    const float level = sqrtf(energy / float(count));
+    if (!(level > kCeiling)) return;
+
+    const float factor = kCeiling / level;
+    for (idx = 0; idx < count; ++idx) buffer[idx] *= factor;
+}
+
+// JNI entry points: parameter setters, spectrum readout and the main processing call
+extern "C" {
+
+// JNI setter: turns the compressor stage of processAudioDirect on or off.
+JNIEXPORT void JNICALL Java_com_michatec_radio_helpers_NativeAudioProcessor_setDrcEnabled(JNIEnv*, jobject, jboolean e) {
+    gDrcEnabled = (e != JNI_FALSE);
+}
+
+// JNI setter: forwards the same wet/dry mix to both reverb instances.
+JNIEXPORT void JNICALL Java_com_michatec_radio_helpers_NativeAudioProcessor_setReverbMix(JNIEnv*, jobject, jfloat m) {
+    ReverbOptimized* const reverbs[] = {&gReverbL, &gReverbR};
+    for (ReverbOptimized* reverb : reverbs) {
+        reverb->setMix(m);
+    }
+}
+
+// JNI setter: redesigns EQ band `b` on both channels with gain `g`, using the
+// band's centre frequency from EQ_FREQUENCIES, a 44100 sample rate and a
+// bandwidth of 1. Out-of-range band indices change no filter. gEqEnabled is
+// refreshed from the left bank on every call.
+JNIEXPORT void JNICALL Java_com_michatec_radio_helpers_NativeAudioProcessor_setEqBand(JNIEnv*, jobject, jint b, jfloat g) {
+    const bool inRange = (b >= 0) && (b < NUM_EQ_BANDS);
+    if (inRange) {
+        const float centre = EQ_FREQUENCIES[b];
+        BiquadBank* const banks[] = {&gEqL, &gEqR};
+        for (BiquadBank* bank : banks) {
+            bank->setPeakingEQ(b, 44100.0f, centre, g, 1.0f);
+        }
+    }
+    gEqEnabled = gEqL.hasActiveBands();
+}
+
+// JNI setter: a gain above 0.01 designs a 150 low shelf (44100 sample rate,
+// q = 1/sqrt(2)) on both channels and enables the bass stage; any other gain
+// disables the stage and leaves the filters as they are.
+JNIEXPORT void JNICALL Java_com_michatec_radio_helpers_NativeAudioProcessor_setBassBoost(JNIEnv*, jobject, jfloat g) {
+    const bool boost = g > 0.01f;
+    if (boost) {
+        BassFilter* const shelves[] = {&gBassL, &gBassR};
+        for (BassFilter* shelf : shelves) {
+            shelf->setLowShelf(44100.0f, 150.0f, g, SQRT_2_INV);
+        }
+    }
+    gBassBoostEnabled = boost;
+}
+
+// JNI setter: stores the stereo width, limited to the range [0, 2].
+JNIEXPORT void JNICALL Java_com_michatec_radio_helpers_NativeAudioProcessor_setStereoWidth(JNIEnv*, jobject, jfloat w) {
+    const float capped = fminf(w, 2.0f);
+    gStereoWidth = fmaxf(0.0f, capped);
+}
+
+// JNI getter: returns a new Java float[] holding a copy of the latest
+// magnitude spectrum in gFFTData.
+// Returns null when the array cannot be allocated; the JVM then already has an
+// exception pending, and passing the null array on to SetFloatArrayRegion
+// would be an error.
+JNIEXPORT jfloatArray JNICALL Java_com_michatec_radio_helpers_NativeAudioProcessor_getFftData(JNIEnv* env, jobject) {
+    const jsize bins = static_cast<jsize>(gFFTData.size());
+    jfloatArray arr = env->NewFloatArray(bins);
+    if (arr == nullptr) return nullptr;
+    env->SetFloatArrayRegion(arr, 0, bins, gFFTData.data());
+    return arr;
+}
+
+// Splits `frames` interleaved 16-bit stereo frames from `in` into gLeftBuf and
+// gRightBuf, scaling every sample by INV_32768.
+static void dspDeinterleave(const jshort* in, int frames) {
+    int i = 0;
+#if defined(__ARM_NEON)
+    const float32x4_t invScale = vdupq_n_f32(INV_32768);
+    for (; i <= frames - 4; i += 4) {
+        // De-interleaving load: val[0] = four left samples, val[1] = four right samples.
+        const int16x4x2_t raw = vld2_s16(in + i * 2);
+        vst1q_f32(gLeftBuf.data() + i, vmulq_f32(vcvtq_f32_s32(vmovl_s16(raw.val[0])), invScale));
+        vst1q_f32(gRightBuf.data() + i, vmulq_f32(vcvtq_f32_s32(vmovl_s16(raw.val[1])), invScale));
+    }
+#endif
+    // Scalar path, or the tail left over by the vector loop.
+    for (; i < frames; i++) {
+        gLeftBuf[i] = static_cast<float>(in[i * 2]) * INV_32768;
+        gRightBuf[i] = static_cast<float>(in[i * 2 + 1]) * INV_32768;
+    }
+}
+
+// Runs EQ -> bass -> reverb -> stereo width -> auto gain -> compressor over the
+// first `frames` samples of gLeftBuf / gRightBuf.
+static void dspRunChain(int frames) {
+    float* const left = gLeftBuf.data();
+    float* const right = gRightBuf.data();
+
+    if (gEqEnabled) {
+        gEqL.processBlock(left, frames);
+        gEqR.processBlock(right, frames);
+    }
+
+    if (gBassBoostEnabled) {
+        gBassL.processNEON(left, frames);
+        gBassR.processNEON(right, frames);
+
+        applyRMSLimit(left, frames);
+        applyRMSLimit(right, frames);
+    }
+
+    // One reverb instance per channel. Routing both channels through a single
+    // instance interleaves them in the same delay lines, which bleeds left
+    // into right and halves the effective delay time.
+    for (int j = 0; j < frames; j++) {
+        left[j] = gReverbL.process(left[j]);
+        right[j] = gReverbR.process(right[j]);
+    }
+
+    // Mid/side width control; a width of exactly 1 leaves the signal untouched.
+    if (gStereoWidth != 1.0f) {
+        const float halfWidth = gStereoWidth * 0.5f;
+        for (int j = 0; j < frames; j++) {
+            const float mid = (left[j] + right[j]) * 0.5f;
+            const float side = (left[j] - right[j]) * halfWidth;
+            left[j] = mid + side;
+            right[j] = mid - side;
+        }
+    }
+
+    applyAutoGain(left, frames);
+    applyAutoGain(right, frames);
+
+    if (gDrcEnabled) {
+        gCompressor.process(left, right, frames);
+    }
+}
+
+// Refreshes gFFTData from the first 256 left-channel samples, zero-padded to
+// FFT_SIZE. When fewer than 256 frames are valid the remainder is zero-padded
+// too, so stale samples from an earlier call never reach the spectrum.
+static void dspUpdateSpectrum(int frames) {
+    constexpr int kWindow = 256;
+    const int valid = (frames < kWindow) ? frames : kWindow;
+    for (int k = 0; k < FFT_SIZE; k++) {
+        gFFTWork[k] = (k < valid) ? std::complex<float>(gLeftBuf[k], 0.0f)
+                                  : std::complex<float>(0.0f, 0.0f);
+    }
+    fastFFT(gFFTWork.data(), FFT_SIZE);
+
+    const int bins = static_cast<int>(gFFTData.size());
+    for (int k = 0; k < bins; k++) {
+        gFFTData[k] = std::abs(gFFTWork[k]) * 0.05f;
+    }
+}
+
+// Soft-clips the first `frames` samples of gLeftBuf / gRightBuf and writes them
+// back to `out` as interleaved 16-bit stereo.
+static void dspInterleave(jshort* out, int frames) {
+    for (int k = 0; k < frames; k++) {
+        out[k * 2] = static_cast<jshort>(fastSoftClip(gLeftBuf[k]) * 32767.0f);
+        out[k * 2 + 1] = static_cast<jshort>(fastSoftClip(gRightBuf[k]) * 32767.0f);
+    }
+}
+
+// JNI entry point: processes interleaved 16-bit stereo PCM in place inside a
+// direct ByteBuffer.
+//   byteBuffer : direct buffer holding the samples
+//   size       : number of valid bytes in the buffer (4 bytes per stereo frame)
+// Returns without doing anything when the buffer is null, is not a direct
+// buffer, or holds no complete frame.
+//
+// The audio is handled in chunks that fit the static work buffers, so buffers
+// longer than one chunk are processed in full instead of having their tail
+// passed through unprocessed. `size` is also limited to the buffer's reported
+// capacity so a too-large value cannot cause reads or writes past its end.
+// The visualisation spectrum is refreshed once per call, from the first chunk.
+JNIEXPORT void JNICALL Java_com_michatec_radio_helpers_NativeAudioProcessor_processAudioDirect(JNIEnv* env, jobject, jobject byteBuffer, jint size) {
+    if (byteBuffer == nullptr || size <= 0) return;
+
+    auto* buffer = static_cast<jshort*>(env->GetDirectBufferAddress(byteBuffer));
+    if (!buffer) return;
+
+    // GetDirectBufferCapacity returns -1 when the capacity is unavailable; in
+    // that case `size` is used as given.
+    jlong byteCount = size;
+    const jlong capacity = env->GetDirectBufferCapacity(byteBuffer);
+    if (capacity >= 0 && byteCount > capacity) byteCount = capacity;
+
+    const int totalFrames = static_cast<int>(byteCount / 4);  // 2 channels * 2 bytes
+    const int maxChunk = static_cast<int>(gLeftBuf.size());
+
+    bool spectrumDone = false;
+    for (int done = 0; done < totalFrames; ) {
+        const int left = totalFrames - done;
+        const int frames = (left < maxChunk) ? left : maxChunk;
+        jshort* const chunk = buffer + done * 2;
+
+        dspDeinterleave(chunk, frames);
+        dspRunChain(frames);
+        if (!spectrumDone) {
+            dspUpdateSpectrum(frames);
+            spectrumDone = true;
+        }
+        dspInterleave(chunk, frames);
+
+        done += frames;
+    }
+}
+
+} // extern "C"