#include #include #include #include #include #if defined(__ARM_NEON) #include #endif #ifndef M_PI #define M_PI 3.14159265358979323846 #endif // ============================================================================= // OPTIMIZED CONFIGURATION // ============================================================================= // Use L1/L2 cache-optimized block size (typical L1: 32KB, L2: 256KB) static constexpr int FFT_SIZE = 512; static constexpr int NUM_EQ_BANDS = 10; // Pre-compute constants at compile time static constexpr float INV_32768 = 1.0f / 32768.0f; static constexpr float SQRT_2_INV = 0.70710678f; // 1/sqrt(2) // Denormal protection - use single scalar instead of adding per-sample static constexpr float DENORMAL_OFFSET = 1e-18f; // EQ frequencies - static const for compile-time access static constexpr std::array EQ_FREQUENCIES = { 31.25f, 62.5f, 125.0f, 250.0f, 500.0f, 1000.0f, 2000.0f, 4000.0f, 8000.0f, 16000.0f }; // ============================================================================= // OPTIMIZED DSP CLASSES - Structure of Arrays (SoA) for cache efficiency // ============================================================================= struct alignas(16) BiquadBank { // Coefficients (SoA - better for SIMD loads) alignas(16) std::array a0{}, a1{}, a2{}, b1{}, b2{}; // State variables alignas(16) std::array z1{}, z2{}; // Active flags (packed into bitmask for branch-free processing) uint16_t activeMask = 0; // Pre-check if any EQ band is active - branch free [[nodiscard]] inline bool hasActiveBands() const { return activeMask != 0; } inline void setBandActive(int band, bool active) { if (active) activeMask |= (1 << band); else activeMask &= ~(1 << band); } // Optimized bulk processing for a single channel inline void processBlock(float* __restrict__ data, int count) { if (!this -> hasActiveBands()) return; for (int i = 0; i < count; i++) { float x = data[i]; // Process all bands (compiler will optimize for activeMask) #pragma GCC unroll 10 for (int b = 0; b < NUM_EQ_BANDS; b++) { if (activeMask & (1 << b)) { float y = x * a0[b] + z1[b]; z1[b] = x * a1[b] + z2[b] - b1[b] * y + DENORMAL_OFFSET; z2[b] = x * a2[b] - b2[b] * y; x = y; } } data[i] = x; } } void setPeakingEQ(int band, float sr, float f, float g, float bw) { if (band < 0 || band >= NUM_EQ_BANDS) return; const bool active = std::abs(g) > 0.1f; setBandActive(band, active); if (!active) return; const float A = powf(10.0f, g / 40.0f); const float w = 2.0f * static_cast(M_PI) * f / sr; const float alpha = sinf(w) * sinhf(logf(2.0f) / 2.0f * bw * w / sinf(w)); const float c = cosf(w); const float a0_raw = 1.0f + alpha / A; const float invA0 = 1.0f / a0_raw; a0[band] = (1.0f + alpha * A) * invA0; a1[band] = (-2.0f * c) * invA0; a2[band] = (1.0f - alpha * A) * invA0; b1[band] = (-2.0f * c) * invA0; b2[band] = (1.0f - alpha / A) * invA0; } }; // ============================================================================= // BASS BOOST // ============================================================================= struct alignas(16) BassFilter { alignas(16) float a0 = 1.2f, a1 = 1.2f, a2 = 1.2f, b1 = 0.0f, b2 = 0.0f; alignas(16) float z1 = 0.0f, z2 = 0.0f; bool active = false; BiquadBank myBank; inline float process(float x) { if (!active) return x; float y = x * a0 + z1; z1 = x * a1 + z2 - b1 * y + DENORMAL_OFFSET; z2 = x * a2 - b2 * y; if(y>1.2f) y=1.2f; else if(y<-1.2f) y=-1.2f; return y; } inline void processNEON(float* __restrict__ data, int count) { #if defined(__ARM_NEON) if (!active) return; int i = 0; for (; i <= count-4; i+=4) { float32x4_t x = vld1q_f32(data + i); for(int b=0;b0.01f; if(!active) return; float A=powf(10.0f,g/40.0f); float w=2.0f*static_cast(M_PI)*f/sr; float alpha=sinf(w)/2.0f*sqrtf((A+1.0f/A)*(1.0f/q-1.0f)+2.0f); float c=cosf(w),sqrtA=sqrtf(A); float a0_raw=(A+1.0f)+(A-1.0f)*c+2.0f*sqrtA*alpha; float invA0=1.0f/a0_raw; a0=A*((A+1.0f)-(A-1.0f)*c+2.0f*sqrtA*alpha)*invA0; a1=2.0f*A*((A-1.0f)-(A+1.0f)*c)*invA0; a2=A*((A+1.0f)-(A-1.0f)*c-2.0f*sqrtA*alpha)*invA0; b1=-2.0f*((A-1.0f)+(A+1.0f)*c)*invA0; b2=((A+1.0f)+(A-1.0f)*c-2.0f*sqrtA*alpha)*invA0; } }; // ============================================================================= // LOCK-FREE REVERB - Fixed-size circular buffers (no heap allocation) // ============================================================================= template struct CircularBuffer { alignas(16) std::array data = {}; int pos = 0; [[nodiscard]] inline float read() const { return data[pos]; } inline void write(float v) { data[pos] = v; } inline void advance() { pos = (pos + 1) % SIZE; } }; class ReverbOptimized { // Classic Schroeder: 4 parallel comb filters + 2 series allpass // Fixed buffer sizes for lock-free operation std::array, 4> combs; std::array, 2> allpasses; std::array combFeedback = {0.841f, 0.815f, 0.796f, 0.771f}; float mix = 0.0f; public: ReverbOptimized() = default; inline void setMix(float m) { mix = m; } // Branch-free processing with inline inlining inline float process(float x) { if (mix < 0.01f) return x; // Parallel comb filters (unrolled for ARM NEON) float out = 0.0f; #pragma GCC unroll 4 for (int i = 0; i < 4; i++) { float delayed = combs[i].read(); out += delayed; combs[i].write(x + delayed * combFeedback[i] + DENORMAL_OFFSET); combs[i].advance(); } out *= 0.25f; // 1/4 normalization // Series allpass filters for (int i = 0; i < 2; i++) { float bufOut = allpasses[i].read(); float xOut = -0.5f * out + bufOut; allpasses[i].write(out + 0.5f * bufOut); allpasses[i].advance(); out = xOut; } return x * (1.0f - mix) + out * mix; } // NEON-optimized block processing inline void processBlock(float* __restrict__ left, float* __restrict__ right, int count) { if (mix < 0.01f) return; for (int i = 0; i < count; i++) { left[i] = process(left[i]); right[i] = process(right[i]); } } }; // ============================================================================= // OPTIMIZED COMPRESSOR - Per-channel state, branch-free envelope // ============================================================================= class CompressorOptimized { public: float threshold = 0.3f; float ratio = 4.0f; float attack = 0.01f; float release = 0.2f; float sampleRate = 44100.0f; private: // Per-channel envelope state float envelopeL = 0.0f; float envelopeR = 0.0f; // Pre-computed coefficients float attackCoef = 0.0f; float releaseCoef = 0.0f; bool coefficientsValid = false; public: inline void updateCoefficients() { if (coefficientsValid) return; attackCoef = expf(-1.0f / (attack * sampleRate)); releaseCoef = expf(-1.0f / (release * sampleRate)); coefficientsValid = true; } inline void processBlock(float* __restrict__ buffer, int count, float& envelope) { updateCoefficients(); const int blockSize = 32; for(int b=0;bmaxVal) maxVal = absInput; } bool attackMode = maxVal > envelope; envelope = attackMode ? attackCoef*envelope + (1-attackCoef)*maxVal : releaseCoef*envelope + (1-releaseCoef)*maxVal; float gain = (envelope>threshold)? (threshold + (envelope-threshold)/ratio)/(envelope+1e-9f) : 1.0f; for(int i=0;i gLeftBuf; alignas(16) std::array gRightBuf; alignas(16) std::array gFFTData; alignas(16) std::array, FFT_SIZE> gFFTWork; // Fast FFT - iterative Cooley-Tukey inline void fastFFT(std::complex* __restrict__ data, int n) { // Bit-reversal permutation (iterative, cache-friendly) for (int i = 1, j = 0; i < n; i++) { int bit = n >> 1; for (; j & bit; bit >>= 1) j ^= bit; j ^= bit; if (i < j) std::swap(data[i], data[j]); } // Cooley-Tukey stages for (int len = 2; len <= n; len <<= 1) { float ang = -2.0f * static_cast(M_PI) / static_cast(len); // Pre-compute wlen - critical for performance std::complex wlen(cosf(ang), sinf(ang)); for (int i = 0; i < n; i += len) { std::complex w(1.0f); for (int j = 0; j < len / 2; j++) { std::complex u = data[i + j]; std::complex v = data[i + j + len / 2] * w; data[i + j] = u + v; data[i + j + len / 2] = u - v; w *= wlen; } } } } // ============================================================================= // HIGH-PERFORMANCE AUDIO PROCESSING // ============================================================================= // Fast soft clipping with polynomial approximation inline float fastSoftClip(float x) { // Branchless clipping using min/max float ax = fabsf(x); float sign = x > 0 ? 1.0f : -1.0f; if (ax > 1.0f) return sign; return x * (1.4f - 0.4f * x * x); } inline void applyAutoGain(float* buffer, int count){ int block = 128; for(int i=0; i(sz)); if(rms > 0.001f){ float target = gTargetRMS / rms; gCurrentGain = gCurrentGain*0.99f + target*0.01f; if(gCurrentGain > 2.0f) gCurrentGain = 2.0f; #if defined(__ARM_NEON) float32x4_t gVec = vdupq_n_f32(gCurrentGain); int j=0; for(; j<=sz-4; j+=4){ float32x4_t v = vld1q_f32(buffer + i + j); vst1q_f32(buffer + i + j, vmulq_f32(v, gVec)); } for(; j= 0 && b < NUM_EQ_BANDS) { gEqL.setPeakingEQ(b, 44100.0f, EQ_FREQUENCIES[b], g, 1.0f); gEqR.setPeakingEQ(b, 44100.0f, EQ_FREQUENCIES[b], g, 1.0f); } gEqEnabled = gEqL.hasActiveBands(); } JNIEXPORT void JNICALL Java_com_michatec_radio_helpers_NativeAudioProcessor_setBassBoost(JNIEnv*, jobject, jfloat g) { if (g > 0.01f) { gBassL.setLowShelf(44100.0f, 150.0f, g, SQRT_2_INV); gBassR.setLowShelf(44100.0f, 150.0f, g, SQRT_2_INV); gBassBoostEnabled = true; } else { gBassBoostEnabled = false; } } JNIEXPORT void JNICALL Java_com_michatec_radio_helpers_NativeAudioProcessor_setStereoWidth(JNIEnv*, jobject, jfloat w) { gStereoWidth = fmaxf(0.0f, fminf(w, 2.0f)); } JNIEXPORT jfloatArray JNICALL Java_com_michatec_radio_helpers_NativeAudioProcessor_getFftData(JNIEnv* env, jobject) { jfloatArray arr = env->NewFloatArray(256); env->SetFloatArrayRegion(arr, 0, 256, gFFTData.data()); return arr; } JNIEXPORT void JNICALL Java_com_michatec_radio_helpers_NativeAudioProcessor_processAudioDirect(JNIEnv* env, jobject, jobject byteBuffer, jint size) { auto* buffer = static_cast(env->GetDirectBufferAddress(byteBuffer)); if (!buffer) return; int numFrames = (size / 2) / 2; if (numFrames > 4096) numFrames = 4096; // Clamp to buffer size // ========================================================================= // STAGE 1: Convert to Float (NEON optimized, interleaved stereo) // ========================================================================= int i = 0; #if defined(__ARM_NEON) float32x4_t invScale = vdupq_n_f32(INV_32768); for (; i <= numFrames - 4; i += 4) { // Load interleaved 16-bit stereo, deinterleave to two floats int16x4x2_t raw = vld2_s16(buffer + i * 2); // Expand to 32-bit, convert to float, scale float32x4_t left = vmulq_f32(vcvtq_f32_s32(vmovl_s16(raw.val[0])), invScale); float32x4_t right = vmulq_f32(vcvtq_f32_s32(vmovl_s16(raw.val[1])), invScale); vst1q_f32(gLeftBuf.data() + i, left); vst1q_f32(gRightBuf.data() + i, right); } #endif // Scalar tail for (; i < numFrames; i++) { gLeftBuf[i] = static_cast(buffer[i * 2]) * INV_32768; gRightBuf[i] = static_cast(buffer[i * 2 + 1]) * INV_32768; } // ========================================================================= // STAGE 2: DSP Chain (EQ -> Bass -> Reverb -> Stereo Width) // ========================================================================= // EQ processing (branch-free based on active mask) if (gEqEnabled) { gEqL.processBlock(gLeftBuf.data(), numFrames); gEqR.processBlock(gRightBuf.data(), numFrames); } // Bass boost if (gBassBoostEnabled) { gBassL.processNEON(gLeftBuf.data(), numFrames); gBassR.processNEON(gRightBuf.data(), numFrames); } // Reverb gReverbL.processBlock(gLeftBuf.data(), gRightBuf.data(), numFrames); // Stereo width processing (branch-free) if (gStereoWidth != 1.0f) { float halfWidth = gStereoWidth * 0.5f; for (int j = 0; j < numFrames; j++) { float mid = (gLeftBuf[j] + gRightBuf[j]) * 0.5f; float side = (gLeftBuf[j] - gRightBuf[j]) * halfWidth; gLeftBuf[j] = mid + side; gRightBuf[j] = mid - side; } } // ========================================================================= // STAGE 3: Dynamic Control (AutoGain -> Compressor) // ========================================================================= applyAutoGain(gLeftBuf.data(), numFrames); applyAutoGain(gRightBuf.data(), numFrames); if (gDrcEnabled) { gCompressor.process(gLeftBuf.data(), gRightBuf.data(), numFrames); } // ========================================================================= // STAGE 4: FFT Analysis (downsampled for visualization) // ========================================================================= // Zero-pad for FFT (use first 256 samples only) for (int k = 0; k < FFT_SIZE; k++) { gFFTWork[k] = (k < 256) ? std::complex(gLeftBuf[k], 0.0f) : std::complex(0.0f, 0.0f); } fastFFT(gFFTWork.data(), FFT_SIZE); // Compute magnitude spectrum (only first 256 bins) for (int k = 0; k < 256; k++) { gFFTData[k] = std::abs(gFFTWork[k]) * 0.05f; } // ========================================================================= // STAGE 5: Convert back to 16-bit with soft clipping // ========================================================================= for (int k = 0; k < numFrames; k++) { buffer[k * 2] = static_cast(fastSoftClip(gLeftBuf[k]) * 32767.0f); buffer[k * 2 + 1] = static_cast(fastSoftClip(gRightBuf[k]) * 32767.0f); } } } // extern "C"