//$ nobt
//$ nocpp

/**
 * @file CDSPRealFFT.h
 *
 * @brief Real-valued FFT transform class.
 *
 * This file includes FFT object implementation. All created FFT objects are
 * kept in a global list after use, for a future reusal. Such approach
 * minimizes time necessary to initialize the FFT object of the required
 * length.
 *
 * r8brain-free-src Copyright (c) 2013-2022 Aleksey Vaneev
 * See the "LICENSE" file for license.
 */

#ifndef R8B_CDSPREALFFT_INCLUDED
#define R8B_CDSPREALFFT_INCLUDED

#include "r8bbase.h"

#if !R8B_IPP && !R8B_PFFFT && !R8B_PFFFT_DOUBLE
	#include "fft4g.h"
#endif // !R8B_IPP && !R8B_PFFFT && !R8B_PFFFT_DOUBLE

#if R8B_PFFFT
	#include "pffft.h"
#endif // R8B_PFFFT

#if R8B_PFFFT_DOUBLE
	#include "pffft_double/pffft_double.h"
#endif // R8B_PFFFT_DOUBLE

namespace r8b {

/**
 * @brief Real-valued FFT transform class.
 *
 * Class implements a wrapper for real-valued discrete fast Fourier transform
 * functions. The object of this class can only be obtained via the
 * CDSPRealFFTKeeper class.
 *
 * Uses functions from the FFT package by: Copyright(C) 1996-2001 Takuya OOURA
 * http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html
 *
 * Also uses Intel IPP library functions if available (if the R8B_IPP=1 macro
 * was defined). Note that IPP library's FFT functions are 2-3 times more
 * efficient on the modern Intel Core i7-3770K processor than Ooura's
 * functions. It may be worthwhile investing in IPP. Note, that FFT functions
 * take less than 20% of the overall sample rate conversion time. However,
 * when the "power of 2" resampling is used the performance of FFT functions
 * becomes "everything".
 */

class CDSPRealFFT : public R8B_BASECLASS
{
	R8BNOCTOR( CDSPRealFFT );

	friend class CDSPRealFFTKeeper;

public:
	/**
	 * @return A multiplication constant that should be used after inverse
	 * transform to obtain a correct value scale.
	 */

	double getInvMulConst() const
	{
		return( InvMulConst );
	}

	/**
	 * @return The length (the number of real values in a transform) of *this
	 * FFT object, expressed as Nth power of 2.
	 */

	int getLenBits() const
	{
		return( LenBits );
	}

	/**
	 * @return The length (the number of real values in a transform) of *this
	 * FFT object.
	 */

	int getLen() const
	{
		return( Len );
	}

	/**
	 * Function performs in-place forward FFT.
	 *
	 * @param[in,out] p Pointer to data block to transform, length should be
	 * equal to *this object's getLen().
	 */

	void forward( double* const p ) const
	{
	#if R8B_FLOATFFT

		float* const op = (float*) p;
		int i;

		for( i = 0; i < Len; i++ )
		{
			op[ i ] = (float) p[ i ];
		}

	#endif // R8B_FLOATFFT

	#if R8B_IPP

		ippsFFTFwd_RToPerm_64f( p, p, SPtr, WorkBuffer );

	#elif R8B_PFFFT

		pffft_transform_ordered( setup, op, op, work, PFFFT_FORWARD );

	#elif R8B_PFFFT_DOUBLE

		pffftd_transform_ordered( setup, p, p, work, PFFFT_FORWARD );

	#else // R8B_PFFFT_DOUBLE

		ooura_fft :: rdft( Len, 1, p, wi.getPtr(), wd.getPtr() );

	#endif // R8B_IPP
	}

	/**
	 * Function performs in-place inverse FFT.
	 *
	 * @param[in,out] p Pointer to data block to transform, length should be
	 * equal to *this object's getLen().
	 */

	void inverse( double* const p ) const
	{
	#if R8B_IPP

		ippsFFTInv_PermToR_64f( p, p, SPtr, WorkBuffer );

	#elif R8B_PFFFT

		pffft_transform_ordered( setup, (float*) p, (float*) p, work,
			PFFFT_BACKWARD );

	#elif R8B_PFFFT_DOUBLE

		pffftd_transform_ordered( setup, p, p, work, PFFFT_BACKWARD );

	#else // R8B_PFFFT_DOUBLE

		ooura_fft :: rdft( Len, -1, p, wi.getPtr(), wd.getPtr() );

	#endif // R8B_IPP

	#if R8B_FLOATFFT

		const float* const ip = (const float*) p;
		int i;

		for( i = Len - 1; i >= 0; i-- )
		{
			p[ i ] = ip[ i ];
		}

	#endif // R8B_FLOATFFT
	}

	/**
	 * Function multiplies two complex-valued data blocks and places result in
	 * a new data block. Length of all data blocks should be equal to *this
	 * object's block length. Input blocks should have been produced with the
	 * forward() function of *this object.
	 *
	 * @param aip1 Input data block 1.
	 * @param aip2 Input data block 2.
	 * @param[out] aop Output data block, should not be equal to aip1 nor
	 * aip2.
	 */

	void multiplyBlocks( const double* const aip1, const double* const aip2,
		double* const aop ) const
	{
	#if R8B_FLOATFFT

		const float* const ip1 = (const float*) aip1;
		const float* const ip2 = (const float*) aip2;
		float* const op = (float*) aop;

	#else // R8B_FLOATFFT

		const double* const ip1 = aip1;
		const double* const ip2 = aip2;
		double* const op = aop;

	#endif // R8B_FLOATFFT

	#if R8B_IPP

		ippsMulPerm_64f( (Ipp64f*) ip1, (Ipp64f*) ip2, (Ipp64f*) op, Len );

	#else // R8B_IPP

		op[ 0 ] = ip1[ 0 ] * ip2[ 0 ];
		op[ 1 ] = ip1[ 1 ] * ip2[ 1 ];

		int i = 2;

		while( i < Len )
		{
			op[ i ] = ip1[ i ] * ip2[ i ] - ip1[ i + 1 ] * ip2[ i + 1 ];
			op[ i + 1 ] = ip1[ i ] * ip2[ i + 1 ] + ip1[ i + 1 ] * ip2[ i ];
			i += 2;
		}

	#endif // R8B_IPP
	}

	/**
	 * Function multiplies two complex-valued data blocks in-place. Length of
	 * both data blocks should be equal to *this object's block length. Blocks
	 * should have been produced with the forward() function of *this object.
	 *
	 * @param aip Input data block 1.
	 * @param[in,out] aop Output/input data block 2.
	 */

	void multiplyBlocks( const double* const aip, double* const aop ) const
	{
	#if R8B_FLOATFFT

		const float* const ip = (const float*) aip;
		float* const op = (float*) aop;
		float t;

	#else // R8B_FLOATFFT

		const double* const ip = aip;
		double* const op = aop;

		#if !R8B_IPP
		double t;
		#endif // !R8B_IPP

	#endif // R8B_FLOATFFT

	#if R8B_IPP

		ippsMulPerm_64f( (Ipp64f*) op, (Ipp64f*) ip, (Ipp64f*) op, Len );

	#else // R8B_IPP

		op[ 0 ] *= ip[ 0 ];
		op[ 1 ] *= ip[ 1 ];

		int i = 2;

		while( i < Len )
		{
			t = op[ i ] * ip[ i ] - op[ i + 1 ] * ip[ i + 1 ];
			op[ i + 1 ] = op[ i ] * ip[ i + 1 ] + op[ i + 1 ] * ip[ i ];
			op[ i ] = t;
			i += 2;
		}

	#endif // R8B_IPP
	}

	/**
	 * Function multiplies two complex-valued data blocks in-place,
	 * considering that the "ip" block contains "zero-phase" response. Length
	 * of both data blocks should be equal to *this object's block length.
	 * Blocks should have been produced with the forward() function of *this
	 * object.
	 *
	 * @param aip Input data block 1, "zero-phase" response. This block should
	 * be first transformed via the convertToZP() function.
	 * @param[in,out] aop Output/input data block 2.
	 */

	void multiplyBlocksZP( const double* const aip, double* const aop ) const
	{
	#if R8B_FLOATFFT

		const float* const ip = (const float*) aip;
		float* const op = (float*) aop;

	#else // R8B_FLOATFFT

		const double* ip = aip;
		double* op = aop;

	#endif // R8B_FLOATFFT

	// SIMD implementations assume that pointers are address-aligned.

	#if !R8B_FLOATFFT && defined( R8B_SSE2 )

		int c8 = Len >> 3;

		while( c8 != 0 )
		{
			const __m128d iv1 = _mm_load_pd( ip );
			const __m128d iv2 = _mm_load_pd( ip + 2 );
			const __m128d ov1 = _mm_load_pd( op );
			const __m128d ov2 = _mm_load_pd( op + 2 );
			_mm_store_pd( op, _mm_mul_pd( iv1, ov1 ));
			_mm_store_pd( op + 2, _mm_mul_pd( iv2, ov2 ));

			const __m128d iv3 = _mm_load_pd( ip + 4 );
			const __m128d ov3 = _mm_load_pd( op + 4 );
			const __m128d iv4 = _mm_load_pd( ip + 6 );
			const __m128d ov4 = _mm_load_pd( op + 6 );
			_mm_store_pd( op + 4, _mm_mul_pd( iv3, ov3 ));
			_mm_store_pd( op + 6, _mm_mul_pd( iv4, ov4 ));

			ip += 8;
			op += 8;
			c8--;
		}

		int c = Len & 7;

		while( c != 0 )
		{
			*op *= *ip;
			ip++;
			op++;
			c--;
		}

	#elif !R8B_FLOATFFT && defined( R8B_NEON )

		int c8 = Len >> 3;

		while( c8 != 0 )
		{
			const float64x2_t iv1 = vld1q_f64( ip );
			const float64x2_t iv2 = vld1q_f64( ip + 2 );
			const float64x2_t ov1 = vld1q_f64( op );
			const float64x2_t ov2 = vld1q_f64( op + 2 );
			vst1q_f64( op, vmulq_f64( iv1, ov1 ));
			vst1q_f64( op + 2, vmulq_f64( iv2, ov2 ));

			const float64x2_t iv3 = vld1q_f64( ip + 4 );
			const float64x2_t iv4 = vld1q_f64( ip + 6 );
			const float64x2_t ov3 = vld1q_f64( op + 4 );
			const float64x2_t ov4 = vld1q_f64( op + 6 );
			vst1q_f64( op + 4, vmulq_f64( iv3, ov3 ));
			vst1q_f64( op + 6, vmulq_f64( iv4, ov4 ));

			ip += 8;
			op += 8;
			c8--;
		}

		int c = Len & 7;

		while( c != 0 )
		{
			*op *= *ip;
			ip++;
			op++;
			c--;
		}

	#else // SIMD

		int i;

		for( i = 0; i < Len; i++ )
		{
			op[ i ] *= ip[ i ];
		}

	#endif // SIMD
	}

	/**
	 * Function converts the specified forward-transformed block into
	 * "zero-phase" form suitable for use with the multiplyBlocksZ() function.
	 *
	 * @param[in,out] ap Block to transform.
	 */

	void convertToZP( double* const ap ) const
	{
	#if R8B_FLOATFFT

		float* const p = (float*) ap;

	#else // R8B_FLOATFFT

		double* const p = ap;

	#endif // R8B_FLOATFFT

		int i = 2;

		while( i < Len )
		{
			p[ i + 1 ] = p[ i ];
			i += 2;
		}
	}

private:
	int LenBits; ///< Length of FFT block (expressed as Nth power of 2).
		///<
	int Len; ///< Length of FFT block (number of real values).
		///<
	double InvMulConst; ///< Inverse FFT multiply constant.
		///<
	CDSPRealFFT* Next; ///< Next object in a singly-linked list.
		///<

	#if R8B_IPP
		IppsFFTSpec_R_64f* SPtr; ///< Pointer to initialized data buffer
			///< to be passed to IPP's FFT functions.
			///<
		CFixedBuffer< unsigned char > SpecBuffer; ///< Working buffer.
			///<
		CFixedBuffer< unsigned char > WorkBuffer; ///< Working buffer.
			///<
	#elif R8B_PFFFT
		PFFFT_Setup* setup; ///< PFFFT setup object.
			///<
		CFixedBuffer< float > work; ///< Working buffer.
			///<
	#elif R8B_PFFFT_DOUBLE
		PFFFTD_Setup* setup; ///< PFFFTD setup object.
			///<
		CFixedBuffer< double > work; ///< Working buffer.
			///<
	#else // R8B_PFFFT_DOUBLE
		CFixedBuffer< int > wi; ///< Working buffer (ints).
			///<
		CFixedBuffer< double > wd; ///< Working buffer (doubles).
			///<
	#endif // R8B_IPP

	/**
	 * A simple class that keeps the pointer to the object and deletes it
	 * automatically.
	 */

	class CObjKeeper
	{
		R8BNOCTOR( CObjKeeper );

	public:
		CObjKeeper()
			: Object( NULL )
		{
		}

		~CObjKeeper()
		{
			delete Object;
		}

		CObjKeeper& operator = ( CDSPRealFFT* const aObject )
		{
			Object = aObject;
			return( *this );
		}

		operator CDSPRealFFT* () const
		{
			return( Object );
		}

	private:
		CDSPRealFFT* Object; ///< FFT object being kept.
			///<
	};

	CDSPRealFFT()
	{
	}

	/**
	 * Constructor initializes FFT object.
	 *
	 * @param aLenBits The length of FFT block (Nth power of 2), specifies the
	 * number of real values in a block. Values from 1 to 30 inclusive are
	 * supported.
	 */

	CDSPRealFFT( const int aLenBits )
		: LenBits( aLenBits )
		, Len( 1 << aLenBits )
	#if R8B_IPP
		, InvMulConst( 1.0 / Len )
	#elif R8B_PFFFT
		, InvMulConst( 1.0 / Len )
	#elif R8B_PFFFT_DOUBLE
		, InvMulConst( 1.0 / Len )
	#else // R8B_PFFFT_DOUBLE
		, InvMulConst( 2.0 / Len )
	#endif // R8B_IPP
	{
	#if R8B_IPP

		int SpecSize;
		int SpecBufferSize;
		int BufferSize;

		ippsFFTGetSize_R_64f( LenBits, IPP_FFT_NODIV_BY_ANY,
			ippAlgHintFast, &SpecSize, &SpecBufferSize, &BufferSize );

		CFixedBuffer< unsigned char > InitBuffer( SpecBufferSize );
		SpecBuffer.alloc( SpecSize );
		WorkBuffer.alloc( BufferSize );

		ippsFFTInit_R_64f( &SPtr, LenBits, IPP_FFT_NODIV_BY_ANY,
			ippAlgHintFast, SpecBuffer, InitBuffer );

	#elif R8B_PFFFT

		setup = pffft_new_setup( Len, PFFFT_REAL );
		work.alloc( Len );

	#elif R8B_PFFFT_DOUBLE

		setup = pffftd_new_setup( Len, PFFFT_REAL );
		work.alloc( Len );

	#else // R8B_PFFFT_DOUBLE

		wi.alloc( (int) ceil( 2.0 + sqrt( (double) ( Len >> 1 ))));
		wi[ 0 ] = 0;
		wd.alloc( Len >> 1 );

	#endif // R8B_IPP
	}

	~CDSPRealFFT()
	{
		#if R8B_PFFFT
			pffft_destroy_setup( setup );
		#elif R8B_PFFFT_DOUBLE
			pffftd_destroy_setup( setup );
		#endif // R8B_PFFFT_DOUBLE

		delete Next;
	}
};

/**
 * @brief A "keeper" class for real-valued FFT transform objects.
 *
 * Class implements "keeper" functionality for handling CDSPRealFFT objects.
 * The allocated FFT objects are placed on the global static list of objects
 * for future reuse instead of deallocation.
 */

class CDSPRealFFTKeeper : public R8B_BASECLASS
{
	R8BNOCTOR( CDSPRealFFTKeeper );

public:
	CDSPRealFFTKeeper()
		: Object( NULL )
	{
	}

	/**
	 * Function acquires FFT object with the specified block length.
	 *
	 * @param LenBits The length of FFT block (Nth power of 2), in the range
	 * [1; 30] inclusive, specifies the number of real values in a FFT block.
	 */

	CDSPRealFFTKeeper( const int LenBits )
	{
		Object = acquire( LenBits );
	}

	~CDSPRealFFTKeeper()
	{
		if( Object != NULL )
		{
			release( Object );
		}
	}

	/**
	 * @return Pointer to the acquired FFT object.
	 */

	const CDSPRealFFT* operator -> () const
	{
		R8BASSERT( Object != NULL );

		return( Object );
	}

	/**
	 * Function acquires FFT object with the specified block length. This
	 * function can be called any number of times.
	 *
	 * @param LenBits The length of FFT block (Nth power of 2), in the range
	 * [1; 30] inclusive, specifies the number of real values in a FFT block.
	 */

	void init( const int LenBits )
	{
		if( Object != NULL )
		{
			if( Object -> LenBits == LenBits )
			{
				return;
			}

			release( Object );
		}

		Object = acquire( LenBits );
	}

	/**
	 * Function releases a previously acquired FFT object.
	 */

	void reset()
	{
		if( Object != NULL )
		{
			release( Object );
			Object = NULL;
		}
	}

private:
	CDSPRealFFT* Object; ///< FFT object.
		///<

	static CSyncObject StateSync; ///< FFTObjects synchronizer.
		///<
	static CDSPRealFFT :: CObjKeeper FFTObjects[]; ///< Pool of FFT objects of
		///< various lengths.
		///<

	/**
	 * Function acquires FFT object from the global pool.
	 *
	 * @param LenBits FFT block length (expressed as Nth power of 2).
	 */

	CDSPRealFFT* acquire( const int LenBits )
	{
		R8BASSERT( LenBits > 0 && LenBits <= 30 );

		R8BSYNC( StateSync );

		if( FFTObjects[ LenBits ] == NULL )
		{
			return( new CDSPRealFFT( LenBits ));
		}

		CDSPRealFFT* ffto = FFTObjects[ LenBits ];
		FFTObjects[ LenBits ] = ffto -> Next;

		return( ffto );
	}

	/**
	 * Function releases a previously acquired FFT object.
	 *
	 * @param ffto FFT object to release.
	 */

	void release( CDSPRealFFT* const ffto )
	{
		R8BSYNC( StateSync );

		ffto -> Next = FFTObjects[ ffto -> LenBits ];
		FFTObjects[ ffto -> LenBits ] = ffto;
	}
};

/**
 * Function calculates the minimum-phase transform of the filter kernel, using
 * a discrete Hilbert transform in cepstrum domain.
 *
 * For more details, see part III.B of
 * http://www.hpl.hp.com/personal/Niranjan_Damera-Venkata/files/ComplexMinPhase.pdf
 *
 * @param[in,out] Kernel Filter kernel buffer.
 * @param KernelLen Filter kernel's length, in samples.
 * @param LenMult Kernel length multiplier. Used as a coefficient of the
 * "oversampling" in the frequency domain. Such oversampling is needed to
 * improve the precision of the minimum-phase transform. If the filter's
 * attenuation is high, this multiplier should be increased or otherwise the
 * required attenuation will not be reached due to "smoothing" effect of this
 * transform.
 * @param DoFinalMul "True" if the final multiplication after transform should
 * be performed or not. Such multiplication returns the gain of the signal to
 * its original value. This parameter can be set to "false" if normalization
 * of the resulting filter kernel is planned to be used.
 * @param[out] DCGroupDelay If not NULL, this variable receives group delay
 * at DC offset, in samples (can be a non-integer value).
 */

inline void calcMinPhaseTransform( double* const Kernel, const int KernelLen,
	const int LenMult = 2, const bool DoFinalMul = true,
	double* const DCGroupDelay = NULL )
{
	R8BASSERT( KernelLen > 0 );
	R8BASSERT( LenMult >= 2 );

	const int LenBits = getBitOccupancy(( KernelLen * LenMult ) - 1 );
	const int Len = 1 << LenBits;
	const int Len2 = Len >> 1;
	int i;

	CFixedBuffer< double > ip( Len );
	CFixedBuffer< double > ip2( Len2 + 1 );

	memcpy( &ip[ 0 ], Kernel, KernelLen * sizeof( ip[ 0 ]));
	memset( &ip[ KernelLen ], 0, ( Len - KernelLen ) * sizeof( ip[ 0 ]));

	CDSPRealFFTKeeper ffto( LenBits );
	ffto -> forward( ip );

	// Create the "log |c|" spectrum while saving the original power spectrum
	// in the "ip2" buffer.

	#if R8B_FLOATFFT
		float* const aip = (float*) &ip[ 0 ];
		float* const aip2 = (float*) &ip2[ 0 ];
		const float nzbias = 1e-35;
	#else // R8B_FLOATFFT
		double* const aip = &ip[ 0 ];
		double* const aip2 = &ip2[ 0 ];
		const double nzbias = 1e-300;
	#endif // R8B_FLOATFFT

	aip2[ 0 ] = aip[ 0 ];
	aip[ 0 ] = log( fabs( aip[ 0 ]) + nzbias );
	aip2[ Len2 ] = aip[ 1 ];
	aip[ 1 ] = log( fabs( aip[ 1 ]) + nzbias );

	for( i = 1; i < Len2; i++ )
	{
		aip2[ i ] = sqrt( aip[ i * 2 ] * aip[ i * 2 ] +
			aip[ i * 2 + 1 ] * aip[ i * 2 + 1 ]);

		aip[ i * 2 ] = log( aip2[ i ] + nzbias );
		aip[ i * 2 + 1 ] = 0.0;
	}

	// Convert to cepstrum and apply discrete Hilbert transform.

	ffto -> inverse( ip );

	const double m1 = ffto -> getInvMulConst();
	const double m2 = -m1;

	ip[ 0 ] = 0.0;

	for( i = 1; i < Len2; i++ )
	{
		ip[ i ] *= m1;
	}

	ip[ Len2 ] = 0.0;

	for( i = Len2 + 1; i < Len; i++ )
	{
		ip[ i ] *= m2;
	}

	// Convert Hilbert-transformed cepstrum back to the "log |c|" spectrum and
	// perform its exponentiation, multiplied by the power spectrum previously
	// saved in the "ip2" buffer.

	ffto -> forward( ip );

	aip[ 0 ] = aip2[ 0 ];
	aip[ 1 ] = aip2[ Len2 ];

	for( i = 1; i < Len2; i++ )
	{
		aip[ i * 2 + 0 ] = cos( aip[ i * 2 + 1 ]) * aip2[ i ];
		aip[ i * 2 + 1 ] = sin( aip[ i * 2 + 1 ]) * aip2[ i ];
	}

	ffto -> inverse( ip );

	if( DoFinalMul )
	{
		for( i = 0; i < KernelLen; i++ )
		{
			Kernel[ i ] = ip[ i ] * m1;
		}
	}
	else
	{
		memcpy( &Kernel[ 0 ], &ip[ 0 ], KernelLen * sizeof( Kernel[ 0 ]));
	}

	if( DCGroupDelay != NULL )
	{
		double tmp;

		calcFIRFilterResponseAndGroupDelay( Kernel, KernelLen, 0.0,
			tmp, tmp, *DCGroupDelay );
	}
}

} // namespace r8b

#endif // VOX_CDSPREALFFT_INCLUDED