/*
 *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


/*
 * This file contains the function WebRtcSpl_ComplexFFT().
 * The description header can be found in signal_processing_library.h
 *
 */

#include "webrtc/common_audio/signal_processing/complex_fft_tables.h"
#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"

#define CFFTSFT 14
#define CFFTRND 1
#define CFFTRND2 16384

#define CIFFTSFT 14
#define CIFFTRND 1


int WebRtcSpl_ComplexFFT(int16_t frfi[], int stages, int mode)
{
    int i, j, l, k, istep, n, m;
    int16_t wr, wi;
    int32_t tr32, ti32, qr32, qi32;

    /* The 1024-value is a constant given from the size of kSinTable1024[],
     * and should not be changed depending on the input parameter 'stages'
     */
    n = 1 << stages;
    if (n > 1024)
        return -1;

    l = 1;
    k = 10 - 1; /* Constant for given kSinTable1024[]. Do not change
         depending on the input parameter 'stages' */

    if (mode == 0)
    {
        // mode==0: Low-complexity and Low-accuracy mode
        while (l < n)
        {
            istep = l << 1;

            for (m = 0; m < l; ++m)
            {
                j = m << k;

                /* The 256-value is a constant given as 1/4 of the size of
                 * kSinTable1024[], and should not be changed depending on the input
                 * parameter 'stages'. It will result in 0 <= j < N_SINE_WAVE/2
                 */
                wr = kSinTable1024[j + 256];
                wi = -kSinTable1024[j];

                for (i = m; i < n; i += istep)
                {
                    j = i + l;

                    tr32 = (wr * frfi[2 * j] - wi * frfi[2 * j + 1]) >> 15;

                    ti32 = (wr * frfi[2 * j + 1] + wi * frfi[2 * j]) >> 15;

                    qr32 = (int32_t)frfi[2 * i];
                    qi32 = (int32_t)frfi[2 * i + 1];
                    frfi[2 * j] = (int16_t)((qr32 - tr32) >> 1);
                    frfi[2 * j + 1] = (int16_t)((qi32 - ti32) >> 1);
                    frfi[2 * i] = (int16_t)((qr32 + tr32) >> 1);
                    frfi[2 * i + 1] = (int16_t)((qi32 + ti32) >> 1);
                }
            }

            --k;
            l = istep;

        }

    } else
    {
        // mode==1: High-complexity and High-accuracy mode
        while (l < n)
        {
            istep = l << 1;

            for (m = 0; m < l; ++m)
            {
                j = m << k;

                /* The 256-value is a constant given as 1/4 of the size of
                 * kSinTable1024[], and should not be changed depending on the input
                 * parameter 'stages'. It will result in 0 <= j < N_SINE_WAVE/2
                 */
                wr = kSinTable1024[j + 256];
                wi = -kSinTable1024[j];

#ifdef WEBRTC_ARCH_ARM_V7
                int32_t wri = 0;
                __asm __volatile("pkhbt %0, %1, %2, lsl #16" : "=r"(wri) :
                    "r"((int32_t)wr), "r"((int32_t)wi));
#endif

                for (i = m; i < n; i += istep)
                {
                    j = i + l;

#ifdef WEBRTC_ARCH_ARM_V7
                    register int32_t frfi_r;
                    __asm __volatile(
                        "pkhbt %[frfi_r], %[frfi_even], %[frfi_odd],"
                        " lsl #16\n\t"
                        "smlsd %[tr32], %[wri], %[frfi_r], %[cfftrnd]\n\t"
                        "smladx %[ti32], %[wri], %[frfi_r], %[cfftrnd]\n\t"
                        :[frfi_r]"=&r"(frfi_r),
                         [tr32]"=&r"(tr32),
                         [ti32]"=r"(ti32)
                        :[frfi_even]"r"((int32_t)frfi[2*j]),
                         [frfi_odd]"r"((int32_t)frfi[2*j +1]),
                         [wri]"r"(wri),
                         [cfftrnd]"r"(CFFTRND));
#else
                    tr32 = wr * frfi[2 * j] - wi * frfi[2 * j + 1] + CFFTRND;

                    ti32 = wr * frfi[2 * j + 1] + wi * frfi[2 * j] + CFFTRND;
#endif

                    tr32 >>= 15 - CFFTSFT;
                    ti32 >>= 15 - CFFTSFT;

                    qr32 = ((int32_t)frfi[2 * i]) << CFFTSFT;
                    qi32 = ((int32_t)frfi[2 * i + 1]) << CFFTSFT;

                    frfi[2 * j] = (int16_t)(
                        (qr32 - tr32 + CFFTRND2) >> (1 + CFFTSFT));
                    frfi[2 * j + 1] = (int16_t)(
                        (qi32 - ti32 + CFFTRND2) >> (1 + CFFTSFT));
                    frfi[2 * i] = (int16_t)(
                        (qr32 + tr32 + CFFTRND2) >> (1 + CFFTSFT));
                    frfi[2 * i + 1] = (int16_t)(
                        (qi32 + ti32 + CFFTRND2) >> (1 + CFFTSFT));
                }
            }

            --k;
            l = istep;
        }
    }
    return 0;
}

int WebRtcSpl_ComplexIFFT(int16_t frfi[], int stages, int mode)
{
    size_t i, j, l, istep, n, m;
    int k, scale, shift;
    int16_t wr, wi;
    int32_t tr32, ti32, qr32, qi32;
    int32_t tmp32, round2;

    /* The 1024-value is a constant given from the size of kSinTable1024[],
     * and should not be changed depending on the input parameter 'stages'
     */
    n = 1 << stages;
    if (n > 1024)
        return -1;

    scale = 0;

    l = 1;
    k = 10 - 1; /* Constant for given kSinTable1024[]. Do not change
         depending on the input parameter 'stages' */

    while (l < n)
    {
        // variable scaling, depending upon data
        shift = 0;
        round2 = 8192;

        tmp32 = WebRtcSpl_MaxAbsValueW16(frfi, 2 * n);
        if (tmp32 > 13573)
        {
            shift++;
            scale++;
            round2 <<= 1;
        }
        if (tmp32 > 27146)
        {
            shift++;
            scale++;
            round2 <<= 1;
        }

        istep = l << 1;

        if (mode == 0)
        {
            // mode==0: Low-complexity and Low-accuracy mode
            for (m = 0; m < l; ++m)
            {
                j = m << k;

                /* The 256-value is a constant given as 1/4 of the size of
                 * kSinTable1024[], and should not be changed depending on the input
                 * parameter 'stages'. It will result in 0 <= j < N_SINE_WAVE/2
                 */
                wr = kSinTable1024[j + 256];
                wi = kSinTable1024[j];

                for (i = m; i < n; i += istep)
                {
                    j = i + l;

                    tr32 = (wr * frfi[2 * j] - wi * frfi[2 * j + 1]) >> 15;

                    ti32 = (wr * frfi[2 * j + 1] + wi * frfi[2 * j]) >> 15;

                    qr32 = (int32_t)frfi[2 * i];
                    qi32 = (int32_t)frfi[2 * i + 1];
                    frfi[2 * j] = (int16_t)((qr32 - tr32) >> shift);
                    frfi[2 * j + 1] = (int16_t)((qi32 - ti32) >> shift);
                    frfi[2 * i] = (int16_t)((qr32 + tr32) >> shift);
                    frfi[2 * i + 1] = (int16_t)((qi32 + ti32) >> shift);
                }
            }
        } else
        {
            // mode==1: High-complexity and High-accuracy mode

            for (m = 0; m < l; ++m)
            {
                j = m << k;

                /* The 256-value is a constant given as 1/4 of the size of
                 * kSinTable1024[], and should not be changed depending on the input
                 * parameter 'stages'. It will result in 0 <= j < N_SINE_WAVE/2
                 */
                wr = kSinTable1024[j + 256];
                wi = kSinTable1024[j];

#ifdef WEBRTC_ARCH_ARM_V7
                int32_t wri = 0;
                __asm __volatile("pkhbt %0, %1, %2, lsl #16" : "=r"(wri) :
                    "r"((int32_t)wr), "r"((int32_t)wi));
#endif

                for (i = m; i < n; i += istep)
                {
                    j = i + l;

#ifdef WEBRTC_ARCH_ARM_V7
                    register int32_t frfi_r;
                    __asm __volatile(
                      "pkhbt %[frfi_r], %[frfi_even], %[frfi_odd], lsl #16\n\t"
                      "smlsd %[tr32], %[wri], %[frfi_r], %[cifftrnd]\n\t"
                      "smladx %[ti32], %[wri], %[frfi_r], %[cifftrnd]\n\t"
                      :[frfi_r]"=&r"(frfi_r),
                       [tr32]"=&r"(tr32),
                       [ti32]"=r"(ti32)
                      :[frfi_even]"r"((int32_t)frfi[2*j]),
                       [frfi_odd]"r"((int32_t)frfi[2*j +1]),
                       [wri]"r"(wri),
                       [cifftrnd]"r"(CIFFTRND)
                    );
#else

                    tr32 = wr * frfi[2 * j] - wi * frfi[2 * j + 1] + CIFFTRND;

                    ti32 = wr * frfi[2 * j + 1] + wi * frfi[2 * j] + CIFFTRND;
#endif
                    tr32 >>= 15 - CIFFTSFT;
                    ti32 >>= 15 - CIFFTSFT;

                    qr32 = ((int32_t)frfi[2 * i]) << CIFFTSFT;
                    qi32 = ((int32_t)frfi[2 * i + 1]) << CIFFTSFT;

                    frfi[2 * j] = (int16_t)(
                        (qr32 - tr32 + round2) >> (shift + CIFFTSFT));
                    frfi[2 * j + 1] = (int16_t)(
                        (qi32 - ti32 + round2) >> (shift + CIFFTSFT));
                    frfi[2 * i] = (int16_t)(
                        (qr32 + tr32 + round2) >> (shift + CIFFTSFT));
                    frfi[2 * i + 1] = (int16_t)(
                        (qi32 + ti32 + round2) >> (shift + CIFFTSFT));
                }
            }

        }
        --k;
        l = istep;
    }
    return scale;
}