#include <cuda.h>
#include <cuda_runtime.h>
#include <cufft.h>

#include "./main.h"

/*
 * Kernel: divide one double-precision real element by the scalar s, in place.
 *
 * Each thread handles the element at its flat 1-D global thread index.
 * There is no bounds check, so the launch must not span more threads
 * than `data` has elements.
 */
static __global__ void vectorScalarDivDouble(cufftDoubleReal *data, double s)
{
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;

    data[tid] = data[tid] / s;
}

/*
 * Kernel: divide one single-precision real element by the scalar s, in place.
 *
 * Each thread handles the element at its flat 1-D global thread index.
 * There is no bounds check, so the launch must not span more threads
 * than `dataf` has elements.
 */
static __global__ void vectorScalarDiv(cufftReal *dataf, float s)
{
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;

    dataf[tid] = dataf[tid] / s;
}

/*
 * Kernel: divide one double-precision complex element by the real scalar s,
 * in place (real and imaginary parts are each divided by s).
 *
 * Each thread handles the element at its flat 1-D global thread index.
 * There is no bounds check, so the launch must not span more threads
 * than `cplx_data` has elements.
 */
static __global__ void cplxVectorScalarDivDouble(cufftDoubleComplex *cplx_data, double s)
{
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    const cufftDoubleComplex z = cplx_data[tid];

    cplx_data[tid] = make_cuDoubleComplex(cuCreal(z) / s, cuCimag(z) / s);
}

/*
 * Kernel: divide one single-precision complex element by the real scalar s,
 * in place (real and imaginary parts are each divided by s).
 *
 * Each thread handles the element at its flat 1-D global thread index.
 * There is no bounds check, so the launch must not span more threads
 * than `cplx_dataf` has elements.
 */
static __global__ void cplxVectorScalarDiv(cufftComplex *cplx_dataf, float s)
{
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    const cufftComplex z = cplx_dataf[tid];

    cplx_dataf[tid] = make_cuFloatComplex(cuCrealf(z) / s, cuCimagf(z) / s);
}

/*
 * Launches vectorScalarDivDouble on fftrec->dev_data with fftrec->gridsize
 * blocks of fftrec->blocksize threads (one element divided by s per thread),
 * then blocks the host until the device has finished.
 *
 * Launch and execution errors are not checked here.
 */
void spVectorScalarDivDoubleCUFFT(spFFTRecCUFFT fftrec, double s)
{
    vectorScalarDivDouble<<<fftrec->gridsize, fftrec->blocksize>>>(fftrec->dev_data, s);
    /* cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its replacement. */
    cudaDeviceSynchronize();
}

/*
 * Launches vectorScalarDiv on fftrec->dev_dataf with fftrec->gridsize
 * blocks of fftrec->blocksize threads (one element divided by s per thread),
 * then blocks the host until the device has finished.
 *
 * Launch and execution errors are not checked here.
 */
void spVectorScalarDivCUFFT(spFFTRecCUFFT fftrec, float s)
{
    vectorScalarDiv<<<fftrec->gridsize, fftrec->blocksize>>>(fftrec->dev_dataf, s);
    /* cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its replacement. */
    cudaDeviceSynchronize();
}

/*
 * Launches cplxVectorScalarDivDouble on fftrec->dev_cplx_data with
 * fftrec->gridsize blocks of fftrec->blocksize threads (one complex element
 * divided by s per thread), then blocks the host until the device has finished.
 *
 * Launch and execution errors are not checked here.
 */
void spCplxVectorScalarDivDoubleCUFFT(spFFTRecCUFFT fftrec, double s)
{
    cplxVectorScalarDivDouble<<<fftrec->gridsize, fftrec->blocksize>>>(fftrec->dev_cplx_data, s);
    /* cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its replacement. */
    cudaDeviceSynchronize();
}

/*
 * Launches cplxVectorScalarDiv on fftrec->dev_cplx_dataf with
 * fftrec->gridsize blocks of fftrec->blocksize threads (one complex element
 * divided by s per thread), then blocks the host until the device has finished.
 *
 * Launch and execution errors are not checked here.
 */
void spCplxVectorScalarDivCUFFT(spFFTRecCUFFT fftrec, float s)
{
    cplxVectorScalarDiv<<<fftrec->gridsize, fftrec->blocksize>>>(fftrec->dev_cplx_dataf, s);
    /* cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its replacement. */
    cudaDeviceSynchronize();
}

/* Raises a (double) power value to `exponent`, with fast paths for the
 * exponents 1.0 (identity) and 0.5 (square root). */
static __device__ __forceinline__ cufftDoubleReal rfftPowerExponentDouble(cufftDoubleReal pw, double exponent)
{
    if (exponent == 1.0) {
        return pw;
    }
    if (exponent == 0.5) {
        return sqrt(pw);
    }
    return pow(pw, exponent);
}

/*
 * Kernel: convert half-spectrum complex bins into a full-length, mirrored
 * power spectrum (double precision).
 *
 * Layout implied by the indexing:
 *   - cplx_data holds fftl/2 + 1 complex bins per batch,
 *   - data holds fftl real values per batch.
 * One thread handles one bin in the range 0 .. fftl/2 - 1 of one batch; the
 * thread owning bin fftl/2 - 1 additionally writes the Nyquist bin (fftl/2).
 * Every bin k with 0 < k < fftl/2 is also copied to its mirror position
 * fftl - k.  For bin 0 and the Nyquist bin only the real part is used.
 *
 * The bin offset is obtained by masking with (fftl/2 - 1), which equals
 * `index % (fftl/2)` only when fftl/2 is a power of two.  There is no bounds
 * check, so the launch must supply exactly (fftl/2) threads per batch.
 *
 * The mirrored store is an uncoalesced memory access, but the overall
 * performance is not much worse than a coalesced shared-memory version
 * (because the device-to-device cudaMemcpy that version needs is slow).
 */
static __global__ void rfftToPowerDouble(long fftl, cufftDoubleComplex *cplx_data, cufftDoubleReal *data, double exponent)
{
    const long fftl2 = fftl / 2;
    const long hfftl = fftl2 + 1;

    const unsigned int index = blockDim.x * blockIdx.x + threadIdx.x;
    const long offset = index & (fftl2 - 1);    /* == index % fftl2 for power-of-two fftl2 */
    const long batch = index / fftl2;
    const long index_cplx = batch * hfftl + offset;
    const long index_data = batch * fftl + offset;

    const cufftDoubleComplex bin = cplx_data[index_cplx];
    const cufftDoubleReal re = cuCreal(bin);
    const cufftDoubleReal im = cuCimag(bin);

    /* Bin 0 uses the real part only. */
    const cufftDoubleReal pw = (offset == 0) ? (re * re) : (re * re + im * im);
    const cufftDoubleReal value = rfftPowerExponentDouble(pw, exponent);

    data[index_data] = value;

    /* Mirror every non-DC bin, including bin fftl2 - 1 (whose mirror at
     * fftl2 + 1 was previously left unwritten). */
    if (offset > 0) {
        data[(batch + 1) * fftl - offset] = value;    /* uncoalesced */
    }

    /* The thread owning the last regular bin also emits the Nyquist bin. */
    if (offset == fftl2 - 1) {
        const cufftDoubleReal nyq = cuCreal(cplx_data[index_cplx + 1]);

        data[index_data + 1] = rfftPowerExponentDouble(nyq * nyq, exponent);
    }
}

/*
 * Launches rfftToPowerDouble with fftrec->gridsize / 2 blocks of
 * fftrec->blocksize threads, reading fftrec->dev_cplx_data and writing
 * fftrec->dev_data, then blocks the host until the device has finished.
 *
 * NOTE(review): the halved grid presumably matches the kernel's
 * one-thread-per-lower-half-bin scheme -- confirm against where gridsize is set.
 * Launch and execution errors are not checked here.
 */
void spRealFFTToPowerDoubleCUFFT(spFFTRecCUFFT fftrec, double exponent)
{
    rfftToPowerDouble<<<fftrec->gridsize / 2, fftrec->blocksize>>>(fftrec->fftl,
                                                                   fftrec->dev_cplx_data,
                                                                   fftrec->dev_data,
                                                                   exponent);
    /* cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its replacement. */
    cudaDeviceSynchronize();
}

/* Raises a (float) power value to `exponent`, with fast paths for the
 * exponents 1.0f (identity) and 0.5f (square root). */
static __device__ __forceinline__ cufftReal rfftPowerExponentFloat(cufftReal pw, float exponent)
{
    if (exponent == 1.0f) {
        return pw;
    }
    if (exponent == 0.5f) {
        return sqrtf(pw);
    }
    return powf(pw, exponent);
}

/*
 * Kernel: convert half-spectrum complex bins into a full-length, mirrored
 * power spectrum (single precision).
 *
 * Layout implied by the indexing:
 *   - cplx_dataf holds fftl/2 + 1 complex bins per batch,
 *   - dataf holds fftl real values per batch.
 * One thread handles one bin in the range 0 .. fftl/2 - 1 of one batch; the
 * thread owning bin fftl/2 - 1 additionally writes the Nyquist bin (fftl/2).
 * Every bin k with 0 < k < fftl/2 is also copied to its mirror position
 * fftl - k.  For bin 0 and the Nyquist bin only the real part is used.
 *
 * The bin offset is obtained by masking with (fftl/2 - 1), which equals
 * `index % (fftl/2)` only when fftl/2 is a power of two.  There is no bounds
 * check, so the launch must supply exactly (fftl/2) threads per batch.
 * The mirrored store is an uncoalesced memory access.
 */
static __global__ void rfftToPower(long fftl, cufftComplex *cplx_dataf, cufftReal *dataf, float exponent)
{
    const long fftl2 = fftl / 2;
    const long hfftl = fftl2 + 1;

    const unsigned int index = blockDim.x * blockIdx.x + threadIdx.x;
    const long offset = index & (fftl2 - 1);    /* == index % fftl2 for power-of-two fftl2 */
    const long batch = index / fftl2;
    const long index_cplx = batch * hfftl + offset;
    const long index_data = batch * fftl + offset;

    const cufftComplex bin = cplx_dataf[index_cplx];
    const cufftReal re = cuCrealf(bin);
    const cufftReal im = cuCimagf(bin);

    /* Bin 0 uses the real part only. */
    const cufftReal pw = (offset == 0) ? (re * re) : (re * re + im * im);
    const cufftReal value = rfftPowerExponentFloat(pw, exponent);

    dataf[index_data] = value;

    /* Mirror every non-DC bin, including bin fftl2 - 1 (whose mirror at
     * fftl2 + 1 was previously left unwritten). */
    if (offset > 0) {
        dataf[(batch + 1) * fftl - offset] = value;    /* uncoalesced */
    }

    /* The thread owning the last regular bin also emits the Nyquist bin. */
    if (offset == fftl2 - 1) {
        const cufftReal nyq = cuCrealf(cplx_dataf[index_cplx + 1]);

        dataf[index_data + 1] = rfftPowerExponentFloat(nyq * nyq, exponent);
    }
}

/*
 * Launches rfftToPower with fftrec->gridsize / 2 blocks of
 * fftrec->blocksize threads, reading fftrec->dev_cplx_dataf and writing
 * fftrec->dev_dataf, then blocks the host until the device has finished.
 *
 * NOTE(review): the halved grid presumably matches the kernel's
 * one-thread-per-lower-half-bin scheme -- confirm against where gridsize is set.
 * Launch and execution errors are not checked here.
 */
void spRealFFTToPowerCUFFT(spFFTRecCUFFT fftrec, float exponent)
{
    rfftToPower<<<fftrec->gridsize / 2, fftrec->blocksize>>>(fftrec->fftl,
                                                             fftrec->dev_cplx_dataf,
                                                             fftrec->dev_dataf,
                                                             exponent);
    /* cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its replacement. */
    cudaDeviceSynchronize();
}
