#include <sp/spDefs.h>
#include <sp/spBase.h>
#include <sp/spMemory.h>

#include <sp/fftplugin.h>
#include <sp/spPluginMain.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include <cufft.h>

#include "./main.h"

/*
 * Module-level initialisation hook.
 *
 * Queries the CUDA driver and runtime versions.  When both queries succeed
 * the versions are written to the debug log and SP_TRUE is returned; when
 * either query fails a warning is issued and SP_FALSE is returned.
 * The runtime version is only queried if the driver query succeeded.
 *
 * lang: not used by this function.
 */
static spBool spInitCUFFT(const char *lang)
{
    int driver_ver;
    int runtime_ver;
    spBool ok;

    ok = (cudaDriverGetVersion(&driver_ver) == cudaSuccess
          && cudaRuntimeGetVersion(&runtime_ver) == cudaSuccess) ? SP_TRUE : SP_FALSE;

    if (ok == SP_TRUE) {
        spDebug(10, "spInitCUFFT", "driverVersion = %d, runtimeVersion = %d\n", driver_ver, runtime_ver);
    } else {
        spWarning("spInitCUFFT: get version failed\n");
    }

    return ok;
}

/*
 * Module-level teardown hook.  It performs no work and always reports
 * success.
 */
static spBool spFreeCUFFT(void)
{
    const spBool status = SP_TRUE;

    return status;
}

/*
 * Appends a new, zero-initialised list node carrying `id` to the end of
 * pinstance->thread_id_list, then requests host-memory mapping for the
 * device with cudaSetDeviceFlags(cudaDeviceMapHost).
 *
 * The result of cudaSetDeviceFlags() is not inspected.
 * No check for an existing node with the same id is made here; callers
 * that need uniqueness look the id up first.
 */
static void spCreateThreadIdListCUFFT(spPluginInstanceCUFFT pinstance, spThreadId id)
{
    spThreadIdListCUFFT node;
    spThreadIdListCUFFT tail;

    spDebug(10, "spCreateThreadIdListCUFFT", "thread ID: %ld\n", (long)id);

    /* build the node first; it is linked in only once fully initialised */
    node = xalloc(1, struct _spThreadIdListCUFFT);
    memset(node, 0, sizeof(struct _spThreadIdListCUFFT));
    node->id = id;

    tail = pinstance->thread_id_list;
    if (tail == NULL) {
        /* empty list: the new node becomes the head */
        pinstance->thread_id_list = node;
    } else {
        /* walk to the last node and hang the new one behind it */
        while (tail->next != NULL) {
            tail = tail->next;
        }
        tail->next = node;
    }

    cudaSetDeviceFlags(cudaDeviceMapHost);
}

/*
 * Linear search of a thread-id list.
 *
 * Returns SP_TRUE when some node's id compares equal to `id` according to
 * spEqThreadId(), SP_FALSE otherwise (including for an empty list).
 */
static spBool spFindThreadIdListCUFFT(spThreadIdListCUFFT thread_id_list, spThreadId id)
{
    spThreadIdListCUFFT node;

    for (node = thread_id_list; node != NULL; node = node->next) {
        if (spEqThreadId(node->id, id) == SP_TRUE) {
            return SP_TRUE;
        }
    }

    return SP_FALSE;
}

/*
 * Makes sure the calling thread is present in the instance's thread-id
 * list.
 *
 * Returns SP_TRUE when a node had to be added for the current thread
 * (which also triggers the device-flag setup done by
 * spCreateThreadIdListCUFFT()), SP_FALSE when the thread was already known.
 */
static spBool spUpdateThreadIdListCUFFT(spPluginInstanceCUFFT pinstance)
{
    spThreadId self;

    self = spGetCurrentThreadId();
    spDebug(10, "spUpdateThreadIdListCUFFT", "current thread ID: %ld\n", (long)self);

    /* already registered: nothing to do */
    if (spFindThreadIdListCUFFT(pinstance->thread_id_list, self) != SP_FALSE) {
        return SP_FALSE;
    }

    spCreateThreadIdListCUFFT(pinstance, self);
    spDebug(10, "spUpdateThreadIdListCUFFT", "updated\n");

    return SP_TRUE;
}

/*
 * Creates a plugin instance for CUDA device 0.
 *
 * The device properties are queried first.  NULL is returned when the
 * query itself fails or when the device cannot map host memory, because
 * every FFT record of this plugin is built on mapped host allocations.
 *
 * On success the returned instance holds
 *   version        = major * 10 + minor of the compute capability,
 *   support_double = SP_TRUE when version >= 13 (compute capability 1.3),
 *   thread_id_list = a single node for the calling thread.
 *
 * lang: not used by this function.
 * Returns the instance as void *, or NULL on failure.
 */
static void *spInitInstanceCUFFT(const char *lang)
{
    spPluginInstanceCUFFT pinstance;
    struct cudaDeviceProp deviceProp;
    cudaError_t err;

    /* deviceProp is only meaningful when the query succeeds */
    if ((err = cudaGetDeviceProperties(&deviceProp, 0)) != cudaSuccess) {
        spDebug(10, "spInitInstanceCUFFT", "cudaGetDeviceProperties failed: %d\n", err);
        return NULL;
    }

    /* totalGlobalMem and sharedMemPerBlock are size_t; cast to match %ld */
    spDebug(50, "spInitInstanceCUFFT",
            "name = %s, totalGlobalMem = %ld, sharedMemPerBlock = %ld, regsPerBlock = %d, warpSize = %d, canMapHostMemory = %d\n",
            deviceProp.name, (long)deviceProp.totalGlobalMem, (long)deviceProp.sharedMemPerBlock,
            deviceProp.regsPerBlock, deviceProp.warpSize, deviceProp.canMapHostMemory);

    if (!deviceProp.canMapHostMemory) {
        return NULL;
    }

    pinstance = xalloc(1, struct _spPluginInstanceCUFFT);
    memset(pinstance, 0, sizeof(struct _spPluginInstanceCUFFT));

    pinstance->version = deviceProp.major*10 + deviceProp.minor;
    spDebug(50, "spInitInstanceCUFFT",
            "Compute capability %d.%d, version = %d\n", deviceProp.major, deviceProp.minor, pinstance->version);
    pinstance->support_double = (pinstance->version >= 13) ? SP_TRUE : SP_FALSE;

    /* registers the creating thread and sets cudaDeviceMapHost */
    spCreateThreadIdListCUFFT(pinstance, spGetCurrentThreadId());

    return (void *)pinstance;
}

/*
 * Destroys a plugin instance created by spInitInstanceCUFFT(): every node
 * of its thread-id list is released, then the instance itself.
 *
 * instance must be a valid instance pointer; it is dereferenced without a
 * NULL check.  Always returns SP_TRUE.
 */
static spBool spFreeInstanceCUFFT(void *instance)
{
    spPluginInstanceCUFFT pinstance = (spPluginInstanceCUFFT)instance;
    spThreadIdListCUFFT node;
    spThreadIdListCUFFT doomed;

    node = pinstance->thread_id_list;
    while (node != NULL) {
        /* step forward before releasing the node we are standing on */
        doomed = node;
        node = node->next;
        xfree(doomed);
    }

    xfree(pinstance);

    return SP_TRUE;
}

/*
 * Releases everything owned by an FFT record: the host buffers obtained
 * with cudaHostAlloc(), the cuFFT plans whose *_created flag is SP_TRUE,
 * and finally the record itself.
 *
 * Buffers that are NULL and plans that were never created are skipped, so
 * this is safe to call on a partially initialised record (it is used that
 * way by the error paths of spInitFFTCUFFT()).
 *
 * instance: not used.  ifftrec: the record to destroy; must not be NULL.
 * Always returns SP_TRUE.
 */
static spBool spFreeFFTCUFFT(void *instance, void *ifftrec)
{
    spFFTRecCUFFT rec = (spFFTRecCUFFT)ifftrec;

    /* double-precision host buffers */
    if (rec->data != NULL) cudaFreeHost(rec->data);
    if (rec->cplx_data != NULL) cudaFreeHost(rec->cplx_data);

    /* single-precision host buffers */
    if (rec->dataf != NULL) cudaFreeHost(rec->dataf);
    if (rec->cplx_dataf != NULL) cudaFreeHost(rec->cplx_dataf);

    /* plans, only those that were actually created */
    if (rec->plan_c2c_created == SP_TRUE) cufftDestroy(rec->plan_c2c);
    if (rec->plan_c2r_created == SP_TRUE) cufftDestroy(rec->plan_c2r);
    if (rec->plan_r2c_created == SP_TRUE) cufftDestroy(rec->plan_r2c);

    xfree(rec);

    return SP_TRUE;
}

/*
 * Reports whether the plugin handles the requested precision.
 *
 * Only SP_FFT_DOUBLE_PRECISION and SP_FFT_FLOAT_PRECISION are accepted.
 * When accepted and `speed` is non-NULL, *speed is set to
 * SP_FFT_SPEED_VERY_FAST for float and SP_FFT_SPEED_FASTER for double.
 * For any other precision SP_FALSE is returned and *speed is untouched.
 *
 * NOTE(review): `instance` is not consulted, so the instance's
 * support_double flag has no influence on the answer -- confirm intended.
 */
static spBool spIsPrecisionSupportedCUFFT(void *instance, spFFTPrecision precision, spFFTSpeed *speed)
{
    if (precision != SP_FFT_DOUBLE_PRECISION && precision != SP_FFT_FLOAT_PRECISION) {
        return SP_FALSE;
    }

    if (speed != NULL) {
        *speed = (precision == SP_FFT_FLOAT_PRECISION) ? SP_FFT_SPEED_VERY_FAST : SP_FFT_SPEED_FASTER;
    }

    return SP_TRUE;
}

/*
 * Creates the three 1-D cuFFT plans (complex-to-complex, real-to-complex,
 * complex-to-real) of one precision for `fftrec`.
 *
 * Each *_created flag is raised as soon as its plan exists, so that
 * spFreeFFTCUFFT() destroys exactly the plans that were made, even after a
 * partial failure.  The *_name strings are only used in debug output.
 * Returns SP_TRUE when all three plans were created.
 */
static spBool spCreatePlansCUFFT(spFFTRecCUFFT fftrec, long batch,
                                 cufftType c2c_type, const char *c2c_name,
                                 cufftType r2c_type, const char *r2c_name,
                                 cufftType c2r_type, const char *c2r_name)
{
    cufftResult res;

    if ((res = cufftPlan1d(&fftrec->plan_c2c, fftrec->fftl, c2c_type, batch)) != CUFFT_SUCCESS) {
        spDebug(10, "spInitFFTCUFFT", "cufftPlan1d of %s failed: %d\n", c2c_name, res);
        return SP_FALSE;
    }
    fftrec->plan_c2c_created = SP_TRUE;

    if ((res = cufftPlan1d(&fftrec->plan_r2c, fftrec->fftl, r2c_type, batch)) != CUFFT_SUCCESS) {
        spDebug(10, "spInitFFTCUFFT", "cufftPlan1d of %s failed: %d\n", r2c_name, res);
        return SP_FALSE;
    }
    fftrec->plan_r2c_created = SP_TRUE;

    if ((res = cufftPlan1d(&fftrec->plan_c2r, fftrec->fftl, c2r_type, batch)) != CUFFT_SUCCESS) {
        spDebug(10, "spInitFFTCUFFT", "cufftPlan1d of %s failed: %d\n", c2r_name, res);
        return SP_FALSE;
    }
    fftrec->plan_c2r_created = SP_TRUE;

    return SP_TRUE;
}

/*
 * Builds an FFT record for `batch` transforms of length POW2(order).
 *
 * For precision >= SP_FFT_DOUBLE_PRECISION the record gets Z2Z/D2Z/Z2D
 * plans plus the double buffers (cplx_data, data); otherwise it gets
 * C2C/R2C/C2R plans plus the float buffers (cplx_dataf, dataf).  Every
 * buffer is a mapped host allocation of fftl * batch elements whose device
 * alias is stored in the matching dev_* field.  The exec functions decide
 * which precision to run by testing cplx_data against NULL.
 *
 * The calling thread is registered with the instance before any CUDA
 * resource is created.
 *
 * Returns the record as void *, or NULL after releasing everything that
 * had been created when any plan or allocation step fails.
 */
static void *spInitFFTCUFFT(void *instance, long order, long batch, spFFTPrecision precision)
{
    spFFTRecCUFFT fftrec;
    cudaError_t err;
    spBool ok;
    spPluginInstanceCUFFT pinstance = (spPluginInstanceCUFFT)instance;

    fftrec = xalloc(1, struct _spFFTRecCUFFT);
    memset(fftrec, 0, sizeof(struct _spFFTRecCUFFT));
    fftrec->fftl = POW2(order);
    fftrec->length = fftrec->fftl * batch;
    fftrec->batch = batch;
    spDebug(50, "spInitFFTCUFFT", "order = %ld, batch = %ld, fftl = %ld, length = %ld\n",
            order, batch, fftrec->fftl, fftrec->length);

    spUpdateThreadIdListCUFFT(pinstance);

    if (precision >= SP_FFT_DOUBLE_PRECISION) {
        ok = spCreatePlansCUFFT(fftrec, batch,
                                CUFFT_Z2Z, "CUFFT_Z2Z",
                                CUFFT_D2Z, "CUFFT_D2Z",
                                CUFFT_Z2D, "CUFFT_Z2D");

        /* complex buffer and its device alias */
        if (ok == SP_TRUE
            && (err = cudaHostAlloc(&fftrec->cplx_data, sizeof(cufftDoubleComplex) * fftrec->length,
                                    cudaHostAllocMapped)) != cudaSuccess) {
            spDebug(10, "spInitFFTCUFFT", "cudaHostAlloc failed: %d\n", err);
            ok = SP_FALSE;
        }
        if (ok == SP_TRUE
            && (err = cudaHostGetDevicePointer(&fftrec->dev_cplx_data, fftrec->cplx_data, 0)) != cudaSuccess) {
            spDebug(10, "spInitFFTCUFFT", "cudaHostGetDevicePointer failed: %d\n", err);
            ok = SP_FALSE;
        }

        /* real buffer and its device alias */
        if (ok == SP_TRUE
            && (err = cudaHostAlloc(&fftrec->data, sizeof(cufftDoubleReal) * fftrec->length,
                                    cudaHostAllocMapped)) != cudaSuccess) {
            spDebug(10, "spInitFFTCUFFT", "cudaHostAlloc failed: %d\n", err);
            ok = SP_FALSE;
        }
        if (ok == SP_TRUE
            && (err = cudaHostGetDevicePointer(&fftrec->dev_data, fftrec->data, 0)) != cudaSuccess) {
            spDebug(10, "spInitFFTCUFFT", "cudaHostGetDevicePointer failed: %d\n", err);
            ok = SP_FALSE;
        }
    } else {
        ok = spCreatePlansCUFFT(fftrec, batch,
                                CUFFT_C2C, "CUFFT_C2C",
                                CUFFT_R2C, "CUFFT_R2C",
                                CUFFT_C2R, "CUFFT_C2R");

        /* complex buffer and its device alias */
        if (ok == SP_TRUE
            && (err = cudaHostAlloc(&fftrec->cplx_dataf, sizeof(cufftComplex) * fftrec->length,
                                    cudaHostAllocMapped)) != cudaSuccess) {
            spDebug(10, "spInitFFTCUFFT", "cudaHostAlloc failed: %d\n", err);
            ok = SP_FALSE;
        }
        if (ok == SP_TRUE
            && (err = cudaHostGetDevicePointer(&fftrec->dev_cplx_dataf, fftrec->cplx_dataf, 0)) != cudaSuccess) {
            spDebug(10, "spInitFFTCUFFT", "cudaHostGetDevicePointer failed: %d\n", err);
            ok = SP_FALSE;
        }

        /* real buffer and its device alias */
        if (ok == SP_TRUE
            && (err = cudaHostAlloc(&fftrec->dataf, sizeof(cufftReal) * fftrec->length,
                                    cudaHostAllocMapped)) != cudaSuccess) {
            spDebug(10, "spInitFFTCUFFT", "cudaHostAlloc failed: %d\n", err);
            ok = SP_FALSE;
        }
        if (ok == SP_TRUE
            && (err = cudaHostGetDevicePointer(&fftrec->dev_dataf, fftrec->dataf, 0)) != cudaSuccess) {
            spDebug(10, "spInitFFTCUFFT", "cudaHostGetDevicePointer failed: %d\n", err);
            ok = SP_FALSE;
        }
    }

    /* single cleanup point: spFreeFFTCUFFT copes with a half-built record */
    if (ok != SP_TRUE) {
        spFreeFFTCUFFT(instance, fftrec);
        return NULL;
    }

#if 0
    spDebug(10, "spInitFFTCUFFT", "plan_c2c = %d, plan_r2c = %d, plan_c2r = %d\n",
            fftrec->plan_c2c, fftrec->plan_r2c, fftrec->plan_c2r);
#endif

    /*
     * Launch geometry stored for later use: blocksize is length / 32
     * clamped to [1, 512]; gridsize is length / blocksize (integer division).
     * NOTE(review): gridsize * blocksize is smaller than length whenever
     * length is not a multiple of blocksize (e.g. fftl = 4, batch = 25
     * gives 33 * 3 = 99 for length 100).  Whether the tail elements are
     * still processed depends on the code that consumes these two fields,
     * which is not in this file -- confirm.
     */
    fftrec->blocksize = MIN(MAX((unsigned int)fftrec->length / 32, 1), 512);
    fftrec->gridsize = (unsigned int)fftrec->length / fftrec->blocksize;
    spDebug(10, "spInitFFTCUFFT", "fftl = %ld, gridsize = %d, blocksize = %d\n",
            fftrec->fftl, fftrec->gridsize, fftrec->blocksize);

    return fftrec;
}

/*
 * In-place complex FFT on float data.
 *
 * real/imag each hold fftrec->length values (all batches back to back).
 * They are interleaved into the record's mapped complex buffer, transformed
 * in place with the c2c plan, and copied back.  A record built for double
 * precision (cplx_data != NULL) runs Z2Z with float<->double conversion on
 * the host side; otherwise C2C is used.
 *
 * inv != 0 selects CUFFT_INVERSE and afterwards hands the buffer to
 * spCplxVectorScalarDiv[Double]CUFFT() together with fftl (helper defined
 * outside this file; presumably it divides every element by fftl).
 *
 * Returns SP_FALSE, leaving real/imag unmodified, when the cuFFT call or
 * the following device synchronisation reports an error; SP_TRUE otherwise.
 *
 * NOTE(review): after the scaling helper the mapped buffer is read by the
 * host without a further synchronisation here; this relies on the helper
 * having completed when it returns -- confirm.
 */
static spBool spExecFFTFCUFFT(void *instance, void *ifftrec, float *real, float *imag, int inv)
{
    long k;
    cufftResult res;
    cudaError_t err;
    spFFTRecCUFFT fftrec = (spFFTRecCUFFT)ifftrec;

    if (fftrec->cplx_data != NULL) {
        for (k = 0; k < fftrec->length; k++) {
            fftrec->cplx_data[k] = make_cuDoubleComplex(real[k], imag[k]);
        }

        if ((res = cufftExecZ2Z(fftrec->plan_c2c, fftrec->dev_cplx_data, fftrec->dev_cplx_data,
                                (inv ? CUFFT_INVERSE : CUFFT_FORWARD))) != CUFFT_SUCCESS) {
            spDebug(10, "spExecFFTFCUFFT", "cufftExecZ2Z failed: %d\n", res);
            return SP_FALSE;
        }
        /* execution errors of the transform surface here, not at launch */
        if ((err = cudaThreadSynchronize()) != cudaSuccess) {
            spDebug(10, "spExecFFTFCUFFT", "cudaThreadSynchronize failed: %d\n", err);
            return SP_FALSE;
        }

        if (inv) {
            spCplxVectorScalarDivDoubleCUFFT(fftrec, (double)fftrec->fftl);
        }

        for (k = 0; k < fftrec->length; k++) {
            real[k] = (float)cuCreal(fftrec->cplx_data[k]);
            imag[k] = (float)cuCimag(fftrec->cplx_data[k]);
        }
    } else {
        for (k = 0; k < fftrec->length; k++) {
            fftrec->cplx_dataf[k] = make_cuFloatComplex(real[k], imag[k]);
        }

        if ((res = cufftExecC2C(fftrec->plan_c2c, fftrec->dev_cplx_dataf, fftrec->dev_cplx_dataf,
                                (inv ? CUFFT_INVERSE : CUFFT_FORWARD))) != CUFFT_SUCCESS) {
            spDebug(10, "spExecFFTFCUFFT", "cufftExecC2C failed: %d\n", res);
            return SP_FALSE;
        }
        /* execution errors of the transform surface here, not at launch */
        if ((err = cudaThreadSynchronize()) != cudaSuccess) {
            spDebug(10, "spExecFFTFCUFFT", "cudaThreadSynchronize failed: %d\n", err);
            return SP_FALSE;
        }

        if (inv) {
            spCplxVectorScalarDivCUFFT(fftrec, (float)fftrec->fftl);
        }

        for (k = 0; k < fftrec->length; k++) {
            real[k] = (float)cuCrealf(fftrec->cplx_dataf[k]);
            imag[k] = (float)cuCimagf(fftrec->cplx_dataf[k]);
        }
    }

    return SP_TRUE;
}

/*
 * In-place complex FFT on double data.
 *
 * real/imag each hold fftrec->length values (all batches back to back).
 * They are interleaved into the record's mapped complex buffer, transformed
 * in place with the c2c plan, and copied back.  A record built for double
 * precision (cplx_data != NULL) runs Z2Z directly; otherwise the data is
 * narrowed to float, run through C2C, and widened again.
 *
 * inv != 0 selects CUFFT_INVERSE and afterwards hands the buffer to
 * spCplxVectorScalarDiv[Double]CUFFT() together with fftl (helper defined
 * outside this file; presumably it divides every element by fftl).
 *
 * Returns SP_FALSE, leaving real/imag unmodified, when the cuFFT call or
 * the following device synchronisation reports an error; SP_TRUE otherwise.
 *
 * NOTE(review): after the scaling helper the mapped buffer is read by the
 * host without a further synchronisation here; this relies on the helper
 * having completed when it returns -- confirm.
 */
static spBool spExecFFTCUFFT(void *instance, void *ifftrec, double *real, double *imag, int inv)
{
    long k;
    cufftResult res;
    cudaError_t err;
    spFFTRecCUFFT fftrec = (spFFTRecCUFFT)ifftrec;

    if (fftrec->cplx_data != NULL) {
        for (k = 0; k < fftrec->length; k++) {
            fftrec->cplx_data[k] = make_cuDoubleComplex(real[k], imag[k]);
        }

        if ((res = cufftExecZ2Z(fftrec->plan_c2c, fftrec->dev_cplx_data, fftrec->dev_cplx_data,
                                (inv ? CUFFT_INVERSE : CUFFT_FORWARD))) != CUFFT_SUCCESS) {
            spDebug(10, "spExecFFTCUFFT", "cufftExecZ2Z failed: %d\n", res);
            return SP_FALSE;
        }
        /* execution errors of the transform surface here, not at launch */
        if ((err = cudaThreadSynchronize()) != cudaSuccess) {
            spDebug(10, "spExecFFTCUFFT", "cudaThreadSynchronize failed: %d\n", err);
            return SP_FALSE;
        }

        if (inv) {
            spCplxVectorScalarDivDoubleCUFFT(fftrec, (double)fftrec->fftl);
        }

        for (k = 0; k < fftrec->length; k++) {
            real[k] = cuCreal(fftrec->cplx_data[k]);
            imag[k] = cuCimag(fftrec->cplx_data[k]);
        }
    } else {
        for (k = 0; k < fftrec->length; k++) {
            fftrec->cplx_dataf[k] = make_cuFloatComplex((float)real[k], (float)imag[k]);
        }

        if ((res = cufftExecC2C(fftrec->plan_c2c, fftrec->dev_cplx_dataf, fftrec->dev_cplx_dataf,
                                (inv ? CUFFT_INVERSE : CUFFT_FORWARD))) != CUFFT_SUCCESS) {
            spDebug(10, "spExecFFTCUFFT", "cufftExecC2C failed: %d\n", res);
            return SP_FALSE;
        }
        /* execution errors of the transform surface here, not at launch */
        if ((err = cudaThreadSynchronize()) != cudaSuccess) {
            spDebug(10, "spExecFFTCUFFT", "cudaThreadSynchronize failed: %d\n", err);
            return SP_FALSE;
        }

        if (inv) {
            spCplxVectorScalarDivCUFFT(fftrec, (float)fftrec->fftl);
        }

        for (k = 0; k < fftrec->length; k++) {
            real[k] = (double)cuCrealf(fftrec->cplx_dataf[k]);
            imag[k] = (double)cuCimagf(fftrec->cplx_dataf[k]);
        }
    }

    return SP_TRUE;
}

/*
 * Shared implementation behind spExecRealFFTFCUFFT() and
 * spExecFFTPowerFCUFFT(): batched real FFT on float data, in place.
 *
 * data holds fftrec->batch frames of fftrec->fftl floats.  On the spectrum
 * side each frame uses a packed layout:
 *   data[0]               real part of bin 0
 *   data[1]               real part of bin fftl/2
 *   data[2k], data[2k+1]  real and imaginary part of bin k, 0 < k < fftl/2
 * The cuFFT side stores fftl/2 + 1 complex bins per frame.
 *
 * inv != 0 : unpack the spectrum, run the c2r plan, pass the result to
 *            spVectorScalarDiv[Double]CUFFT() with fftl, copy it to data.
 * inv == 0 : copy data in and run the r2c plan; then either pack the
 *            spectrum into data (power_flag != SP_TRUE) or call
 *            spRealFFTToPower[Double]CUFFT(fftrec, exponent) and copy the
 *            record's real buffer to data (power_flag == SP_TRUE).
 * The spVectorScalarDiv* and spRealFFTToPower* helpers are defined outside
 * this file.
 *
 * A record built for double precision (cplx_data != NULL) converts between
 * float and double on the host side.
 *
 * The bin fftl/2 slot is only touched when fftl/2 > 0, so a transform
 * length of 1 does not access memory past the frame.
 *
 * Returns SP_FALSE when a cuFFT call, the device synchronisation after it,
 * or a cudaMemcpy() fails; SP_TRUE otherwise.
 *
 * NOTE(review): where a host loop reads the mapped buffer right after one
 * of the helpers, correctness relies on that helper having completed when
 * it returns -- confirm.
 */
static spBool execRealFFTFCUFFT(void *instance, void *ifftrec, float *data, int inv,
                                spBool power_flag, float exponent)
{
    long k, n, hfftl, fftl2;
    long frame, bins;               /* start offsets of frame n in data / in the complex buffer */
    cufftResult res;
    cudaError_t err;
    spFFTRecCUFFT fftrec = (spFFTRecCUFFT)ifftrec;

    fftl2 = fftrec->fftl / 2;
    hfftl = fftl2 + 1;

    if (fftrec->cplx_data != NULL) {
        if (inv) {
            for (n = 0; n < fftrec->batch; n++) {
                frame = n * fftrec->fftl;
                bins = n * hfftl;
                fftrec->cplx_data[bins] = make_cuDoubleComplex(data[frame], 0.0);
                for (k = 1; k < fftl2; k++) {
                    fftrec->cplx_data[bins + k]
                        = make_cuDoubleComplex(data[frame + k * 2], data[frame + k * 2 + 1]);
                }
                if (fftl2 > 0) {
                    fftrec->cplx_data[bins + fftl2] = make_cuDoubleComplex(data[frame + 1], 0.0);
                }
            }

            if ((res = cufftExecZ2D(fftrec->plan_c2r, fftrec->dev_cplx_data, fftrec->dev_data)) != CUFFT_SUCCESS) {
                spDebug(10, "spExecRealFFTFCUFFT", "cufftExecZ2D failed: %d\n", res);
                return SP_FALSE;
            }
            if ((err = cudaThreadSynchronize()) != cudaSuccess) {
                spDebug(10, "spExecRealFFTFCUFFT", "cudaThreadSynchronize failed: %d\n", err);
                return SP_FALSE;
            }

            spVectorScalarDivDoubleCUFFT(fftrec, (double)fftrec->fftl);

            for (k = 0; k < fftrec->length; k++) {
                data[k] = (float)fftrec->data[k];
            }
        } else {
            for (k = 0; k < fftrec->length; k++) {
                fftrec->data[k] = (double)data[k];
            }

            if ((res = cufftExecD2Z(fftrec->plan_r2c, fftrec->dev_data, fftrec->dev_cplx_data)) != CUFFT_SUCCESS) {
                spDebug(10, "spExecRealFFTFCUFFT", "cufftExecD2Z failed: %d\n", res);
                return SP_FALSE;
            }
            if ((err = cudaThreadSynchronize()) != cudaSuccess) {
                spDebug(10, "spExecRealFFTFCUFFT", "cudaThreadSynchronize failed: %d\n", err);
                return SP_FALSE;
            }

            if (power_flag == SP_TRUE) {
                spRealFFTToPowerDoubleCUFFT(fftrec, exponent);
                for (k = 0; k < fftrec->length; k++) {
                    data[k] = (float)fftrec->data[k];
                }
            } else {
                for (n = 0; n < fftrec->batch; n++) {
                    frame = n * fftrec->fftl;
                    bins = n * hfftl;
                    data[frame] = (float)cuCreal(fftrec->cplx_data[bins]);
                    for (k = 1; k < fftl2; k++) {
                        data[frame + k * 2] = (float)cuCreal(fftrec->cplx_data[bins + k]);
                        data[frame + k * 2 + 1] = (float)cuCimag(fftrec->cplx_data[bins + k]);
                    }
                    if (fftl2 > 0) {
                        data[frame + 1] = (float)cuCreal(fftrec->cplx_data[bins + fftl2]);
                    }
                }
            }
        }
    } else {
        if (inv) {
            for (n = 0; n < fftrec->batch; n++) {
                frame = n * fftrec->fftl;
                bins = n * hfftl;
                fftrec->cplx_dataf[bins] = make_cuFloatComplex(data[frame], 0.0f);
                for (k = 1; k < fftl2; k++) {
                    fftrec->cplx_dataf[bins + k]
                        = make_cuFloatComplex(data[frame + k * 2], data[frame + k * 2 + 1]);
                }
                if (fftl2 > 0) {
                    fftrec->cplx_dataf[bins + fftl2] = make_cuFloatComplex(data[frame + 1], 0.0f);
                }
            }

            if ((res = cufftExecC2R(fftrec->plan_c2r, fftrec->dev_cplx_dataf, fftrec->dev_dataf)) != CUFFT_SUCCESS) {
                spDebug(10, "spExecRealFFTFCUFFT", "cufftExecC2R failed: %d\n", res);
                return SP_FALSE;
            }
            if ((err = cudaThreadSynchronize()) != cudaSuccess) {
                spDebug(10, "spExecRealFFTFCUFFT", "cudaThreadSynchronize failed: %d\n", err);
                return SP_FALSE;
            }

            spVectorScalarDivCUFFT(fftrec, (float)fftrec->fftl);

            if ((err = cudaMemcpy(data, fftrec->dataf, fftrec->length * sizeof(float),
                                  cudaMemcpyHostToHost)) != cudaSuccess) {
                spDebug(10, "spExecRealFFTFCUFFT", "cudaMemcpy failed: %d\n", err);
                return SP_FALSE;
            }
        } else {
            if ((err = cudaMemcpy(fftrec->dataf, data, fftrec->length * sizeof(float),
                                  cudaMemcpyHostToHost)) != cudaSuccess) {
                spDebug(10, "spExecRealFFTFCUFFT", "cudaMemcpy failed: %d\n", err);
                return SP_FALSE;
            }

            if ((res = cufftExecR2C(fftrec->plan_r2c, fftrec->dev_dataf, fftrec->dev_cplx_dataf)) != CUFFT_SUCCESS) {
                spDebug(10, "spExecRealFFTFCUFFT", "cufftExecR2C failed: %d\n", res);
                return SP_FALSE;
            }
            if ((err = cudaThreadSynchronize()) != cudaSuccess) {
                spDebug(10, "spExecRealFFTFCUFFT", "cudaThreadSynchronize failed: %d\n", err);
                return SP_FALSE;
            }

            if (power_flag == SP_TRUE) {
                spRealFFTToPowerCUFFT(fftrec, exponent);
                if ((err = cudaMemcpy(data, fftrec->dataf, fftrec->length * sizeof(float),
                                      cudaMemcpyHostToHost)) != cudaSuccess) {
                    spDebug(10, "spExecRealFFTFCUFFT", "cudaMemcpy failed: %d\n", err);
                    return SP_FALSE;
                }
            } else {
                for (n = 0; n < fftrec->batch; n++) {
                    frame = n * fftrec->fftl;
                    bins = n * hfftl;
                    data[frame] = cuCrealf(fftrec->cplx_dataf[bins]);
                    for (k = 1; k < fftl2; k++) {
                        data[frame + k * 2] = cuCrealf(fftrec->cplx_dataf[bins + k]);
                        data[frame + k * 2 + 1] = cuCimagf(fftrec->cplx_dataf[bins + k]);
                    }
                    if (fftl2 > 0) {
                        data[frame + 1] = cuCrealf(fftrec->cplx_dataf[bins + fftl2]);
                    }
                }
            }
        }
    }

    return SP_TRUE;
}

/*
 * Real FFT entry point for float data: forwards to execRealFFTFCUFFT()
 * with the power conversion switched off (power_flag = SP_FALSE,
 * exponent = 0) and returns its result.
 */
static spBool spExecRealFFTFCUFFT(void *instance, void *ifftrec, float *data, int inv)
{
    const spBool no_power = SP_FALSE;
    const float unused_exponent = 0.0;

    return execRealFFTFCUFFT(instance, ifftrec, data, inv, no_power, unused_exponent);
}

/*
 * Power-spectrum entry point for float data: forwards to
 * execRealFFTFCUFFT() as a forward transform (inv = 0) with
 * power_flag = SP_TRUE and the caller's exponent, and returns its result.
 */
static spBool spExecFFTPowerFCUFFT(void *instance, void *ifftrec, float *data, float exponent)
{
    const int forward = 0;
    const spBool want_power = SP_TRUE;

    return execRealFFTFCUFFT(instance, ifftrec, data, forward, want_power, exponent);
}

/*
 * Shared implementation behind spExecRealFFTCUFFT() and
 * spExecFFTPowerCUFFT(): batched real FFT on double data, in place.
 *
 * data holds fftrec->batch frames of fftrec->fftl doubles.  On the
 * spectrum side each frame uses a packed layout:
 *   data[0]               real part of bin 0
 *   data[1]               real part of bin fftl/2
 *   data[2k], data[2k+1]  real and imaginary part of bin k, 0 < k < fftl/2
 * The cuFFT side stores fftl/2 + 1 complex bins per frame.
 *
 * inv != 0 : unpack the spectrum, run the c2r plan, pass the result to
 *            spVectorScalarDiv[Double]CUFFT() with fftl, copy it to data.
 * inv == 0 : copy data in and run the r2c plan; then either pack the
 *            spectrum into data (power_flag != SP_TRUE) or call
 *            spRealFFTToPower[Double]CUFFT(fftrec, exponent) and copy the
 *            record's real buffer to data (power_flag == SP_TRUE).
 * The spVectorScalarDiv* and spRealFFTToPower* helpers are defined outside
 * this file.
 *
 * A record built for single precision (cplx_data == NULL) converts between
 * double and float on the host side.
 *
 * The bin fftl/2 slot is only touched when fftl/2 > 0, so a transform
 * length of 1 does not access memory past the frame.
 *
 * Returns SP_FALSE when a cuFFT call, the device synchronisation after it,
 * or a cudaMemcpy() fails; SP_TRUE otherwise.
 *
 * NOTE(review): where a host loop reads the mapped buffer right after one
 * of the helpers, correctness relies on that helper having completed when
 * it returns -- confirm.
 */
static spBool execRealFFTCUFFT(void *instance, void *ifftrec, double *data, int inv,
                               spBool power_flag, double exponent)
{
    long k, n, hfftl, fftl2;
    long frame, bins;               /* start offsets of frame n in data / in the complex buffer */
    cufftResult res;
    cudaError_t err;
    spFFTRecCUFFT fftrec = (spFFTRecCUFFT)ifftrec;

    fftl2 = fftrec->fftl / 2;
    hfftl = fftl2 + 1;

    if (fftrec->cplx_data != NULL) {
        if (inv) {
            for (n = 0; n < fftrec->batch; n++) {
                frame = n * fftrec->fftl;
                bins = n * hfftl;
                fftrec->cplx_data[bins] = make_cuDoubleComplex(data[frame], 0.0);
                for (k = 1; k < fftl2; k++) {
                    fftrec->cplx_data[bins + k]
                        = make_cuDoubleComplex(data[frame + k * 2], data[frame + k * 2 + 1]);
                }
                if (fftl2 > 0) {
                    fftrec->cplx_data[bins + fftl2] = make_cuDoubleComplex(data[frame + 1], 0.0);
                }
            }

            if ((res = cufftExecZ2D(fftrec->plan_c2r, fftrec->dev_cplx_data, fftrec->dev_data)) != CUFFT_SUCCESS) {
                spDebug(10, "spExecRealFFTCUFFT", "cufftExecZ2D failed: %d\n", res);
                return SP_FALSE;
            }
            if ((err = cudaThreadSynchronize()) != cudaSuccess) {
                spDebug(10, "spExecRealFFTCUFFT", "cudaThreadSynchronize failed: %d\n", err);
                return SP_FALSE;
            }

            spVectorScalarDivDoubleCUFFT(fftrec, (double)fftrec->fftl);

            if ((err = cudaMemcpy(data, fftrec->data, fftrec->length * sizeof(double),
                                  cudaMemcpyHostToHost)) != cudaSuccess) {
                spDebug(10, "spExecRealFFTCUFFT", "cudaMemcpy failed: %d\n", err);
                return SP_FALSE;
            }
        } else {
            if ((err = cudaMemcpy(fftrec->data, data, fftrec->length * sizeof(double),
                                  cudaMemcpyHostToHost)) != cudaSuccess) {
                spDebug(10, "spExecRealFFTCUFFT", "cudaMemcpy failed: %d\n", err);
                return SP_FALSE;
            }

            if ((res = cufftExecD2Z(fftrec->plan_r2c, fftrec->dev_data, fftrec->dev_cplx_data)) != CUFFT_SUCCESS) {
                spDebug(10, "spExecRealFFTCUFFT", "cufftExecD2Z failed: %d\n", res);
                return SP_FALSE;
            }
            if ((err = cudaThreadSynchronize()) != cudaSuccess) {
                spDebug(10, "spExecRealFFTCUFFT", "cudaThreadSynchronize failed: %d\n", err);
                return SP_FALSE;
            }

            if (power_flag == SP_TRUE) {
                spRealFFTToPowerDoubleCUFFT(fftrec, exponent);
                if ((err = cudaMemcpy(data, fftrec->data, fftrec->length * sizeof(double),
                                      cudaMemcpyHostToHost)) != cudaSuccess) {
                    spDebug(10, "spExecRealFFTCUFFT", "cudaMemcpy failed: %d\n", err);
                    return SP_FALSE;
                }
            } else {
                for (n = 0; n < fftrec->batch; n++) {
                    frame = n * fftrec->fftl;
                    bins = n * hfftl;
                    data[frame] = cuCreal(fftrec->cplx_data[bins]);
                    for (k = 1; k < fftl2; k++) {
                        data[frame + k * 2] = cuCreal(fftrec->cplx_data[bins + k]);
                        data[frame + k * 2 + 1] = cuCimag(fftrec->cplx_data[bins + k]);
                    }
                    if (fftl2 > 0) {
                        data[frame + 1] = cuCreal(fftrec->cplx_data[bins + fftl2]);
                    }
                }
            }
        }
    } else {
        if (inv) {
            for (n = 0; n < fftrec->batch; n++) {
                frame = n * fftrec->fftl;
                bins = n * hfftl;
                fftrec->cplx_dataf[bins] = make_cuFloatComplex((float)data[frame], 0.0f);
                for (k = 1; k < fftl2; k++) {
                    fftrec->cplx_dataf[bins + k]
                        = make_cuFloatComplex((float)data[frame + k * 2],
                                              (float)data[frame + k * 2 + 1]);
                }
                if (fftl2 > 0) {
                    fftrec->cplx_dataf[bins + fftl2] = make_cuFloatComplex((float)data[frame + 1], 0.0f);
                }
            }

            if ((res = cufftExecC2R(fftrec->plan_c2r, fftrec->dev_cplx_dataf, fftrec->dev_dataf)) != CUFFT_SUCCESS) {
                spDebug(10, "spExecRealFFTCUFFT", "cufftExecC2R failed: %d\n", res);
                return SP_FALSE;
            }
            if ((err = cudaThreadSynchronize()) != cudaSuccess) {
                spDebug(10, "spExecRealFFTCUFFT", "cudaThreadSynchronize failed: %d\n", err);
                return SP_FALSE;
            }

            spVectorScalarDivCUFFT(fftrec, (float)fftrec->fftl);

            for (k = 0; k < fftrec->length; k++) {
                data[k] = (double)fftrec->dataf[k];
            }
        } else {
            for (k = 0; k < fftrec->length; k++) {
                fftrec->dataf[k] = (float)data[k];
            }

            if ((res = cufftExecR2C(fftrec->plan_r2c, fftrec->dev_dataf, fftrec->dev_cplx_dataf)) != CUFFT_SUCCESS) {
                spDebug(10, "spExecRealFFTCUFFT", "cufftExecR2C failed: %d\n", res);
                return SP_FALSE;
            }
            if ((err = cudaThreadSynchronize()) != cudaSuccess) {
                spDebug(10, "spExecRealFFTCUFFT", "cudaThreadSynchronize failed: %d\n", err);
                return SP_FALSE;
            }

            if (power_flag == SP_TRUE) {
                spRealFFTToPowerCUFFT(fftrec, (float)exponent);
                for (k = 0; k < fftrec->length; k++) {
                    data[k] = (double)fftrec->dataf[k];
                }
            } else {
                for (n = 0; n < fftrec->batch; n++) {
                    frame = n * fftrec->fftl;
                    bins = n * hfftl;
                    data[frame] = cuCrealf(fftrec->cplx_dataf[bins]);
                    for (k = 1; k < fftl2; k++) {
                        data[frame + k * 2] = cuCrealf(fftrec->cplx_dataf[bins + k]);
                        data[frame + k * 2 + 1] = cuCimagf(fftrec->cplx_dataf[bins + k]);
                    }
                    if (fftl2 > 0) {
                        data[frame + 1] = cuCrealf(fftrec->cplx_dataf[bins + fftl2]);
                    }
                }
            }
        }
    }

    return SP_TRUE;
}

/*
 * Real FFT entry point for double data: forwards to execRealFFTCUFFT()
 * with the power conversion switched off (power_flag = SP_FALSE,
 * exponent = 0.0) and returns its result.
 */
static spBool spExecRealFFTCUFFT(void *instance, void *ifftrec, double *data, int inv)
{
    const spBool no_power = SP_FALSE;
    const double unused_exponent = 0.0;

    return execRealFFTCUFFT(instance, ifftrec, data, inv, no_power, unused_exponent);
}

/*
 * Power-spectrum entry point for double data: forwards to
 * execRealFFTCUFFT() as a forward transform (inv = 0) with
 * power_flag = SP_TRUE and the caller's exponent, and returns its result.
 */
static spBool spExecFFTPowerCUFFT(void *instance, void *ifftrec, double *data, double exponent)
{
    const int forward = 0;
    const spBool want_power = SP_TRUE;

    return execRealFFTCUFFT(instance, ifftrec, data, forward, want_power, exponent);
}

/*
 * Plugin descriptor handed out by spGetPluginRec().
 *
 * This is a positional initializer: the meaning of each slot is fixed by
 * the declaration of spFFTPluginRec, which lives outside this file, so the
 * order of the entries must not be changed.  The group comments below only
 * describe what is stored in each group of slots; NULL marks a slot for
 * which this plugin provides nothing.
 */
static spFFTPluginRec sp_fft_plugin_cufft = {
    NULL,
    NULL,

    /* plugin kind, name strings, priority and capability flags */
    SP_PLUGIN_FFT,
    "CUFFT",
    1,
    SP_PLUGIN_PRIORITY_MIDDLE,
    SP_PLUGIN_CAPS_THREAD_SAFE | SP_FFT_PLUGIN_CAPS_SUPPORT_BATCH,	/* caps */
    /* module-level init / free functions */
    spInitCUFFT,
    spFreeCUFFT,
    "CUFFT",
    "CUFFT Plugin  Version 0.1",
    
    /* per-instance create / destroy functions */
    spInitInstanceCUFFT,
    spFreeInstanceCUFFT,
    NULL,
    NULL,

    NULL,
    NULL,
    NULL,

    /* precision query */
    spIsPrecisionSupportedCUFFT,

    NULL,
    NULL,
	
    /* FFT record create / destroy functions */
    spInitFFTCUFFT,
    spFreeFFTCUFFT,
    NULL,
	
    /* complex FFT: float version, then double version */
    spExecFFTFCUFFT,
    spExecFFTCUFFT,
	
    /* real FFT: float version, then double version */
    spExecRealFFTFCUFFT,
    spExecRealFFTCUFFT,
	
    /* power spectrum: float version, then double version */
    spExecFFTPowerFCUFFT,
    spExecFFTPowerCUFFT,
};

/*
 * Exported entry point of the plugin: returns the address of the static
 * CUFFT descriptor, viewed as a generic spPluginRec.
 */
spPluginExport spPluginRec *spGetPluginRec(void)
{
    spPluginRec *rec = (spPluginRec *)&sp_fft_plugin_cufft;

    return rec;
}
