Package com.jogamp.opencl.demos.fft

Source Code of com.jogamp.opencl.demos.fft.CLFFTPlan

/*
* sample is based on Apple's FFT example.
* initial port to JOCL Copyright 2010 Michael Zucchi
*
* TODO: The execute functions may allocate/use temporary memory per call hence they are
* neither thread safe nor multiple-queue safe.  Perhaps some per-queue allocation
* system would suffice.
* TODO: The dynamic device-dependent variables should be dynamic and device-dependent and not
* hardcoded.  Where possible.
* TODO: CPU support?
*/
package com.jogamp.opencl.demos.fft;

import com.jogamp.opencl.CLBuffer;
import com.jogamp.opencl.CLCommandQueue;
import com.jogamp.opencl.CLContext;
import com.jogamp.opencl.CLDevice;
import com.jogamp.opencl.CLEventList;
import com.jogamp.opencl.CLKernel;
import com.jogamp.opencl.CLMemory;
import com.jogamp.opencl.CLMemory.Mem;
import com.jogamp.opencl.CLProgram;
import java.io.OutputStream;
import java.io.PrintStream;
import java.nio.FloatBuffer;
import java.util.LinkedList;

/**
*
* @author notzed
*/
public class CLFFTPlan {

    private class CLFFTDim3 {

        int x;
        int y;
        int z;

        CLFFTDim3(int x, int y, int z) {
            this.x = x;
            this.y = y;
            this.z = z;
        }

        CLFFTDim3(int[] size) {
            x = size[0];
            y = size.length > 1 ? size[1] : 1;
            z = size.length > 2 ? size[2] : 1;
        }
    }

    private class WorkDimensions {

        int batchSize;
        long gWorkItems;
        long lWorkItems;

        public WorkDimensions(int batchSize, long gWorkItems, long lWorkItems) {
            this.batchSize = batchSize;
            this.gWorkItems = gWorkItems;
            this.lWorkItems = lWorkItems;
        }
    }

    private class fftPadding {

        int lMemSize;
        int offset;
        int midPad;

        public fftPadding(int lMemSize, int offset, int midPad) {
            this.lMemSize = lMemSize;
            this.offset = offset;
            this.midPad = midPad;
        }
    }

    class CLFFTKernelInfo {

        CLKernel kernel;
        String kernel_name;
        int lmem_size;
        int num_workgroups;
        int num_xforms_per_workgroup;
        int num_workitems_per_workgroup;
        CLFFTKernelDir dir;
        boolean in_place_possible;
    };

    public enum CLFFTDirection {

        Forward {

            int value() {
                return -1;
            }
        },
        Inverse {

            int value() {
                return 1;
            }
        };

        abstract int value();
    };

    enum CLFFTKernelDir {
        X,
        Y,
        Z
    };

    public enum CLFFTDataFormat {
        SplitComplexFormat,
        InterleavedComplexFormat,
    }

    // context in which fft resources are created and kernels are executed
    CLContext context;
    // size of signal
    CLFFTDim3 size;
    // dimension of transform ... must be either 1, 2 or 3
    int dim;
    // data format ... must be either interleaved or plannar
    CLFFTDataFormat format;
    // string containing kernel source. Generated at runtime based on
    // size, dim, format and other parameters
    StringBuilder kernel_string;
    // CL program containing source and kernel this particular
    // size, dim, data format
    CLProgram program;
    // linked list of kernels which needs to be executed for this fft
    LinkedList<CLFFTKernelInfo> kernel_list;
    // twist kernel for virtualizing fft of very large sizes that do not
    // fit in GPU global memory
    CLKernel twist_kernel;
    // flag indicating if temporary intermediate buffer is needed or not.
    // this depends on fft kernels being executed and if transform is
    // in-place or out-of-place. e.g. Local memory fft (say 1D 1024 ...
    // one that does not require global transpose do not need temporary buffer)
    // 2D 1024x1024 out-of-place fft however do require intermediate buffer.
    // If temp buffer is needed, its allocation is lazy i.e. its not allocated
    // until its needed
    boolean temp_buffer_needed;
    // Batch size is runtime parameter and size of temporary buffer (if needed)
    // depends on batch size. Allocation of temporary buffer is lazy i.e. its
    // only created when needed. Once its created at first call of clFFT_Executexxx
    // it is not allocated next time if next time clFFT_Executexxx is called with
    // batch size different than the first call. last_batch_size caches the last
    // batch size with which this plan is used so that we dont keep allocating/deallocating
    // temp buffer if same batch size is used again and again.
    int last_batch_size;
    // temporary buffer for interleaved plan
    CLMemory tempmemobj;
    // temporary buffer for planner plan. Only one of tempmemobj or
    // (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending
    // data format of plan (plannar or interleaved)
    CLMemory tempmemobj_real, tempmemobj_imag;
    // Maximum size of signal for which local memory transposed based
    // fft is sufficient i.e. no global mem transpose (communication)
    // is needed
    int max_localmem_fft_size;
    // Maximum work items per work group allowed. This, along with max_radix below controls
    // maximum local memory being used by fft kernels of this plan. Set to 256 by default
    int max_work_item_per_workgroup;
    // Maximum base radix for local memory fft ... this controls the maximum register
    // space used by work items. Currently defaults to 16
    int max_radix;
    // Device depended parameter that tells how many work-items need to be read consecutive
    // values to make sure global memory access by work-items of a work-group result in
    // coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16
    int min_mem_coalesce_width;
    // Number of local memory banks. This is used to geneate kernel with local memory
    // transposes with appropriate padding to avoid bank conflicts to local memory
    // e.g. on NVidia it is 16.
    int num_local_mem_banks;

    public class InvalidContextException extends Exception {
    }

    /**
     * Create a new FFT plan.
     *
     * Use the matching executeInterleaved() or executePlanar() depending on the dataFormat specified.
     * @param context
     * @param sizes Array of sizes for each dimension.  The length of array defines how many dimensions there are.
     * @param dataFormat Data format, InterleavedComplex (array of complex) or SplitComplex (separate planar arrays).
     * @throws zephyr.cl.CLFFTPlan.InvalidContextException
     */
    public CLFFTPlan(CLContext context, int[] sizes, CLFFTDataFormat dataFormat) throws InvalidContextException {
        int i;
        int err;
        boolean isPow2 = true;
        String kString;
        int num_devices;
        boolean gpu_found = false;
        CLDevice[] devices;
        int ret_size;

        if (sizes.length < 1 || sizes.length > 3) {
            throw new IllegalArgumentException("Dimensions must be between 1 and 3");
        }

        this.size = new CLFFTDim3(sizes);

        isPow2 |= (this.size.x != 0) && (((this.size.x - 1) & this.size.x) == 0);
        isPow2 |= (this.size.y != 0) && (((this.size.y - 1) & this.size.y) == 0);
        isPow2 |= (this.size.z != 0) && (((this.size.z - 1) & this.size.z) == 0);

        if (!isPow2) {
            throw new IllegalArgumentException("Sizes must be power of two");
        }

        //if( (dim == FFT_1D && (size.y != 1 || size.z != 1)) || (dim == FFT_2D && size.z != 1) )
        //  ERR_MACRO(CL_INVALID_VALUE);

        this.context = context;
        //clRetainContext(context);
        //this.size = size;
        this.dim = sizes.length;
        this.format = dataFormat;
        //this.kernel_list = 0;
        //this.twist_kernel = 0;
        //this.program = 0;
        this.temp_buffer_needed = false;
        this.last_batch_size = 0;
        //this.tempmemobj = 0;
        //this.tempmemobj_real = 0;
        //this.tempmemobj_imag = 0;
        this.max_localmem_fft_size = 2048;
        this.max_work_item_per_workgroup = 256;
        this.max_radix = 16;
        this.min_mem_coalesce_width = 16;
        this.num_local_mem_banks = 16;

        boolean done = false;

        // this seems pretty shit, can't it tell this before building it?
        while (!done) {
            kernel_list = new LinkedList<CLFFTKernelInfo>();

            this.kernel_string = new StringBuilder();
            getBlockConfigAndKernelString();

            this.program = context.createProgram(kernel_string.toString());

            devices = context.getDevices();
            for (i = 0; i < devices.length; i++) {
                CLDevice dev = devices[i];

                if (dev.getType() == CLDevice.Type.GPU) {
                    gpu_found = true;
                    program.build("-cl-mad-enable", dev);
                }
            }

            if (!gpu_found) {
                throw new InvalidContextException();
            }

            createKernelList();

            // we created program and kernels based on "some max work group size (default 256)" ... this work group size
            // may be larger than what kernel may execute with ... if thats the case we need to regenerate the kernel source
            // setting this as limit i.e max group size and rebuild.
            if (getPatchingRequired(devices)) {
                this.max_work_item_per_workgroup = (int) getMaxKernelWorkGroupSize(devices);
                release();
            } else {
                done = true;
            }
        }
    }

    /**
     * Release system resources.
     */
    public void release() {
        program.release();
    }

    void allocateTemporaryBufferInterleaved(int batchSize) {
        if (temp_buffer_needed && last_batch_size != batchSize) {
            last_batch_size = batchSize;
            int tmpLength = size.x * size.y * size.z * batchSize * 2 * 4; // sizeof(float)

            if (tempmemobj != null) {
                tempmemobj.release();
            }

            tempmemobj = context.createFloatBuffer(tmpLength, Mem.READ_WRITE);
        }
    }

    /**
     * Calculate FFT on interleaved complex data.
     * @param queue
     * @param batchSize How many instances to calculate.  Use 1 for a single FFT.
     * @param dir Direction of calculation, Forward or Inverse.
     * @param data_in Input buffer.
     * @param data_out Output buffer.  May be the same as data_in for in-place transform.
     * @param condition Condition to wait for.  NOT YET IMPLEMENTED.
     * @param event Event to wait for completion.  NOT YET IMPLEMENTED.
     */
    public void executeInterleaved(CLCommandQueue queue, int batchSize, CLFFTDirection dir,
            CLBuffer<FloatBuffer> data_in, CLBuffer<FloatBuffer> data_out,
            CLEventList condition, CLEventList event) {
        int s;
        if (format != format.InterleavedComplexFormat) {
            throw new IllegalArgumentException();
        }

        WorkDimensions wd;
        boolean inPlaceDone = false;

        boolean isInPlace = data_in == data_out;

        allocateTemporaryBufferInterleaved(batchSize);

        CLMemory[] memObj = new CLMemory[3];
        memObj[0] = data_in;
        memObj[1] = data_out;
        memObj[2] = tempmemobj;
        int numKernels = kernel_list.size();

        boolean numKernelsOdd = (numKernels & 1) != 0;
        int currRead = 0;
        int currWrite = 1;

        // at least one external dram shuffle (transpose) required
        if (temp_buffer_needed) {
            // in-place transform
            if (isInPlace) {
                inPlaceDone = false;
                currRead = 1;
                currWrite = 2;
            } else {
                currWrite = (numKernels & 1) == 1 ? 1 : 2;
            }

            for (CLFFTKernelInfo kernelInfo : kernel_list) {
                if (isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo.in_place_possible) {
                    currWrite = currRead;
                    inPlaceDone = true;
                }

                s = batchSize;
                wd = getKernelWorkDimensions(kernelInfo, s);
                kernelInfo.kernel.setArg(0, memObj[currRead]);
                kernelInfo.kernel.setArg(1, memObj[currWrite]);
                kernelInfo.kernel.setArg(2, dir.value());
                kernelInfo.kernel.setArg(3, wd.batchSize);
                queue.put2DRangeKernel(kernelInfo.kernel, 0, 0, wd.gWorkItems, 1, wd.lWorkItems, 1);
                //queue.put1DRangeKernel(kernelInfo.kernel, 0, wd.gWorkItems, wd.lWorkItems);

                //System.out.printf("execute %s size %d,%d batch %d, dir %d, currread %d currwrite %d\size", kernelInfo.kernel_name, wd.gWorkItems, wd.lWorkItems, wd.batchSize, dir.value(), currRead, currWrite);

                currRead = (currWrite == 1) ? 1 : 2;
                currWrite = (currWrite == 1) ? 2 : 1;
            }
        } else {
            // no dram shuffle (transpose required) transform
            // all kernels can execute in-place.
            for (CLFFTKernelInfo kernelInfo : kernel_list) {
                {
                    s = batchSize;
                    wd = getKernelWorkDimensions(kernelInfo, s);

                    kernelInfo.kernel.setArg(0, memObj[currRead]);
                    kernelInfo.kernel.setArg(1, memObj[currWrite]);
                    kernelInfo.kernel.setArg(2, dir.value());
                    kernelInfo.kernel.setArg(3, wd.batchSize);
                    queue.put2DRangeKernel(kernelInfo.kernel, 0, 0, wd.gWorkItems, 1, wd.lWorkItems, 1);

                    //System.out.printf("execute %s size %d,%d batch %d, currread %d currwrite %d\size", kernelInfo.kernel_name, wd.gWorkItems, wd.lWorkItems, wd.batchSize, currRead, currWrite);

                    currRead = 1;
                    currWrite = 1;
                }
            }
        }
    }

    void allocateTemporaryBufferPlanar(int batchSize) {
        if (temp_buffer_needed && last_batch_size != batchSize) {
            last_batch_size = batchSize;
            int tmpLength = size.x * size.y * size.z * batchSize * 4; //sizeof(cl_float);

            if (tempmemobj_real != null) {
                tempmemobj_real.release();
            }

            if (tempmemobj_imag != null) {
                tempmemobj_imag.release();
            }

            tempmemobj_real = context.createFloatBuffer(tmpLength, Mem.READ_WRITE);
            tempmemobj_imag = context.createFloatBuffer(tmpLength, Mem.READ_WRITE);
        }
    }

    /**
     * Calculate FFT of planar data.
     * @param queue
     * @param batchSize
     * @param dir
     * @param data_in_real
     * @param data_in_imag
     * @param data_out_real
     * @param data_out_imag
     * @param contition
     * @param event
     */
    public void executePlanar(CLCommandQueue queue, int batchSize, CLFFTDirection dir,
            CLBuffer<FloatBuffer> data_in_real, CLBuffer<FloatBuffer> data_in_imag, CLBuffer<FloatBuffer> data_out_real, CLBuffer<FloatBuffer> data_out_imag,
            CLEventList contition, CLEventList event) {
        int s;

        if (format != format.SplitComplexFormat) {
            throw new IllegalArgumentException();
        }

        int err;
        WorkDimensions wd;
        boolean inPlaceDone = false;

        boolean isInPlace = ((data_in_real == data_out_real) && (data_in_imag == data_out_imag));

        allocateTemporaryBufferPlanar(batchSize);

        CLMemory[] memObj_real = new CLMemory[3];
        CLMemory[] memObj_imag = new CLMemory[3];
        memObj_real[0] = data_in_real;
        memObj_real[1] = data_out_real;
        memObj_real[2] = tempmemobj_real;
        memObj_imag[0] = data_in_imag;
        memObj_imag[1] = data_out_imag;
        memObj_imag[2] = tempmemobj_imag;

        int numKernels = kernel_list.size();

        boolean numKernelsOdd = (numKernels & 1) == 1;
        int currRead = 0;
        int currWrite = 1;

        // at least one external dram shuffle (transpose) required
        if (temp_buffer_needed) {
            // in-place transform
            if (isInPlace) {
                inPlaceDone = false;
                currRead = 1;
                currWrite = 2;
            } else {
                currWrite = (numKernels & 1) == 1 ? 1 : 2;
            }

            for (CLFFTKernelInfo kernelInfo : kernel_list) {
                if (isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo.in_place_possible) {
                    currWrite = currRead;
                    inPlaceDone = true;
                }

                s = batchSize;
                wd = getKernelWorkDimensions(kernelInfo, s);

                kernelInfo.kernel.setArg(0, memObj_real[currRead]);
                kernelInfo.kernel.setArg(1, memObj_imag[currRead]);
                kernelInfo.kernel.setArg(2, memObj_real[currWrite]);
                kernelInfo.kernel.setArg(3, memObj_imag[currWrite]);
                kernelInfo.kernel.setArg(4, dir.value());
                kernelInfo.kernel.setArg(5, wd.batchSize);

                queue.put1DRangeKernel(kernelInfo.kernel, 0, wd.gWorkItems, wd.lWorkItems);


                currRead = (currWrite == 1) ? 1 : 2;
                currWrite = (currWrite == 1) ? 2 : 1;

            }
        } // no dram shuffle (transpose required) transform
        else {

            for (CLFFTKernelInfo kernelInfo : kernel_list) {
                s = batchSize;
                wd = getKernelWorkDimensions(kernelInfo, s);

                kernelInfo.kernel.setArg(0, memObj_real[currRead]);
                kernelInfo.kernel.setArg(1, memObj_imag[currRead]);
                kernelInfo.kernel.setArg(2, memObj_real[currWrite]);
                kernelInfo.kernel.setArg(3, memObj_imag[currWrite]);
                kernelInfo.kernel.setArg(4, dir.value());
                kernelInfo.kernel.setArg(5, wd.batchSize);

                queue.put1DRangeKernel(kernelInfo.kernel, 0, wd.gWorkItems, wd.lWorkItems);
                currRead = 1;
                currWrite = 1;
            }
        }
    }

    /**
     * Dump the planner result to the output stream.
     * @param os if null, System.out is used.
     */
    public void dumpPlan(OutputStream os) {
        PrintStream out = os == null ? System.out : new PrintStream(os);

        for (CLFFTKernelInfo kInfo : kernel_list) {
            int s = 1;
            WorkDimensions wd = getKernelWorkDimensions(kInfo, s);
            out.printf("Run kernel %s with global dim = {%d*BatchSize}, local dim={%d}\n", kInfo.kernel_name, wd.gWorkItems, wd.lWorkItems);
        }
        out.printf("%s\n", kernel_string.toString());
    }

    WorkDimensions getKernelWorkDimensions(CLFFTKernelInfo kernelInfo, int batchSize) {
        int lWorkItems = kernelInfo.num_workitems_per_workgroup;
        int numWorkGroups = kernelInfo.num_workgroups;
        int numXFormsPerWG = kernelInfo.num_xforms_per_workgroup;

        switch (kernelInfo.dir) {
            case X:
                batchSize *= (size.y * size.z);
                numWorkGroups = ((batchSize % numXFormsPerWG) != 0) ? (batchSize / numXFormsPerWG + 1) : (batchSize / numXFormsPerWG);
                numWorkGroups *= kernelInfo.num_workgroups;
                break;
            case Y:
                batchSize *= size.z;
                numWorkGroups *= batchSize;
                break;
            case Z:
                numWorkGroups *= batchSize;
                break;
        }

        return new WorkDimensions(batchSize, numWorkGroups * lWorkItems, lWorkItems);
    }

    /*
     *
     * Kernel building/customisation code follows
     *
     */
    private void getBlockConfigAndKernelString() {
        this.temp_buffer_needed = false;
        this.kernel_string.append(baseKernels);

        if (this.format == CLFFTDataFormat.SplitComplexFormat) {
            this.kernel_string.append(twistKernelPlannar);
        } else {
            this.kernel_string.append(twistKernelInterleaved);
        }

        switch (this.dim) {
            case 1:
                FFT1D(CLFFTKernelDir.X);
                break;

            case 2:
                FFT1D(CLFFTKernelDir.X);
                FFT1D(CLFFTKernelDir.Y);
                break;

            case 3:
                FFT1D(CLFFTKernelDir.X);
                FFT1D(CLFFTKernelDir.Y);
                FFT1D(CLFFTKernelDir.Z);
                break;

            default:
                return;
        }

        this.temp_buffer_needed = false;
        for (CLFFTKernelInfo kInfo : this.kernel_list) {
            this.temp_buffer_needed |= !kInfo.in_place_possible;
        }
    }

    private void createKernelList() {
        CLFFTKernelInfo kern;
        for (CLFFTKernelInfo kinfo : this.kernel_list) {
            kinfo.kernel = program.createCLKernel(kinfo.kernel_name);
        }

        if (format == format.SplitComplexFormat) {
            twist_kernel = program.createCLKernel("clFFT_1DTwistSplit");
        } else {
            twist_kernel = program.createCLKernel("clFFT_1DTwistInterleaved");
        }
    }

    private boolean getPatchingRequired(CLDevice[] devices) {
        int i;
        for (i = 0; i < devices.length; i++) {
            for (CLFFTKernelInfo kInfo : kernel_list) {
                if (kInfo.kernel.getWorkGroupSize(devices[i]) < kInfo.num_workitems_per_workgroup) {
                    return true;
                }
            }
        }
        return false;
    }

    long getMaxKernelWorkGroupSize(CLDevice[] devices) {
        long max_wg_size = Integer.MAX_VALUE;
        int i;

        for (i = 0; i < devices.length; i++) {
            for (CLFFTKernelInfo kInfo : kernel_list) {
                long wg_size = kInfo.kernel.getWorkGroupSize(devices[i]);

                if (max_wg_size > wg_size) {
                    max_wg_size = wg_size;
                }
            }
        }

        return max_wg_size;
    }

    int log2(int x) {
        return 32 - Integer.numberOfLeadingZeros(x - 1);
    }

// For any size, this function decomposes size into factors for loacal memory tranpose
// based fft. Factors (radices) are sorted such that the first one (radixArray[0])
// is the largest. This base radix determines the number of registers used by each
// work item and product of remaining radices determine the size of work group needed.
// To make things concrete with and example, suppose size = 1024. It is decomposed into
// 1024 = 16 x 16 x 4. Hence kernel uses float2 a[16], for local in-register fft and
// needs 16 x 4 = 64 work items per work group. So kernel first performance 64 length
// 16 ffts (64 work items working in parallel) following by transpose using local
// memory followed by again 64 length 16 ffts followed by transpose using local memory
// followed by 256 length 4 ffts. For the last step since with size of work group is
// 64 and each work item can array for 16 values, 64 work items can compute 256 length
// 4 ffts by each work item computing 4 length 4 ffts.
// Similarly for size = 2048 = 8 x 8 x 8 x 4, each work group has 8 x 8 x 4 = 256 work
// iterms which each computes 256 (in-parallel) length 8 ffts in-register, followed
// by transpose using local memory, followed by 256 length 8 in-register ffts, followed
// by transpose using local memory, followed by 256 length 8 in-register ffts, followed
// by transpose using local memory, followed by 512 length 4 in-register ffts. Again,
// for the last step, each work item computes two length 4 in-register ffts and thus
// 256 work items are needed to compute all 512 ffts.
// For size = 32 = 8 x 4, 4 work items first compute 4 in-register
// lenth 8 ffts, followed by transpose using local memory followed by 8 in-register
// length 4 ffts, where each work item computes two length 4 ffts thus 4 work items
// can compute 8 length 4 ffts. However if work group size of say 64 is choosen,
// each work group can compute 64/ 4 = 16 size 32 ffts (batched transform).
// Users can play with these parameters to figure what gives best performance on
// their particular device i.e. some device have less register space thus using
// smaller base radix can avoid spilling ... some has small local memory thus
// using smaller work group size may be required etc
    int getRadixArray(int n, int[] radixArray, int maxRadix) {
        if (maxRadix > 1) {
            maxRadix = Math.min(n, maxRadix);
            int cnt = 0;
            while (n > maxRadix) {
                radixArray[cnt++] = maxRadix;
                n /= maxRadix;
            }
            radixArray[cnt++] = n;
            return cnt;
        }

        switch (n) {
            case 2:
                radixArray[0] = 2;
                return 1;

            case 4:
                radixArray[0] = 4;
                return 1;

            case 8:
                radixArray[0] = 8;
                return 1;

            case 16:
                radixArray[0] = 8;
                radixArray[1] = 2;
                return 2;

            case 32:
                radixArray[0] = 8;
                radixArray[1] = 4;
                return 2;

            case 64:
                radixArray[0] = 8;
                radixArray[1] = 8;
                return 2;

            case 128:
                radixArray[0] = 8;
                radixArray[1] = 4;
                radixArray[2] = 4;
                return 3;

            case 256:
                radixArray[0] = 4;
                radixArray[1] = 4;
                radixArray[2] = 4;
                radixArray[3] = 4;
                return 4;

            case 512:
                radixArray[0] = 8;
                radixArray[1] = 8;
                radixArray[2] = 8;
                return 3;

            case 1024:
                radixArray[0] = 16;
                radixArray[1] = 16;
                radixArray[2] = 4;
                return 3;
            case 2048:
                radixArray[0] = 8;
                radixArray[1] = 8;
                radixArray[2] = 8;
                radixArray[3] = 4;
                return 4;
            default:
                return 0;
        }
    }

    void insertHeader(StringBuilder kernelString, String kernelName, CLFFTDataFormat dataFormat) {
        if (dataFormat == CLFFTPlan.CLFFTDataFormat.SplitComplexFormat) {
            kernelString.append("__kernel void ").append(kernelName).append("(__global float *in_real, __global float *in_imag, __global float *out_real, __global float *out_imag, int dir, int S)\n");
        } else {
            kernelString.append("__kernel void ").append(kernelName).append("(__global float2 *in, __global float2 *out, int dir, int S)\n");
        }
    }

    void insertVariables(StringBuilder kStream, int maxRadix) {
        kStream.append("    int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l;\n");
        kStream.append("    int s, ii, jj, offset;\n");
        kStream.append("    float2 w;\n");
        kStream.append("    float ang, angf, ang1;\n");
        kStream.append("    __local float *lMemStore, *lMemLoad;\n");
        kStream.append("    float2 a[").append(maxRadix).append("];\n");
        kStream.append("    int lId = get_local_id( 0 );\n");
        kStream.append("    int groupId = get_group_id( 0 );\n");
    }

    void formattedLoad(StringBuilder kernelString, int aIndex, int gIndex, CLFFTDataFormat dataFormat) {
        if (dataFormat == dataFormat.InterleavedComplexFormat) {
            kernelString.append("        a[").append(aIndex).append("] = in[").append(gIndex).append("];\n");
        } else {
            kernelString.append("        a[").append(aIndex).append("].x = in_real[").append(gIndex).append("];\n");
            kernelString.append("        a[").append(aIndex).append("].y = in_imag[").append(gIndex).append("];\n");
        }
    }

    void formattedStore(StringBuilder kernelString, int aIndex, int gIndex, CLFFTDataFormat dataFormat) {
        if (dataFormat == dataFormat.InterleavedComplexFormat) {
            kernelString.append("        out[").append(gIndex).append("] = a[").append(aIndex).append("];\n");
        } else {
            kernelString.append("        out_real[").append(gIndex).append("] = a[").append(aIndex).append("].x;\n");
            kernelString.append("        out_imag[").append(gIndex).append("] = a[").append(aIndex).append("].y;\n");
        }
    }

    int insertGlobalLoadsAndTranspose(StringBuilder kernelString, int N, int numWorkItemsPerXForm, int numXFormsPerWG, int R0, int mem_coalesce_width, CLFFTDataFormat dataFormat) {
        int log2NumWorkItemsPerXForm = (int) log2(numWorkItemsPerXForm);
        int groupSize = numWorkItemsPerXForm * numXFormsPerWG;
        int i, j;
        int lMemSize = 0;

        if (numXFormsPerWG > 1) {
            kernelString.append("        s = S & ").append(numXFormsPerWG - 1).append(";\n");
        }

        if (numWorkItemsPerXForm >= mem_coalesce_width) {
            if (numXFormsPerWG > 1) {
                kernelString.append("    ii = lId & ").append(numWorkItemsPerXForm - 1).append(";\n");
                kernelString.append("    jj = lId >> ").append(log2NumWorkItemsPerXForm).append(";\n");
                kernelString.append("    if( !s || (groupId < get_num_groups(0)-1) || (jj < s) ) {\n");
                kernelString.append("        offset = mad24( mad24(groupId, ").append(numXFormsPerWG).append(", jj), ").append(N).append(", ii );\n");
                if (dataFormat == dataFormat.InterleavedComplexFormat) {
                    kernelString.append("        in += offset;\n");
                    kernelString.append("        out += offset;\n");
                } else {
                    kernelString.append("        in_real += offset;\n");
                    kernelString.append("        in_imag += offset;\n");
                    kernelString.append("        out_real += offset;\n");
                    kernelString.append("        out_imag += offset;\n");
                }
                for (i = 0; i < R0; i++) {
                    formattedLoad(kernelString, i, i * numWorkItemsPerXForm, dataFormat);
                }
                kernelString.append("    }\n");
            } else {
                kernelString.append("    ii = lId;\n");
                kernelString.append("    jj = 0;\n");
                kernelString.append("    offset =  mad24(groupId, ").append(N).append(", ii);\n");
                if (dataFormat == dataFormat.InterleavedComplexFormat) {
                    kernelString.append("        in += offset;\n");
                    kernelString.append("        out += offset;\n");
                } else {
                    kernelString.append("        in_real += offset;\n");
                    kernelString.append("        in_imag += offset;\n");
                    kernelString.append("        out_real += offset;\n");
                    kernelString.append("        out_imag += offset;\n");
                }
                for (i = 0; i < R0; i++) {
                    formattedLoad(kernelString, i, i * numWorkItemsPerXForm, dataFormat);
                }
            }
        } else if (N >= mem_coalesce_width) {
            int numInnerIter = N / mem_coalesce_width;
            int numOuterIter = numXFormsPerWG / (groupSize / mem_coalesce_width);

            kernelString.append("    ii = lId & ").append(mem_coalesce_width - 1).append(";\n");
            kernelString.append("    jj = lId >> ").append((int) log2(mem_coalesce_width)).append(";\n");
            kernelString.append("    lMemStore = sMem + mad24( jj, ").append(N + numWorkItemsPerXForm).append(", ii );\n");
            kernelString.append("    offset = mad24( groupId, ").append(numXFormsPerWG).append(", jj);\n");
            kernelString.append("    offset = mad24( offset, ").append(N).append(", ii );\n");
            if (dataFormat == dataFormat.InterleavedComplexFormat) {
                kernelString.append("        in += offset;\n");
                kernelString.append("        out += offset;\n");
            } else {
                kernelString.append("        in_real += offset;\n");
                kernelString.append("        in_imag += offset;\n");
                kernelString.append("        out_real += offset;\n");
                kernelString.append("        out_imag += offset;\n");
            }

            kernelString.append("if((groupId == get_num_groups(0)-1) && s) {\n");
            for (i = 0; i < numOuterIter; i++) {
                kernelString.append("    if( jj < s ) {\n");
                for (j = 0; j < numInnerIter; j++) {
                    formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
                }
                kernelString.append("    }\n");
                if (i != numOuterIter - 1) {
                    kernelString.append("    jj += ").append(groupSize / mem_coalesce_width).append(";\n");
                }
            }
            kernelString.append("}\n ");
            kernelString.append("else {\n");
            for (i = 0; i < numOuterIter; i++) {
                for (j = 0; j < numInnerIter; j++) {
                    formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
                }
            }
            kernelString.append("}\n");

            kernelString.append("    ii = lId & ").append(numWorkItemsPerXForm - 1).append(";\n");
            kernelString.append("    jj = lId >> ").append(log2NumWorkItemsPerXForm).append(";\n");
            kernelString.append("    lMemLoad  = sMem + mad24( jj, ").append(N + numWorkItemsPerXForm).append(", ii);\n");

            for (i = 0; i < numOuterIter; i++) {
                for (j = 0; j < numInnerIter; j++) {
                    kernelString.append("    lMemStore[").append(j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * (N + numWorkItemsPerXForm)).append("] = a[").append(i * numInnerIter + j).append("].x;\n");
                }
            }
            kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");

            for (i = 0; i < R0; i++) {
                kernelString.append("    a[").append(i).append("].x = lMemLoad[").append(i * numWorkItemsPerXForm).append("];\n");
            }
            kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");

            for (i = 0; i < numOuterIter; i++) {
                for (j = 0; j < numInnerIter; j++) {
                    kernelString.append("    lMemStore[").append(j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * (N + numWorkItemsPerXForm)).append("] = a[").append(i * numInnerIter + j).append("].y;\n");
                }
            }
            kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");

            for (i = 0; i < R0; i++) {
                kernelString.append("    a[").append(i).append("].y = lMemLoad[").append(i * numWorkItemsPerXForm).append("];\n");
            }
            kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");

            lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
        } else {
            kernelString.append("    offset = mad24( groupId,  ").append(N * numXFormsPerWG).append(", lId );\n");
            if (dataFormat == dataFormat.InterleavedComplexFormat) {
                kernelString.append("        in += offset;\n");
                kernelString.append("        out += offset;\n");
            } else {
                kernelString.append("        in_real += offset;\n");
                kernelString.append("        in_imag += offset;\n");
                kernelString.append("        out_real += offset;\n");
                kernelString.append("        out_imag += offset;\n");
            }

            kernelString.append("    ii = lId & ").append(N - 1).append(";\n");
            kernelString.append("    jj = lId >> ").append((int) log2(N)).append(";\n");
            kernelString.append("    lMemStore = sMem + mad24( jj, ").append(N + numWorkItemsPerXForm).append(", ii );\n");

            kernelString.append("if((groupId == get_num_groups(0)-1) && s) {\n");
            for (i = 0; i < R0; i++) {
                kernelString.append("    if(jj < s )\n");
                formattedLoad(kernelString, i, i * groupSize, dataFormat);
                if (i != R0 - 1) {
                    kernelString.append("    jj += ").append(groupSize / N).append(";\n");
                }
            }
            kernelString.append("}\n");
            kernelString.append("else {\n");
            for (i = 0; i < R0; i++) {
                formattedLoad(kernelString, i, i * groupSize, dataFormat);
            }
            kernelString.append("}\n");

            if (numWorkItemsPerXForm > 1) {
                kernelString.append("    ii = lId & ").append(numWorkItemsPerXForm - 1).append(";\n");
                kernelString.append("    jj = lId >> ").append(log2NumWorkItemsPerXForm).append(";\n");
                kernelString.append("    lMemLoad = sMem + mad24( jj, ").append(N + numWorkItemsPerXForm).append(", ii );\n");
            } else {
                kernelString.append("    ii = 0;\n");
                kernelString.append("    jj = lId;\n");
                kernelString.append("    lMemLoad = sMem + mul24( jj, ").append(N + numWorkItemsPerXForm).append(");\n");
            }


            for (i = 0; i < R0; i++) {
                kernelString.append("    lMemStore[").append(i * (groupSize / N) * (N + numWorkItemsPerXForm)).append("] = a[").append(i).append("].x;\n");
            }
            kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");

            for (i = 0; i < R0; i++) {
                kernelString.append("    a[").append(i).append("].x = lMemLoad[").append(i * numWorkItemsPerXForm).append("];\n");
            }
            kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");

            for (i = 0; i < R0; i++) {
                kernelString.append("    lMemStore[").append(i * (groupSize / N) * (N + numWorkItemsPerXForm)).append("] = a[").append(i).append("].y;\n");
            }
            kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");

            for (i = 0; i < R0; i++) {
                kernelString.append("    a[").append(i).append("].y = lMemLoad[").append(i * numWorkItemsPerXForm).append("];\n");
            }
            kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");

            lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
        }

        return lMemSize;
    }

    int insertGlobalStoresAndTranspose(StringBuilder kernelString, int N, int maxRadix, int Nr, int numWorkItemsPerXForm, int numXFormsPerWG, int mem_coalesce_width, CLFFTDataFormat dataFormat) {
        int groupSize = numWorkItemsPerXForm * numXFormsPerWG;
        int i, j, k, ind;
        int lMemSize = 0;
        int numIter = maxRadix / Nr;
        String indent = "";

        if (numWorkItemsPerXForm >= mem_coalesce_width) {
            if (numXFormsPerWG > 1) {
                kernelString.append("    if( !s || (groupId < get_num_groups(0)-1) || (jj < s) ) {\n");
                indent = ("    ");
            }
            for (i = 0; i < maxRadix; i++) {
                j = i % numIter;
                k = i / numIter;
                ind = j * Nr + k;
                formattedStore(kernelString, ind, i * numWorkItemsPerXForm, dataFormat);
            }
            if (numXFormsPerWG > 1) {
                kernelString.append("    }\n");
            }
        } else if (N >= mem_coalesce_width) {
            int numInnerIter = N / mem_coalesce_width;
            int numOuterIter = numXFormsPerWG / (groupSize / mem_coalesce_width);

            kernelString.append("    lMemLoad  = sMem + mad24( jj, ").append(N + numWorkItemsPerXForm).append(", ii );\n");
            kernelString.append("    ii = lId & ").append(mem_coalesce_width - 1).append(";\n");
            kernelString.append("    jj = lId >> ").append((int) log2(mem_coalesce_width)).append(";\n");
            kernelString.append("    lMemStore = sMem + mad24( jj,").append(N + numWorkItemsPerXForm).append(", ii );\n");

            for (i = 0; i < maxRadix; i++) {
                j = i % numIter;
                k = i / numIter;
                ind = j * Nr + k;
                kernelString.append("    lMemLoad[").append(i * numWorkItemsPerXForm).append("] = a[").append(ind).append("].x;\n");
            }
            kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");

            for (i = 0; i < numOuterIter; i++) {
                for (j = 0; j < numInnerIter; j++) {
                    kernelString.append("    a[").append(i * numInnerIter + j).append("].x = lMemStore[").append(j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * (N + numWorkItemsPerXForm)).append("];\n");
                }
            }
            kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");

            for (i = 0; i < maxRadix; i++) {
                j = i % numIter;
                k = i / numIter;
                ind = j * Nr + k;
                kernelString.append("    lMemLoad[").append(i * numWorkItemsPerXForm).append("] = a[").append(ind).append("].y;\n");
            }
            kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");

            for (i = 0; i < numOuterIter; i++) {
                for (j = 0; j < numInnerIter; j++) {
                    kernelString.append("    a[").append(i * numInnerIter + j).append("].y = lMemStore[").append(j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * (N + numWorkItemsPerXForm)).append("];\n");
                }
            }
            kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");

            kernelString.append("if((groupId == get_num_groups(0)-1) && s) {\n");
            for (i = 0; i < numOuterIter; i++) {
                kernelString.append("    if( jj < s ) {\n");
                for (j = 0; j < numInnerIter; j++) {
                    formattedStore(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
                }
                kernelString.append("    }\n");
                if (i != numOuterIter - 1) {
                    kernelString.append("    jj += ").append(groupSize / mem_coalesce_width).append(";\n");
                }
            }
            kernelString.append("}\n");
            kernelString.append("else {\n");
            for (i = 0; i < numOuterIter; i++) {
                for (j = 0; j < numInnerIter; j++) {
                    formattedStore(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
                }
            }
            kernelString.append("}\n");

            lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
        } else {
            kernelString.append("    lMemLoad  = sMem + mad24( jj,").append(N + numWorkItemsPerXForm).append(", ii );\n");

            kernelString.append("    ii = lId & ").append(N - 1).append(";\n");
            kernelString.append("    jj = lId >> ").append((int) log2(N)).append(";\n");
            kernelString.append("    lMemStore = sMem + mad24( jj,").append(N + numWorkItemsPerXForm).append(", ii );\n");

            for (i = 0; i < maxRadix; i++) {
                j = i % numIter;
                k = i / numIter;
                ind = j * Nr + k;
                kernelString.append("    lMemLoad[").append(i * numWorkItemsPerXForm).append("] = a[").append(ind).append("].x;\n");
            }
            kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");

            for (i = 0; i < maxRadix; i++) {
                kernelString.append("    a[").append(i).append("].x = lMemStore[").append(i * (groupSize / N) * (N + numWorkItemsPerXForm)).append("];\n");
            }
            kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");

            for (i = 0; i < maxRadix; i++) {
                j = i % numIter;
                k = i / numIter;
                ind = j * Nr + k;
                kernelString.append("    lMemLoad[").append(i * numWorkItemsPerXForm).append("] = a[").append(ind).append("].y;\n");
            }
            kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");

            for (i = 0; i < maxRadix; i++) {
                kernelString.append("    a[").append(i).append("].y = lMemStore[").append(i * (groupSize / N) * (N + numWorkItemsPerXForm)).append("];\n");
            }
            kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");

            kernelString.append("if((groupId == get_num_groups(0)-1) && s) {\n");
            for (i = 0; i < maxRadix; i++) {
                kernelString.append("    if(jj < s ) {\n");
                formattedStore(kernelString, i, i * groupSize, dataFormat);
                kernelString.append("    }\n");
                if (i != maxRadix - 1) {
                    kernelString.append("    jj +=").append(groupSize / N).append(";\n");
                }
            }
            kernelString.append("}\n");
            kernelString.append("else {\n");
            for (i = 0; i < maxRadix; i++) {
                formattedStore(kernelString, i, i * groupSize, dataFormat);
            }
            kernelString.append("}\n");

            lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
        }

        return lMemSize;
    }

    void insertfftKernel(StringBuilder kernelString, int Nr, int numIter) {
        int i;
        for (i = 0; i < numIter; i++) {
            kernelString.append("    fftKernel").append(Nr).append("(a+").append(i * Nr).append(", dir);\n");
        }
    }

    void insertTwiddleKernel(StringBuilder kernelString, int Nr, int numIter, int Nprev, int len, int numWorkItemsPerXForm) {
        int z, k;
        int logNPrev = log2(Nprev);

        for (z = 0; z < numIter; z++) {
            if (z == 0) {
                if (Nprev > 1) {
                    kernelString.append("    angf = (float) (ii >> ").append(logNPrev).append(");\n");
                } else {
                    kernelString.append("    angf = (float) ii;\n");
                }
            } else {
                if (Nprev > 1) {
                    kernelString.append("    angf = (float) ((").append(z * numWorkItemsPerXForm).append(" + ii) >>").append(logNPrev).append(");\n");
                } else {
                    kernelString.append("    angf = (float) (").append(z * numWorkItemsPerXForm).append(" + ii);\n");
                }
            }

            for (k = 1; k < Nr; k++) {
                int ind = z * Nr + k;
                //float fac =  (float) (2.0 * M_PI * (double) k / (double) len);
                kernelString.append("    ang = dir * ( 2.0f * M_PI * ").append(k).append(".0f / ").append(len).append(".0f )").append(" * angf;\n");
                kernelString.append("    w = (float2)(native_cos(ang), native_sin(ang));\n");
                kernelString.append("    a[").append(ind).append("] = complexMul(a[").append(ind).append("], w);\n");
            }
        }
    }

    fftPadding getPadding(int numWorkItemsPerXForm, int Nprev, int numWorkItemsReq, int numXFormsPerWG, int Nr, int numBanks) {
        int offset, midPad;

        if ((numWorkItemsPerXForm <= Nprev) || (Nprev >= numBanks)) {
            offset = 0;
        } else {
            int numRowsReq = ((numWorkItemsPerXForm < numBanks) ? numWorkItemsPerXForm : numBanks) / Nprev;
            int numColsReq = 1;
            if (numRowsReq > Nr) {
                numColsReq = numRowsReq / Nr;
            }
            numColsReq = Nprev * numColsReq;
            offset = numColsReq;
        }

        if (numWorkItemsPerXForm >= numBanks || numXFormsPerWG == 1) {
            midPad = 0;
        } else {
            int bankNum = ((numWorkItemsReq + offset) * Nr) & (numBanks - 1);
            if (bankNum >= numWorkItemsPerXForm) {
                midPad = 0;
            } else {
                midPad = numWorkItemsPerXForm - bankNum;
            }
        }

        int lMemSize = (numWorkItemsReq + offset) * Nr * numXFormsPerWG + midPad * (numXFormsPerWG - 1);
        return new fftPadding(lMemSize, offset, midPad);
    }

    void insertLocalStores(StringBuilder kernelString, int numIter, int Nr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, String comp) {
        int z, k;

        for (z = 0; z < numIter; z++) {
            for (k = 0; k < Nr; k++) {
                int index = k * (numWorkItemsReq + offset) + z * numWorkItemsPerXForm;
                kernelString.append("    lMemStore[").append(index).append("] = a[").append(z * Nr + k).append("].").append(comp).append(";\n");
            }
        }
        kernelString.append("    barrier(CLK_LOCAL_MEM_FENCE);\n");
    }

    void insertLocalLoads(StringBuilder kernelString, int n, int Nr, int Nrn, int Nprev, int Ncurr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, String comp) {
        int numWorkItemsReqN = n / Nrn;
        int interBlockHNum = Math.max(Nprev / numWorkItemsPerXForm, 1);
        int interBlockHStride = numWorkItemsPerXForm;
        int vertWidth = Math.max(numWorkItemsPerXForm / Nprev, 1);
        vertWidth = Math.min(vertWidth, Nr);
        int vertNum = Nr / vertWidth;
        int vertStride = (n / Nr + offset) * vertWidth;
        int iter = Math.max(numWorkItemsReqN / numWorkItemsPerXForm, 1);
        int intraBlockHStride = (numWorkItemsPerXForm / (Nprev * Nr)) > 1 ? (numWorkItemsPerXForm / (Nprev * Nr)) : 1;
        intraBlockHStride *= Nprev;

        int stride = numWorkItemsReq / Nrn;
        int i;
        for (i = 0; i < iter; i++) {
            int ii = i / (interBlockHNum * vertNum);
            int zz = i % (interBlockHNum * vertNum);
            int jj = zz % interBlockHNum;
            int kk = zz / interBlockHNum;
            int z;
            for (z = 0; z < Nrn; z++) {
                int st = kk * vertStride + jj * interBlockHStride + ii * intraBlockHStride + z * stride;
                kernelString.append("    a[").append(i * Nrn + z).append("].").append(comp).append(" = lMemLoad[").append(st).append("];\n");
            }
        }
        kernelString.append("    barrier(CLK_LOCAL_MEM_FENCE);\n");
    }

    void insertLocalLoadIndexArithmatic(StringBuilder kernelString, int Nprev, int Nr, int numWorkItemsReq, int numWorkItemsPerXForm, int numXFormsPerWG, int offset, int midPad) {
        int Ncurr = Nprev * Nr;
        int logNcurr = log2(Ncurr);
        int logNprev = log2(Nprev);
        int incr = (numWorkItemsReq + offset) * Nr + midPad;

        if (Ncurr < numWorkItemsPerXForm) {
            if (Nprev == 1) {
                kernelString.append("    j = ii & ").append(Ncurr - 1).append(";\n");
            } else {
                kernelString.append("    j = (ii & ").append(Ncurr - 1).append(") >> ").append(logNprev).append(";\n");
            }

            if (Nprev == 1) {
                kernelString.append("    i = ii >> ").append(logNcurr).append(";\n");
            } else {
                kernelString.append("    i = mad24(ii >> ").append(logNcurr).append(", ").append(Nprev).append(", ii & ").append(Nprev - 1).append(");\n");
            }
        } else {
            if (Nprev == 1) {
                kernelString.append("    j = ii;\n");
            } else {
                kernelString.append("    j = ii >> ").append(logNprev).append(";\n");
            }
            if (Nprev == 1) {
                kernelString.append("    i = 0;\n");
            } else {
                kernelString.append("    i = ii & ").append(Nprev - 1).append(";\n");
            }
        }

        if (numXFormsPerWG > 1) {
            kernelString.append("    i = mad24(jj, ").append(incr).append(", i);\n");
        }

        kernelString.append("    lMemLoad = sMem + mad24(j, ").append(numWorkItemsReq + offset).append(", i);\n");
    }

    void insertLocalStoreIndexArithmatic(StringBuilder kernelString, int numWorkItemsReq, int numXFormsPerWG, int Nr, int offset, int midPad) {
        if (numXFormsPerWG == 1) {
            kernelString.append("    lMemStore = sMem + ii;\n");
        } else {
            kernelString.append("    lMemStore = sMem + mad24(jj, ").append((numWorkItemsReq + offset) * Nr + midPad).append(", ii);\n");
        }
    }

    void createLocalMemfftKernelString() {
        int[] radixArray = new int[10];
        int numRadix;

        int n = this.size.x;

        assert (n <= this.max_work_item_per_workgroup * this.max_radix);

        numRadix = getRadixArray(n, radixArray, 0);
        assert (numRadix > 0);

        if (n / radixArray[0] > this.max_work_item_per_workgroup) {
            numRadix = getRadixArray(n, radixArray, this.max_radix);
        }

        assert (radixArray[0] <= this.max_radix);
        assert (n / radixArray[0] <= this.max_work_item_per_workgroup);

        int tmpLen = 1;
        int i;
        for (i = 0; i < numRadix; i++) {
            assert ((radixArray[i] != 0) && !(((radixArray[i] - 1) != 0) & (radixArray[i] != 0)));
            tmpLen *= radixArray[i];
        }
        assert (tmpLen == n);

        //int offset, midPad;
        StringBuilder localString = new StringBuilder();
        String kernelName;

        CLFFTDataFormat dataFormat = this.format;
        StringBuilder kernelString = this.kernel_string;

        int kCount = kernel_list.size();

        kernelName = "fft" + (kCount);

        CLFFTKernelInfo kInfo = new CLFFTKernelInfo();
        kernel_list.add(kInfo);
        //kInfo.kernel = null;
        //kInfo.lmem_size = 0;
        //kInfo.num_workgroups = 0;
        //kInfo.num_workitems_per_workgroup = 0;
        kInfo.dir = CLFFTKernelDir.X;
        kInfo.in_place_possible = true;
        //kInfo.next = null;
        kInfo.kernel_name = kernelName;

        int numWorkItemsPerXForm = n / radixArray[0];
        int numWorkItemsPerWG = numWorkItemsPerXForm <= 64 ? 64 : numWorkItemsPerXForm;
        assert (numWorkItemsPerWG <= this.max_work_item_per_workgroup);
        int numXFormsPerWG = numWorkItemsPerWG / numWorkItemsPerXForm;
        kInfo.num_workgroups = 1;
        kInfo.num_xforms_per_workgroup = numXFormsPerWG;
        kInfo.num_workitems_per_workgroup = numWorkItemsPerWG;

        int[] N = radixArray;
        int maxRadix = N[0];
        int lMemSize = 0;

        insertVariables(localString, maxRadix);

        lMemSize = insertGlobalLoadsAndTranspose(localString, n, numWorkItemsPerXForm, numXFormsPerWG, maxRadix, this.min_mem_coalesce_width, dataFormat);
        kInfo.lmem_size = (lMemSize > kInfo.lmem_size) ? lMemSize : kInfo.lmem_size;

        String xcomp = "x";
        String ycomp = "y";

        int Nprev = 1;
        int len = n;
        int r;
        for (r = 0; r < numRadix; r++) {
            int numIter = N[0] / N[r];
            int numWorkItemsReq = n / N[r];
            int Ncurr = Nprev * N[r];
            insertfftKernel(localString, N[r], numIter);

            if (r < (numRadix - 1)) {
                fftPadding pad;

                insertTwiddleKernel(localString, N[r], numIter, Nprev, len, numWorkItemsPerXForm);
                pad = getPadding(numWorkItemsPerXForm, Nprev, numWorkItemsReq, numXFormsPerWG, N[r], this.num_local_mem_banks);
                kInfo.lmem_size = (pad.lMemSize > kInfo.lmem_size) ? pad.lMemSize : kInfo.lmem_size;
                insertLocalStoreIndexArithmatic(localString, numWorkItemsReq, numXFormsPerWG, N[r], pad.offset, pad.midPad);
                insertLocalLoadIndexArithmatic(localString, Nprev, N[r], numWorkItemsReq, numWorkItemsPerXForm, numXFormsPerWG, pad.offset, pad.midPad);
                insertLocalStores(localString, numIter, N[r], numWorkItemsPerXForm, numWorkItemsReq, pad.offset, xcomp);
                insertLocalLoads(localString, n, N[r], N[r + 1], Nprev, Ncurr, numWorkItemsPerXForm, numWorkItemsReq, pad.offset, xcomp);
                insertLocalStores(localString, numIter, N[r], numWorkItemsPerXForm, numWorkItemsReq, pad.offset, ycomp);
                insertLocalLoads(localString, n, N[r], N[r + 1], Nprev, Ncurr, numWorkItemsPerXForm, numWorkItemsReq, pad.offset, ycomp);
                Nprev = Ncurr;
                len = len / N[r];
            }
        }

        lMemSize = insertGlobalStoresAndTranspose(localString, n, maxRadix, N[numRadix - 1], numWorkItemsPerXForm, numXFormsPerWG, this.min_mem_coalesce_width, dataFormat);
        kInfo.lmem_size = (lMemSize > kInfo.lmem_size) ? lMemSize : kInfo.lmem_size;

        insertHeader(kernelString, kernelName, dataFormat);
        kernelString.append("{\n");
        if (kInfo.lmem_size > 0) {
            kernelString.append("    __local float sMem[").append(kInfo.lmem_size).append("];\n");
        }
        kernelString.append(localString);
        kernelString.append("}\n");
    }

// For size larger than what can be computed using local memory fft, global transposes
// multiple kernel launces is needed. For these sizes, size can be decomposed using
// much larger base radices i.e. say size = 262144 = 128 x 64 x 32. Thus three kernel
// launches will be needed, first computing 64 x 32, length 128 ffts, second computing
// 128 x 32 length 64 ffts, and finally a kernel computing 128 x 64 length 32 ffts.
// Each of these base radices can futher be divided into factors so that each of these
// base ffts can be computed within one kernel launch using in-register ffts and local
// memory transposes i.e for the first kernel above which computes 64 x 32 ffts on length
// 128, 128 can be decomposed into 128 = 16 x 8 i.e. 8 work items can compute 8 length
// 16 ffts followed by transpose using local memory followed by each of these eight
// work items computing 2 length 8 ffts thus computing 16 length 8 ffts in total. This
// means only 8 work items are needed for computing one length 128 fft. If we choose
// work group size of say 64, we can compute 64/8 = 8 length 128 ffts within one
// work group. Since we need to compute 64 x 32 length 128 ffts in first kernel, this
// means we need to launch 64 x 32 / 8 = 256 work groups with 64 work items in each
// work group where each work group is computing 8 length 128 ffts where each length
// 128 fft is computed by 8 work items. Same logic can be applied to other two kernels
// in this example. Users can play with difference base radices and difference
// decompositions of base radices to generates different kernels and see which gives
// best performance. Following function is just fixed to use 128 as base radix
    int getGlobalRadixInfo(int n, int[] radix, int[] R1, int[] R2) {
        int baseRadix = Math.min(n, 128);

        int numR = 0;
        int N = n;
        while (N > baseRadix) {
            N /= baseRadix;
            numR++;
        }

        for (int i = 0; i < numR; i++) {
            radix[i] = baseRadix;
        }

        radix[numR] = N;
        numR++;

        for (int i = 0; i < numR; i++) {
            int B = radix[i];
            if (B <= 8) {
                R1[i] = B;
                R2[i] = 1;
                continue;
            }

            int r1 = 2;
            int r2 = B / r1;
            while (r2 > r1) {
                r1 *= 2;
                r2 = B / r1;
            }
            R1[i] = r1;
            R2[i] = r2;
        }
        return numR;
    }

    void createGlobalFFTKernelString(int n, int BS, CLFFTKernelDir dir, int vertBS) {
        int i, j, k, t;
        int[] radixArr = new int[10];
        int[] R1Arr = new int[10];
        int[] R2Arr = new int[10];
        int radix, R1, R2;
        int numRadices;

        int maxThreadsPerBlock = this.max_work_item_per_workgroup;
        int maxArrayLen = this.max_radix;
        int batchSize = this.min_mem_coalesce_width;
        CLFFTDataFormat dataFormat = this.format;
        boolean vertical = (dir == dir.X) ? false : true;

        numRadices = getGlobalRadixInfo(n, radixArr, R1Arr, R2Arr);

        int numPasses = numRadices;

        StringBuilder localString = new StringBuilder();
        String kernelName;
        StringBuilder kernelString = this.kernel_string;

        int kCount = kernel_list.size();
        //cl_fft_kernel_info **kInfo = &this.kernel_list;
        //int kCount = 0;

        //while(*kInfo)
        //{
        //  kInfo = &kInfo.next;
        //  kCount++;
        //}

        int N = n;
        int m = (int) log2(n);
        int Rinit = vertical ? BS : 1;
        batchSize = vertical ? Math.min(BS, batchSize) : batchSize;
        int passNum;

        for (passNum = 0; passNum < numPasses; passNum++) {

            localString.setLength(0);
            //kernelName.clear();

            radix = radixArr[passNum];
            R1 = R1Arr[passNum];
            R2 = R2Arr[passNum];

            int strideI = Rinit;
            for (i = 0; i < numPasses; i++) {
                if (i != passNum) {
                    strideI *= radixArr[i];
                }
            }

            int strideO = Rinit;
            for (i = 0; i < passNum; i++) {
                strideO *= radixArr[i];
            }

            int threadsPerXForm = R2;
            batchSize = R2 == 1 ? this.max_work_item_per_workgroup : batchSize;
            batchSize = Math.min(batchSize, strideI);
            int threadsPerBlock = batchSize * threadsPerXForm;
            threadsPerBlock = Math.min(threadsPerBlock, maxThreadsPerBlock);
            batchSize = threadsPerBlock / threadsPerXForm;
            assert (R2 <= R1);
            assert (R1 * R2 == radix);
            assert (R1 <= maxArrayLen);
            assert (threadsPerBlock <= maxThreadsPerBlock);

            int numIter = R1 / R2;
            int gInInc = threadsPerBlock / batchSize;


            int lgStrideO = log2(strideO);
            int numBlocksPerXForm = strideI / batchSize;
            int numBlocks = numBlocksPerXForm;
            if (!vertical) {
                numBlocks *= BS;
            } else {
                numBlocks *= vertBS;
            }

            kernelName = "fft" + (kCount);
            CLFFTKernelInfo kInfo = new CLFFTKernelInfo();
            if (R2 == 1) {
                kInfo.lmem_size = 0;
            } else {
                if (strideO == 1) {
                    kInfo.lmem_size = (radix + 1) * batchSize;
                } else {
                    kInfo.lmem_size = threadsPerBlock * R1;
                }
            }
            kInfo.num_workgroups = numBlocks;
            kInfo.num_xforms_per_workgroup = 1;
            kInfo.num_workitems_per_workgroup = threadsPerBlock;
            kInfo.dir = dir;
            kInfo.in_place_possible = ((passNum == (numPasses - 1)) && ((numPasses & 1) != 0));
            //kInfo.next = NULL;
            kInfo.kernel_name = kernelName;

            insertVariables(localString, R1);

            if (vertical) {
                localString.append("xNum = groupId >> ").append((int) log2(numBlocksPerXForm)).append(";\n");
                localString.append("groupId = groupId & ").append(numBlocksPerXForm - 1).append(";\n");
                localString.append("indexIn = mad24(groupId, ").append(batchSize).append(", xNum << ").append((int) log2(n * BS)).append(");\n");
                localString.append("tid = mul24(groupId, ").append(batchSize).append(");\n");
                localString.append("i = tid >> ").append(lgStrideO).append(";\n");
                localString.append("j = tid & ").append(strideO - 1).append(";\n");
                int stride = radix * Rinit;
                for (i = 0; i < passNum; i++) {
                    stride *= radixArr[i];
                }
                localString.append("indexOut = mad24(i, ").append(stride).append(", j + ").append("(xNum << ").append((int) log2(n * BS)).append("));\n");
                localString.append("bNum = groupId;\n");
            } else {
                int lgNumBlocksPerXForm = log2(numBlocksPerXForm);
                localString.append("bNum = groupId & ").append(numBlocksPerXForm - 1).append(";\n");
                localString.append("xNum = groupId >> ").append(lgNumBlocksPerXForm).append(";\n");
                localString.append("indexIn = mul24(bNum, ").append(batchSize).append(");\n");
                localString.append("tid = indexIn;\n");
                localString.append("i = tid >> ").append(lgStrideO).append(";\n");
                localString.append("j = tid & ").append(strideO - 1).append(";\n");
                int stride = radix * Rinit;
                for (i = 0; i < passNum; i++) {
                    stride *= radixArr[i];
                }
                localString.append("indexOut = mad24(i, ").append(stride).append(", j);\n");
                localString.append("indexIn += (xNum << ").append(m).append(");\n");
                localString.append("indexOut += (xNum << ").append(m).append(");\n");
            }

            // Load Data
            int lgBatchSize = log2(batchSize);
            localString.append("tid = lId;\n");
            localString.append("i = tid & ").append(batchSize - 1).append(";\n");
            localString.append("j = tid >> ").append(lgBatchSize).append(";\n");
            localString.append("indexIn += mad24(j, ").append(strideI).append(", i);\n");

            if (dataFormat == dataFormat.SplitComplexFormat) {
                localString.append("in_real += indexIn;\n");
                localString.append("in_imag += indexIn;\n");
                for (j = 0; j < R1; j++) {
                    localString.append("a[").append(j).append("].x = in_real[").append(j * gInInc * strideI).append("];\n");
                }
                for (j = 0; j < R1; j++) {
                    localString.append("a[").append(j).append("].y = in_imag[").append(j * gInInc * strideI).append("];\n");
                }
            } else {
                localString.append("in += indexIn;\n");
                for (j = 0; j < R1; j++) {
                    localString.append("a[").append(j).append("] = in[").append(j * gInInc * strideI).append("];\n");
                }
            }

            localString.append("fftKernel").append(R1).append("(a, dir);\n");

            if (R2 > 1) {
                // twiddle
                for (k = 1; k < R1; k++) {
                    localString.append("ang = dir*(2.0f*M_PI*").append(k).append("/").append(radix).append(")*j;\n");
                    localString.append("w = (float2)(native_cos(ang), native_sin(ang));\n");
                    localString.append("a[").append(k).append("] = complexMul(a[").append(k).append("], w);\n");
                }

                // shuffle
                numIter = R1 / R2;
                localString.append("indexIn = mad24(j, ").append(threadsPerBlock * numIter).append(", i);\n");
                localString.append("lMemStore = sMem + tid;\n");
                localString.append("lMemLoad = sMem + indexIn;\n");
                for (k = 0; k < R1; k++) {
                    localString.append("lMemStore[").append(k * threadsPerBlock).append("] = a[").append(k).append("].x;\n");
                }
                localString.append("barrier(CLK_LOCAL_MEM_FENCE);\n");
                for (k = 0; k < numIter; k++) {
                    for (t = 0; t < R2; t++) {
                        localString.append("a[").append(k * R2 + t).append("].x = lMemLoad[").append(t * batchSize + k * threadsPerBlock).append("];\n");
                    }
                }
                localString.append("barrier(CLK_LOCAL_MEM_FENCE);\n");
                for (k = 0; k < R1; k++) {
                    localString.append("lMemStore[").append(k * threadsPerBlock).append("] = a[").append(k).append("].y;\n");
                }
                localString.append("barrier(CLK_LOCAL_MEM_FENCE);\n");
                for (k = 0; k < numIter; k++) {
                    for (t = 0; t < R2; t++) {
                        localString.append("a[").append(k * R2 + t).append("].y = lMemLoad[").append(t * batchSize + k * threadsPerBlock).append("];\n");
                    }
                }
                localString.append("barrier(CLK_LOCAL_MEM_FENCE);\n");

                for (j = 0; j < numIter; j++) {
                    localString.append("fftKernel").append(R2).append("(a + ").append(j * R2).append(", dir);\n");
                }
            }

            // twiddle
            if (passNum < (numPasses - 1)) {
                localString.append("l = ((bNum << ").append(lgBatchSize).append(") + i) >> ").append(lgStrideO).append(";\n");
                localString.append("k = j << ").append((int) log2(R1 / R2)).append(";\n");
                localString.append("ang1 = dir*(2.0f*M_PI/").append(N).append(")*l;\n");
                for (t = 0; t < R1; t++) {
                    localString.append("ang = ang1*(k + ").append((t % R2) * R1 + (t / R2)).append(");\n");
                    localString.append("w = (float2)(native_cos(ang), native_sin(ang));\n");
                    localString.append("a[").append(t).append("] = complexMul(a[").append(t).append("], w);\n");
                }
            }

            // Store Data
            if (strideO == 1) {

                localString.append("lMemStore = sMem + mad24(i, ").append(radix + 1).append(", j << ").append((int) log2(R1 / R2)).append(");\n");
                localString.append("lMemLoad = sMem + mad24(tid >> ").append((int) log2(radix)).append(", ").append(radix + 1).append(", tid & ").append(radix - 1).append(");\n");

                for (i = 0; i < R1 / R2; i++) {
                    for (j = 0; j < R2; j++) {
                        localString.append("lMemStore[ ").append(i + j * R1).append("] = a[").append(i * R2 + j).append("].x;\n");
                    }
                }
                localString.append("barrier(CLK_LOCAL_MEM_FENCE);\n");
                if (threadsPerBlock >= radix) {
                    for (i = 0; i < R1; i++) {
                        localString.append("a[").append(i).append("].x = lMemLoad[").append(i * (radix + 1) * (threadsPerBlock / radix)).append("];\n");
                    }
                } else {
                    int innerIter = radix / threadsPerBlock;
                    int outerIter = R1 / innerIter;
                    for (i = 0; i < outerIter; i++) {
                        for (j = 0; j < innerIter; j++) {
                            localString.append("a[").append(i * innerIter + j).append("].x = lMemLoad[").append(j * threadsPerBlock + i * (radix + 1)).append("];\n");
                        }
                    }
                }
                localString.append("barrier(CLK_LOCAL_MEM_FENCE);\n");

                for (i = 0; i < R1 / R2; i++) {
                    for (j = 0; j < R2; j++) {
                        localString.append("lMemStore[ ").append(i + j * R1).append("] = a[").append(i * R2 + j).append("].y;\n");
                    }
                }
                localString.append("barrier(CLK_LOCAL_MEM_FENCE);\n");
                if (threadsPerBlock >= radix) {
                    for (i = 0; i < R1; i++) {
                        localString.append("a[").append(i).append("].y = lMemLoad[").append(i * (radix + 1) * (threadsPerBlock / radix)).append("];\n");
                    }
                } else {
                    int innerIter = radix / threadsPerBlock;
                    int outerIter = R1 / innerIter;
                    for (i = 0; i < outerIter; i++) {
                        for (j = 0; j < innerIter; j++) {
                            localString.append("a[").append(i * innerIter + j).append("].y = lMemLoad[").append(j * threadsPerBlock + i * (radix + 1)).append("];\n");
                        }
                    }
                }
                localString.append("barrier(CLK_LOCAL_MEM_FENCE);\n");

                localString.append("indexOut += tid;\n");
                if (dataFormat == dataFormat.SplitComplexFormat) {
                    localString.append("out_real += indexOut;\n");
                    localString.append("out_imag += indexOut;\n");
                    for (k = 0; k < R1; k++) {
                        localString.append("out_real[").append(k * threadsPerBlock).append("] = a[").append(k).append("].x;\n");
                    }
                    for (k = 0; k < R1; k++) {
                        localString.append("out_imag[").append(k * threadsPerBlock).append("] = a[").append(k).append("].y;\n");
                    }
                } else {
                    localString.append("out += indexOut;\n");
                    for (k = 0; k < R1; k++) {
                        localString.append("out[").append(k * threadsPerBlock).append("] = a[").append(k).append("];\n");
                    }
                }

            } else {
                localString.append("indexOut += mad24(j, ").append(numIter * strideO).append(", i);\n");
                if (dataFormat == dataFormat.SplitComplexFormat) {
                    localString.append("out_real += indexOut;\n");
                    localString.append("out_imag += indexOut;\n");
                    for (k = 0; k < R1; k++) {
                        localString.append("out_real[").append(((k % R2) * R1 + (k / R2)) * strideO).append("] = a[").append(k).append("].x;\n");
                    }
                    for (k = 0; k < R1; k++) {
                        localString.append("out_imag[").append(((k % R2) * R1 + (k / R2)) * strideO).append("] = a[").append(k).append("].y;\n");
                    }
                } else {
                    localString.append("out += indexOut;\n");
                    for (k = 0; k < R1; k++) {
                        localString.append("out[").append(((k % R2) * R1 + (k / R2)) * strideO).append("] = a[").append(k).append("];\n");
                    }
                }
            }

            insertHeader(kernelString, kernelName, dataFormat);
            kernelString.append("{\n");
            if (kInfo.lmem_size > 0) {
                kernelString.append("    __local float sMem[").append(kInfo.lmem_size).append("];\n");
            }
            kernelString.append(localString);
            kernelString.append("}\n");

            N /= radix;
            kernel_list.add(kInfo);
            kCount++;
        }
    }

    void FFT1D(CLFFTKernelDir dir) {
        int[] radixArray = new int[10];

        switch (dir) {
            case X:
                if (this.size.x > this.max_localmem_fft_size) {
                    createGlobalFFTKernelString(this.size.x, 1, dir, 1);
                } else if (this.size.x > 1) {
                    getRadixArray(this.size.x, radixArray, 0);
                    if (this.size.x / radixArray[0] <= this.max_work_item_per_workgroup) {
                        createLocalMemfftKernelString();
                    } else {
                        getRadixArray(this.size.x, radixArray, this.max_radix);
                        if (this.size.x / radixArray[0] <= this.max_work_item_per_workgroup) {
                            createLocalMemfftKernelString();
                        } else {
                            createGlobalFFTKernelString(this.size.x, 1, dir, 1);
                        }
                    }
                }
                break;

            case Y:
                if (this.size.y > 1) {
                    createGlobalFFTKernelString(this.size.y, this.size.x, dir, 1);
                }
                break;

            case Z:
                if (this.size.z > 1) {
                    createGlobalFFTKernelString(this.size.z, this.size.x * this.size.y, dir, 1);
                }
            default:
                return;
        }
    }

    /*
     *
     * Pre-defined kernel parts
     *
     */
    static String baseKernels =
            "#ifndef M_PI\n"
            + "#define M_PI 0x1.921fb54442d18p+1\n"
            + "#endif\n"
            + "#define complexMul(a,b) ((float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)))\n"
            + "#define conj(a) ((float2)((a).x, -(a).y))\n"
            + "#define conjTransp(a) ((float2)(-(a).y, (a).x))\n"
            + "\n"
            + "#define fftKernel2(a,dir) \\\n"
            + "{ \\\n"
            + "    float2 c = (a)[0];    \\\n"
            + "    (a)[0] = c + (a)[1];  \\\n"
            + "    (a)[1] = c - (a)[1];  \\\n"
            + "}\n"
            + "\n"
            + "#define fftKernel2S(d1,d2,dir) \\\n"
            + "{ \\\n"
            + "    float2 c = (d1);   \\\n"
            + "    (d1) = c + (d2);   \\\n"
            + "    (d2) = c - (d2);   \\\n"
            + "}\n"
            + "\n"
            + "#define fftKernel4(a,dir) \\\n"
            + "{ \\\n"
            + "    fftKernel2S((a)[0], (a)[2], dir); \\\n"
            + "    fftKernel2S((a)[1], (a)[3], dir); \\\n"
            + "    fftKernel2S((a)[0], (a)[1], dir); \\\n"
            + "    (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n"
            + "    fftKernel2S((a)[2], (a)[3], dir); \\\n"
            + "    float2 c = (a)[1]; \\\n"
            + "    (a)[1] = (a)[2]; \\\n"
            + "    (a)[2] = c; \\\n"
            + "}\n"
            + "\n"
            + "#define fftKernel4s(a0,a1,a2,a3,dir) \\\n"
            + "{ \\\n"
            + "    fftKernel2S((a0), (a2), dir); \\\n"
            + "    fftKernel2S((a1), (a3), dir); \\\n"
            + "    fftKernel2S((a0), (a1), dir); \\\n"
            + "    (a3) = (float2)(dir)*(conjTransp((a3))); \\\n"
            + "    fftKernel2S((a2), (a3), dir); \\\n"
            + "    float2 c = (a1); \\\n"
            + "    (a1) = (a2); \\\n"
            + "    (a2) = c; \\\n"
            + "}\n"
            + "\n"
            + "#define bitreverse8(a) \\\n"
            + "{ \\\n"
            + "    float2 c; \\\n"
            + "    c = (a)[1]; \\\n"
            + "    (a)[1] = (a)[4]; \\\n"
            + "    (a)[4] = c; \\\n"
            + "    c = (a)[3]; \\\n"
            + "    (a)[3] = (a)[6]; \\\n"
            + "    (a)[6] = c; \\\n"
            + "}\n"
            + "\n"
            + "#define fftKernel8(a,dir) \\\n"
            + "{ \\\n"
            + "  const float2 w1  = (float2)(0x1.6a09e6p-1f,  dir*0x1.6a09e6p-1f);  \\\n"
            + "  const float2 w3  = (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f);  \\\n"
            + "  float2 c; \\\n"
            + "  fftKernel2S((a)[0], (a)[4], dir); \\\n"
            + "  fftKernel2S((a)[1], (a)[5], dir); \\\n"
            + "  fftKernel2S((a)[2], (a)[6], dir); \\\n"
            + "  fftKernel2S((a)[3], (a)[7], dir); \\\n"
            + "  (a)[5] = complexMul(w1, (a)[5]); \\\n"
            + "  (a)[6] = (float2)(dir)*(conjTransp((a)[6])); \\\n"
            + "  (a)[7] = complexMul(w3, (a)[7]); \\\n"
            + "  fftKernel2S((a)[0], (a)[2], dir); \\\n"
            + "  fftKernel2S((a)[1], (a)[3], dir); \\\n"
            + "  fftKernel2S((a)[4], (a)[6], dir); \\\n"
            + "  fftKernel2S((a)[5], (a)[7], dir); \\\n"
            + "  (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n"
            + "  (a)[7] = (float2)(dir)*(conjTransp((a)[7])); \\\n"
            + "  fftKernel2S((a)[0], (a)[1], dir); \\\n"
            + "  fftKernel2S((a)[2], (a)[3], dir); \\\n"
            + "  fftKernel2S((a)[4], (a)[5], dir); \\\n"
            + "  fftKernel2S((a)[6], (a)[7], dir); \\\n"
            + "  bitreverse8((a)); \\\n"
            + "}\n"
            + "\n"
            + "#define bitreverse4x4(a) \\\n"
            + "{ \\\n"
            + "  float2 c; \\\n"
            + "  c = (a)[1];  (a)[1]  = (a)[4];  (a)[4]  = c; \\\n"
            + "  c = (a)[2];  (a)[2]  = (a)[8];  (a)[8]  = c; \\\n"
            + "  c = (a)[3];  (a)[3]  = (a)[12]; (a)[12] = c; \\\n"
            + "  c = (a)[6];  (a)[6]  = (a)[9];  (a)[9]  = c; \\\n"
            + "  c = (a)[7];  (a)[7]  = (a)[13]; (a)[13] = c; \\\n"
            + "  c = (a)[11]; (a)[11] = (a)[14]; (a)[14] = c; \\\n"
            + "}\n"
            + "\n"
            + "#define fftKernel16(a,dir) \\\n"
            + "{ \\\n"
            + "    const float w0 = 0x1.d906bcp-1f; \\\n"
            + "    const float w1 = 0x1.87de2ap-2f; \\\n"
            + "    const float w2 = 0x1.6a09e6p-1f; \\\n"
            + "    fftKernel4s((a)[0], (a)[4], (a)[8],  (a)[12], dir); \\\n"
            + "    fftKernel4s((a)[1], (a)[5], (a)[9],  (a)[13], dir); \\\n"
            + "    fftKernel4s((a)[2], (a)[6], (a)[10], (a)[14], dir); \\\n"
            + "    fftKernel4s((a)[3], (a)[7], (a)[11], (a)[15], dir); \\\n"
            + "    (a)[5]  = complexMul((a)[5], (float2)(w0, dir*w1)); \\\n"
            + "    (a)[6]  = complexMul((a)[6], (float2)(w2, dir*w2)); \\\n"
            + "    (a)[7]  = complexMul((a)[7], (float2)(w1, dir*w0)); \\\n"
            + "    (a)[9]  = complexMul((a)[9], (float2)(w2, dir*w2)); \\\n"
            + "    (a)[10] = (float2)(dir)*(conjTransp((a)[10])); \\\n"
            + "    (a)[11] = complexMul((a)[11], (float2)(-w2, dir*w2)); \\\n"
            + "    (a)[13] = complexMul((a)[13], (float2)(w1, dir*w0)); \\\n"
            + "    (a)[14] = complexMul((a)[14], (float2)(-w2, dir*w2)); \\\n"
            + "    (a)[15] = complexMul((a)[15], (float2)(-w0, dir*-w1)); \\\n"
            + "    fftKernel4((a), dir); \\\n"
            + "    fftKernel4((a) + 4, dir); \\\n"
            + "    fftKernel4((a) + 8, dir); \\\n"
            + "    fftKernel4((a) + 12, dir); \\\n"
            + "    bitreverse4x4((a)); \\\n"
            + "}\n"
            + "\n"
            + "#define bitreverse32(a) \\\n"
            + "{ \\\n"
            + "    float2 c1, c2; \\\n"
            + "    c1 = (a)[2];   (a)[2] = (a)[1];   c2 = (a)[4];   (a)[4] = c1;   c1 = (a)[8];   (a)[8] = c2;    c2 = (a)[16];  (a)[16] = c1;   (a)[1] = c2; \\\n"
            + "    c1 = (a)[6];   (a)[6] = (a)[3];   c2 = (a)[12];  (a)[12] = c1;  c1 = (a)[24];  (a)[24] = c2;   c2 = (a)[17];  (a)[17] = c1;   (a)[3] = c2; \\\n"
            + "    c1 = (a)[10];  (a)[10] = (a)[5];  c2 = (a)[20];  (a)[20] = c1;  c1 = (a)[9];   (a)[9] = c2;    c2 = (a)[18];  (a)[18] = c1;   (a)[5] = c2; \\\n"
            + "    c1 = (a)[14];  (a)[14] = (a)[7];  c2 = (a)[28];  (a)[28] = c1;  c1 = (a)[25];  (a)[25] = c2;   c2 = (a)[19];  (a)[19] = c1;   (a)[7] = c2; \\\n"
            + "    c1 = (a)[22];  (a)[22] = (a)[11]; c2 = (a)[13];  (a)[13] = c1;  c1 = (a)[26];  (a)[26] = c2;   c2 = (a)[21];  (a)[21] = c1;   (a)[11] = c2; \\\n"
            + "    c1 = (a)[30];  (a)[30] = (a)[15]; c2 = (a)[29];  (a)[29] = c1;  c1 = (a)[27];  (a)[27] = c2;   c2 = (a)[23];  (a)[23] = c1;   (a)[15] = c2; \\\n"
            + "}\n"
            + "\n"
            + "#define fftKernel32(a,dir) \\\n"
            + "{ \\\n"
            + "    fftKernel2S((a)[0],  (a)[16], dir); \\\n"
            + "    fftKernel2S((a)[1],  (a)[17], dir); \\\n"
            + "    fftKernel2S((a)[2],  (a)[18], dir); \\\n"
            + "    fftKernel2S((a)[3],  (a)[19], dir); \\\n"
            + "    fftKernel2S((a)[4],  (a)[20], dir); \\\n"
            + "    fftKernel2S((a)[5],  (a)[21], dir); \\\n"
            + "    fftKernel2S((a)[6],  (a)[22], dir); \\\n"
            + "    fftKernel2S((a)[7],  (a)[23], dir); \\\n"
            + "    fftKernel2S((a)[8],  (a)[24], dir); \\\n"
            + "    fftKernel2S((a)[9],  (a)[25], dir); \\\n"
            + "    fftKernel2S((a)[10], (a)[26], dir); \\\n"
            + "    fftKernel2S((a)[11], (a)[27], dir); \\\n"
            + "    fftKernel2S((a)[12], (a)[28], dir); \\\n"
            + "    fftKernel2S((a)[13], (a)[29], dir); \\\n"
            + "    fftKernel2S((a)[14], (a)[30], dir); \\\n"
            + "    fftKernel2S((a)[15], (a)[31], dir); \\\n"
            + "    (a)[17] = complexMul((a)[17], (float2)(0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n"
            + "    (a)[18] = complexMul((a)[18], (float2)(0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n"
            + "    (a)[19] = complexMul((a)[19], (float2)(0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n"
            + "    (a)[20] = complexMul((a)[20], (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n"
            + "    (a)[21] = complexMul((a)[21], (float2)(0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n"
            + "    (a)[22] = complexMul((a)[22], (float2)(0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n"
            + "    (a)[23] = complexMul((a)[23], (float2)(0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n"
            + "    (a)[24] = complexMul((a)[24], (float2)(0x0p+0f, dir*0x1p+0f)); \\\n"
            + "    (a)[25] = complexMul((a)[25], (float2)(-0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n"
            + "    (a)[26] = complexMul((a)[26], (float2)(-0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n"
            + "    (a)[27] = complexMul((a)[27], (float2)(-0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n"
            + "    (a)[28] = complexMul((a)[28], (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n"
            + "    (a)[29] = complexMul((a)[29], (float2)(-0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n"
            + "    (a)[30] = complexMul((a)[30], (float2)(-0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n"
            + "    (a)[31] = complexMul((a)[31], (float2)(-0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n"
            + "    fftKernel16((a), dir); \\\n"
            + "    fftKernel16((a) + 16, dir); \\\n"
            + "    bitreverse32((a)); \\\n"
            + "}\n\n";
    static String twistKernelInterleaved =
            "__kernel void \\\n"
            + "clFFT_1DTwistInterleaved(__global float2 *in, unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n"
            + "{ \\\n"
            + "   float2 a, w; \\\n"
            + "   float ang; \\\n"
            + "   unsigned int j; \\\n"
            + "  unsigned int i = get_global_id(0); \\\n"
            + "  unsigned int startIndex = i; \\\n"
            + "   \\\n"
            + "  if(i < numCols) \\\n"
            + "  { \\\n"
            + "      for(j = 0; j < numRowsToProcess; j++) \\\n"
            + "      { \\\n"
            + "          a = in[startIndex]; \\\n"
            + "          ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n"
            + "          w = (float2)(native_cos(ang), native_sin(ang)); \\\n"
            + "          a = complexMul(a, w); \\\n"
            + "          in[startIndex] = a; \\\n"
            + "          startIndex += numCols; \\\n"
            + "      } \\\n"
            + "  }   \\\n"
            + "} \\\n";
    static String twistKernelPlannar =
            "__kernel void \\\n"
            + "clFFT_1DTwistSplit(__global float *in_real, __global float *in_imag , unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n"
            + "{ \\\n"
            + "    float2 a, w; \\\n"
            + "    float ang; \\\n"
            + "    unsigned int j; \\\n"
            + "  unsigned int i = get_global_id(0); \\\n"
            + "  unsigned int startIndex = i; \\\n"
            + "   \\\n"
            + "  if(i < numCols) \\\n"
            + "  { \\\n"
            + "      for(j = 0; j < numRowsToProcess; j++) \\\n"
            + "      { \\\n"
            + "          a = (float2)(in_real[startIndex], in_imag[startIndex]); \\\n"
            + "          ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n"
            + "          w = (float2)(native_cos(ang), native_sin(ang)); \\\n"
            + "          a = complexMul(a, w); \\\n"
            + "          in_real[startIndex] = a.x; \\\n"
            + "          in_imag[startIndex] = a.y; \\\n"
            + "          startIndex += numCols; \\\n"
            + "      } \\\n"
            + "  }   \\\n"
            + "} \\\n";
}
TOP

Related Classes of com.jogamp.opencl.demos.fft.CLFFTPlan

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.