Home | History | Annotate | Download | only in opencl
      1 // This file is part of OpenCV project.
      2 // It is subject to the license terms in the LICENSE file found in the top-level directory
      3 // of this distribution and at http://opencv.org/license.html.
      4 
      5 // Copyright (C) 2014, Itseez, Inc., all rights reserved.
      6 // Third party copyrights are property of their respective owners.
      7 
      8 #define SQRT_2 0.707106781188f
      9 #define sin_120 0.866025403784f
     10 #define fft5_2  0.559016994374f
     11 #define fft5_3 -0.951056516295f
     12 #define fft5_4 -1.538841768587f
     13 #define fft5_5  0.363271264002f
     14 
     15 #ifdef DOUBLE_SUPPORT
     16 #ifdef cl_amd_fp64
     17 #pragma OPENCL EXTENSION cl_amd_fp64:enable
     18 #elif defined (cl_khr_fp64)
     19 #pragma OPENCL EXTENSION cl_khr_fp64:enable
     20 #endif
     21 #endif
     22 
     23 __attribute__((always_inline))
     24 CT mul_complex(CT a, CT b) {
     25     return (CT)(fma(a.x, b.x, -a.y * b.y), fma(a.x, b.y, a.y * b.x));
     26 }
     27 
     28 __attribute__((always_inline))
     29 CT twiddle(CT a) {
     30     return (CT)(a.y, -a.x);
     31 }
     32 
     33 __attribute__((always_inline))
     34 void butterfly2(CT a0, CT a1, __local CT* smem, __global const CT* twiddles,
     35                 const int x, const int block_size)
     36 {
     37     const int k = x & (block_size - 1);
     38     a1 = mul_complex(twiddles[k], a1);
     39     const int dst_ind = (x << 1) - k;
     40 
     41     smem[dst_ind] = a0 + a1;
     42     smem[dst_ind+block_size] = a0 - a1;
     43 }
     44 
     45 __attribute__((always_inline))
     46 void butterfly4(CT a0, CT a1, CT a2, CT a3, __local CT* smem, __global const CT* twiddles,
     47                 const int x, const int block_size)
     48 {
     49     const int k = x & (block_size - 1);
     50     a1 = mul_complex(twiddles[k], a1);
     51     a2 = mul_complex(twiddles[k + block_size], a2);
     52     a3 = mul_complex(twiddles[k + 2*block_size], a3);
     53 
     54     const int dst_ind = ((x - k) << 2) + k;
     55 
     56     CT b0 = a0 + a2;
     57     a2 = a0 - a2;
     58     CT b1 = a1 + a3;
     59     a3 = twiddle(a1 - a3);
     60 
     61     smem[dst_ind]                = b0 + b1;
     62     smem[dst_ind + block_size]   = a2 + a3;
     63     smem[dst_ind + 2*block_size] = b0 - b1;
     64     smem[dst_ind + 3*block_size] = a2 - a3;
     65 }
     66 
     67 __attribute__((always_inline))
     68 void butterfly3(CT a0, CT a1, CT a2, __local CT* smem, __global const CT* twiddles,
     69                 const int x, const int block_size)
     70 {
     71     const int k = x % block_size;
     72     a1 = mul_complex(twiddles[k], a1);
     73     a2 = mul_complex(twiddles[k+block_size], a2);
     74     const int dst_ind = ((x - k) * 3) + k;
     75 
     76     CT b1 = a1 + a2;
     77     a2 = twiddle(sin_120*(a1 - a2));
     78     CT b0 = a0 - (CT)(0.5f)*b1;
     79 
     80     smem[dst_ind] = a0 + b1;
     81     smem[dst_ind + block_size] = b0 + a2;
     82     smem[dst_ind + 2*block_size] = b0 - a2;
     83 }
     84 
     85 __attribute__((always_inline))
     86 void butterfly5(CT a0, CT a1, CT a2, CT a3, CT a4, __local CT* smem, __global const CT* twiddles,
     87                 const int x, const int block_size)
     88 {
     89     const int k = x % block_size;
     90     a1 = mul_complex(twiddles[k], a1);
     91     a2 = mul_complex(twiddles[k + block_size], a2);
     92     a3 = mul_complex(twiddles[k+2*block_size], a3);
     93     a4 = mul_complex(twiddles[k+3*block_size], a4);
     94 
     95     const int dst_ind = ((x - k) * 5) + k;
     96     __local CT* dst = smem + dst_ind;
     97 
     98     CT b0, b1, b5;
     99 
    100     b1 = a1 + a4;
    101     a1 -= a4;
    102 
    103     a4 = a3 + a2;
    104     a3 -= a2;
    105 
    106     a2 = b1 + a4;
    107     b0 = a0 - (CT)0.25f * a2;
    108 
    109     b1 = fft5_2 * (b1 - a4);
    110     a4 = fft5_3 * (CT)(-a1.y - a3.y, a1.x + a3.x);
    111     b5 = (CT)(a4.x - fft5_5 * a1.y, a4.y + fft5_5 * a1.x);
    112 
    113     a4.x += fft5_4 * a3.y;
    114     a4.y -= fft5_4 * a3.x;
    115 
    116     a1 = b0 + b1;
    117     b0 -= b1;
    118 
    119     dst[0] = a0 + a2;
    120     dst[block_size] = a1 + a4;
    121     dst[2 * block_size] = b0 + b5;
    122     dst[3 * block_size] = b0 - b5;
    123     dst[4 * block_size] = a1 - a4;
    124 }
    125 
    126 __attribute__((always_inline))
    127 void fft_radix2(__local CT* smem, __global const CT* twiddles, const int x, const int block_size, const int t)
    128 {
    129     CT a0, a1;
    130 
    131     if (x < t)
    132     {
    133         a0 = smem[x];
    134         a1 = smem[x+t];
    135     }
    136 
    137     barrier(CLK_LOCAL_MEM_FENCE);
    138 
    139     if (x < t)
    140         butterfly2(a0, a1, smem, twiddles, x, block_size);
    141 
    142     barrier(CLK_LOCAL_MEM_FENCE);
    143 }
    144 
    145 __attribute__((always_inline))
    146 void fft_radix2_B2(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)
    147 {
    148     const int x2 = x1 + t/2;
    149     CT a0, a1, a2, a3;
    150 
    151     if (x1 < t/2)
    152     {
    153         a0 = smem[x1]; a1 = smem[x1+t];
    154         a2 = smem[x2]; a3 = smem[x2+t];
    155     }
    156 
    157     barrier(CLK_LOCAL_MEM_FENCE);
    158 
    159     if (x1 < t/2)
    160     {
    161         butterfly2(a0, a1, smem, twiddles, x1, block_size);
    162         butterfly2(a2, a3, smem, twiddles, x2, block_size);
    163     }
    164 
    165     barrier(CLK_LOCAL_MEM_FENCE);
    166 }
    167 
    168 __attribute__((always_inline))
    169 void fft_radix2_B3(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)
    170 {
    171     const int x2 = x1 + t/3;
    172     const int x3 = x1 + 2*t/3;
    173     CT a0, a1, a2, a3, a4, a5;
    174 
    175     if (x1 < t/3)
    176     {
    177         a0 = smem[x1]; a1 = smem[x1+t];
    178         a2 = smem[x2]; a3 = smem[x2+t];
    179         a4 = smem[x3]; a5 = smem[x3+t];
    180     }
    181 
    182     barrier(CLK_LOCAL_MEM_FENCE);
    183 
    184     if (x1 < t/3)
    185     {
    186         butterfly2(a0, a1, smem, twiddles, x1, block_size);
    187         butterfly2(a2, a3, smem, twiddles, x2, block_size);
    188         butterfly2(a4, a5, smem, twiddles, x3, block_size);
    189     }
    190 
    191     barrier(CLK_LOCAL_MEM_FENCE);
    192 }
    193 
    194 __attribute__((always_inline))
    195 void fft_radix2_B4(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)
    196 {
    197     const int thread_block = t/4;
    198     const int x2 = x1 + thread_block;
    199     const int x3 = x1 + 2*thread_block;
    200     const int x4 = x1 + 3*thread_block;
    201     CT a0, a1, a2, a3, a4, a5, a6, a7;
    202 
    203     if (x1 < t/4)
    204     {
    205         a0 = smem[x1]; a1 = smem[x1+t];
    206         a2 = smem[x2]; a3 = smem[x2+t];
    207         a4 = smem[x3]; a5 = smem[x3+t];
    208         a6 = smem[x4]; a7 = smem[x4+t];
    209     }
    210 
    211     barrier(CLK_LOCAL_MEM_FENCE);
    212 
    213     if (x1 < t/4)
    214     {
    215         butterfly2(a0, a1, smem, twiddles, x1, block_size);
    216         butterfly2(a2, a3, smem, twiddles, x2, block_size);
    217         butterfly2(a4, a5, smem, twiddles, x3, block_size);
    218         butterfly2(a6, a7, smem, twiddles, x4, block_size);
    219     }
    220 
    221     barrier(CLK_LOCAL_MEM_FENCE);
    222 }
    223 
    224 __attribute__((always_inline))
    225 void fft_radix2_B5(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)
    226 {
    227     const int thread_block = t/5;
    228     const int x2 = x1 + thread_block;
    229     const int x3 = x1 + 2*thread_block;
    230     const int x4 = x1 + 3*thread_block;
    231     const int x5 = x1 + 4*thread_block;
    232     CT a0, a1, a2, a3, a4, a5, a6, a7, a8, a9;
    233 
    234     if (x1 < t/5)
    235     {
    236         a0 = smem[x1]; a1 = smem[x1+t];
    237         a2 = smem[x2]; a3 = smem[x2+t];
    238         a4 = smem[x3]; a5 = smem[x3+t];
    239         a6 = smem[x4]; a7 = smem[x4+t];
    240         a8 = smem[x5]; a9 = smem[x5+t];
    241     }
    242 
    243     barrier(CLK_LOCAL_MEM_FENCE);
    244 
    245     if (x1 < t/5)
    246     {
    247         butterfly2(a0, a1, smem, twiddles, x1, block_size);
    248         butterfly2(a2, a3, smem, twiddles, x2, block_size);
    249         butterfly2(a4, a5, smem, twiddles, x3, block_size);
    250         butterfly2(a6, a7, smem, twiddles, x4, block_size);
    251         butterfly2(a8, a9, smem, twiddles, x5, block_size);
    252     }
    253 
    254     barrier(CLK_LOCAL_MEM_FENCE);
    255 }
    256 
    257 __attribute__((always_inline))
    258 void fft_radix4(__local CT* smem, __global const CT* twiddles, const int x, const int block_size, const int t)
    259 {
    260     CT a0, a1, a2, a3;
    261 
    262     if (x < t)
    263     {
    264         a0 = smem[x]; a1 = smem[x+t]; a2 = smem[x+2*t]; a3 = smem[x+3*t];
    265     }
    266 
    267     barrier(CLK_LOCAL_MEM_FENCE);
    268 
    269     if (x < t)
    270         butterfly4(a0, a1, a2, a3, smem, twiddles, x, block_size);
    271 
    272     barrier(CLK_LOCAL_MEM_FENCE);
    273 }
    274 
    275 __attribute__((always_inline))
    276 void fft_radix4_B2(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)
    277 {
    278     const int x2 = x1 + t/2;
    279     CT a0, a1, a2, a3, a4, a5, a6, a7;
    280 
    281     if (x1 < t/2)
    282     {
    283         a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t];
    284         a4 = smem[x2]; a5 = smem[x2+t]; a6 = smem[x2+2*t]; a7 = smem[x2+3*t];
    285     }
    286 
    287     barrier(CLK_LOCAL_MEM_FENCE);
    288 
    289     if (x1 < t/2)
    290     {
    291         butterfly4(a0, a1, a2, a3, smem, twiddles, x1, block_size);
    292         butterfly4(a4, a5, a6, a7, smem, twiddles, x2, block_size);
    293     }
    294 
    295     barrier(CLK_LOCAL_MEM_FENCE);
    296 }
    297 
    298 __attribute__((always_inline))
    299 void fft_radix4_B3(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)
    300 {
    301     const int x2 = x1 + t/3;
    302     const int x3 = x2 + t/3;
    303     CT a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11;
    304 
    305     if (x1 < t/3)
    306     {
    307         a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t];
    308         a4 = smem[x2]; a5 = smem[x2+t]; a6 = smem[x2+2*t]; a7 = smem[x2+3*t];
    309         a8 = smem[x3]; a9 = smem[x3+t]; a10 = smem[x3+2*t]; a11 = smem[x3+3*t];
    310     }
    311 
    312     barrier(CLK_LOCAL_MEM_FENCE);
    313 
    314     if (x1 < t/3)
    315     {
    316         butterfly4(a0, a1, a2, a3, smem, twiddles, x1, block_size);
    317         butterfly4(a4, a5, a6, a7, smem, twiddles, x2, block_size);
    318         butterfly4(a8, a9, a10, a11, smem, twiddles, x3, block_size);
    319     }
    320 
    321     barrier(CLK_LOCAL_MEM_FENCE);
    322 }
    323 
    324 __attribute__((always_inline))
    325 void fft_radix8(__local CT* smem, __global const CT* twiddles, const int x, const int block_size, const int t)
    326 {
    327     const int k = x % block_size;
    328     CT a0, a1, a2, a3, a4, a5, a6, a7;
    329 
    330     if (x < t)
    331     {
    332         int tw_ind = block_size / 8;
    333 
    334         a0 = smem[x];
    335         a1 = mul_complex(twiddles[k], smem[x + t]);
    336         a2 = mul_complex(twiddles[k + block_size],smem[x+2*t]);
    337         a3 = mul_complex(twiddles[k+2*block_size],smem[x+3*t]);
    338         a4 = mul_complex(twiddles[k+3*block_size],smem[x+4*t]);
    339         a5 = mul_complex(twiddles[k+4*block_size],smem[x+5*t]);
    340         a6 = mul_complex(twiddles[k+5*block_size],smem[x+6*t]);
    341         a7 = mul_complex(twiddles[k+6*block_size],smem[x+7*t]);
    342 
    343         CT b0, b1, b6, b7;
    344 
    345         b0 = a0 + a4;
    346         a4 = a0 - a4;
    347         b1 = a1 + a5;
    348         a5 = a1 - a5;
    349         a5 = (CT)(SQRT_2) * (CT)(a5.x + a5.y, -a5.x + a5.y);
    350         b6 = twiddle(a2 - a6);
    351         a2 = a2 + a6;
    352         b7 = a3 - a7;
    353         b7 = (CT)(SQRT_2) * (CT)(-b7.x + b7.y, -b7.x - b7.y);
    354         a3 = a3 + a7;
    355 
    356         a0 = b0 + a2;
    357         a2 = b0 - a2;
    358         a1 = b1 + a3;
    359         a3 = twiddle(b1 - a3);
    360         a6 = a4 - b6;
    361         a4 = a4 + b6;
    362         a7 = twiddle(a5 - b7);
    363         a5 = a5 + b7;
    364 
    365     }
    366 
    367     barrier(CLK_LOCAL_MEM_FENCE);
    368 
    369     if (x < t)
    370     {
    371         const int dst_ind = ((x - k) << 3) + k;
    372         __local CT* dst = smem + dst_ind;
    373 
    374         dst[0] = a0 + a1;
    375         dst[block_size] = a4 + a5;
    376         dst[2 * block_size] = a2 + a3;
    377         dst[3 * block_size] = a6 + a7;
    378         dst[4 * block_size] = a0 - a1;
    379         dst[5 * block_size] = a4 - a5;
    380         dst[6 * block_size] = a2 - a3;
    381         dst[7 * block_size] = a6 - a7;
    382     }
    383 
    384     barrier(CLK_LOCAL_MEM_FENCE);
    385 }
    386 
    387 __attribute__((always_inline))
    388 void fft_radix3(__local CT* smem, __global const CT* twiddles, const int x, const int block_size, const int t)
    389 {
    390     CT a0, a1, a2;
    391 
    392     if (x < t)
    393     {
    394         a0 = smem[x]; a1 = smem[x+t]; a2 = smem[x+2*t];
    395     }
    396 
    397     barrier(CLK_LOCAL_MEM_FENCE);
    398 
    399     if (x < t)
    400         butterfly3(a0, a1, a2, smem, twiddles, x, block_size);
    401 
    402     barrier(CLK_LOCAL_MEM_FENCE);
    403 }
    404 
    405 __attribute__((always_inline))
    406 void fft_radix3_B2(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)
    407 {
    408     const int x2 = x1 + t/2;
    409     CT a0, a1, a2, a3, a4, a5;
    410 
    411     if (x1 < t/2)
    412     {
    413         a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t];
    414         a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t];
    415     }
    416 
    417     barrier(CLK_LOCAL_MEM_FENCE);
    418 
    419     if (x1 < t/2)
    420     {
    421         butterfly3(a0, a1, a2, smem, twiddles, x1, block_size);
    422         butterfly3(a3, a4, a5, smem, twiddles, x2, block_size);
    423     }
    424 
    425     barrier(CLK_LOCAL_MEM_FENCE);
    426 }
    427 
    428 __attribute__((always_inline))
    429 void fft_radix3_B3(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)
    430 {
    431     const int x2 = x1 + t/3;
    432     const int x3 = x2 + t/3;
    433     CT a0, a1, a2, a3, a4, a5, a6, a7, a8;
    434 
    435     if (x1 < t/3)
    436     {
    437         a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t];
    438         a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t];
    439         a6 = smem[x3]; a7 = smem[x3+t]; a8 = smem[x3+2*t];
    440     }
    441 
    442     barrier(CLK_LOCAL_MEM_FENCE);
    443 
    444     if (x1 < t/3)
    445     {
    446         butterfly3(a0, a1, a2, smem, twiddles, x1, block_size);
    447         butterfly3(a3, a4, a5, smem, twiddles, x2, block_size);
    448         butterfly3(a6, a7, a8, smem, twiddles, x3, block_size);
    449     }
    450 
    451     barrier(CLK_LOCAL_MEM_FENCE);
    452 }
    453 
    454 __attribute__((always_inline))
    455 void fft_radix3_B4(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)
    456 {
    457     const int thread_block = t/4;
    458     const int x2 = x1 + thread_block;
    459     const int x3 = x1 + 2*thread_block;
    460     const int x4 = x1 + 3*thread_block;
    461     CT a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11;
    462 
    463     if (x1 < t/4)
    464     {
    465         a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t];
    466         a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t];
    467         a6 = smem[x3]; a7 = smem[x3+t]; a8 = smem[x3+2*t];
    468         a9 = smem[x4]; a10 = smem[x4+t]; a11 = smem[x4+2*t];
    469     }
    470 
    471     barrier(CLK_LOCAL_MEM_FENCE);
    472 
    473     if (x1 < t/4)
    474     {
    475         butterfly3(a0, a1, a2, smem, twiddles, x1, block_size);
    476         butterfly3(a3, a4, a5, smem, twiddles, x2, block_size);
    477         butterfly3(a6, a7, a8, smem, twiddles, x3, block_size);
    478         butterfly3(a9, a10, a11, smem, twiddles, x4, block_size);
    479     }
    480 
    481     barrier(CLK_LOCAL_MEM_FENCE);
    482 }
    483 
    484 __attribute__((always_inline))
    485 void fft_radix5(__local CT* smem, __global const CT* twiddles, const int x, const int block_size, const int t)
    486 {
    487     const int k = x % block_size;
    488     CT a0, a1, a2, a3, a4;
    489 
    490     if (x < t)
    491     {
    492         a0 = smem[x]; a1 = smem[x + t]; a2 = smem[x+2*t]; a3 = smem[x+3*t]; a4 = smem[x+4*t];
    493     }
    494 
    495     barrier(CLK_LOCAL_MEM_FENCE);
    496 
    497     if (x < t)
    498         butterfly5(a0, a1, a2, a3, a4, smem, twiddles, x, block_size);
    499 
    500     barrier(CLK_LOCAL_MEM_FENCE);
    501 }
    502 
    503 __attribute__((always_inline))
    504 void fft_radix5_B2(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)
    505 {
    506     const int x2 = x1+t/2;
    507     CT a0, a1, a2, a3, a4, a5, a6, a7, a8, a9;
    508 
    509     if (x1 < t/2)
    510     {
    511         a0 = smem[x1]; a1 = smem[x1 + t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t]; a4 = smem[x1+4*t];
    512         a5 = smem[x2]; a6 = smem[x2 + t]; a7 = smem[x2+2*t]; a8 = smem[x2+3*t]; a9 = smem[x2+4*t];
    513     }
    514 
    515     barrier(CLK_LOCAL_MEM_FENCE);
    516 
    517     if (x1 < t/2)
    518     {
    519         butterfly5(a0, a1, a2, a3, a4, smem, twiddles, x1, block_size);
    520         butterfly5(a5, a6, a7, a8, a9, smem, twiddles, x2, block_size);
    521     }
    522 
    523     barrier(CLK_LOCAL_MEM_FENCE);
    524 }
    525 
    526 #ifdef DFT_SCALE
    527 #define SCALE_VAL(x, scale) x*scale
    528 #else
    529 #define SCALE_VAL(x, scale) x
    530 #endif
    531 
    532 __kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,
    533                                    __global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
    534                                    __global CT* twiddles_ptr, int twiddles_step, int twiddles_offset, const int t, const int nz)
    535 {
    536     const int x = get_global_id(0);
    537     const int y = get_group_id(1);
    538     const int block_size = LOCAL_SIZE/kercn;
    539     if (y < nz)
    540     {
    541         __local CT smem[LOCAL_SIZE];
    542         __global const CT* twiddles = (__global const CT*)(twiddles_ptr + twiddles_offset);
    543         const int ind = x;
    544 #ifdef IS_1D
    545         FT scale = (FT) 1/dst_cols;
    546 #else
    547         FT scale = (FT) 1/(dst_cols*dst_rows);
    548 #endif
    549 
    550 #ifdef COMPLEX_INPUT
    551         __global const CT* src = (__global const CT*)(src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(CT)), src_offset)));
    552         #pragma unroll
    553         for (int i=0; i<kercn; i++)
    554             smem[x+i*block_size] = src[i*block_size];
    555 #else
    556         __global const FT* src = (__global const FT*)(src_ptr + mad24(y, src_step, mad24(x, (int)sizeof(FT), src_offset)));
    557         #pragma unroll
    558         for (int i=0; i<kercn; i++)
    559             smem[x+i*block_size] = (CT)(src[i*block_size], 0.f);
    560 #endif
    561         barrier(CLK_LOCAL_MEM_FENCE);
    562 
    563         RADIX_PROCESS;
    564 
    565 #ifdef COMPLEX_OUTPUT
    566 #ifdef NO_CONJUGATE
    567         // copy result without complex conjugate
    568         const int cols = dst_cols/2 + 1;
    569 #else
    570         const int cols = dst_cols;
    571 #endif
    572 
    573         __global CT* dst = (__global CT*)(dst_ptr + mad24(y, dst_step, dst_offset));
    574         #pragma unroll
    575         for (int i=x; i<cols; i+=block_size)
    576             dst[i] = SCALE_VAL(smem[i], scale);
    577 #ifdef REAL_INPUT
    578 #ifdef COMPLEX_OUTPUT
    579 #ifdef IS_1D
    580         for(int i=x+1; i < (dst_cols+1)/2; i+=block_size)
    581         {
    582             dst[dst_cols-i] = (CT)(SCALE_VAL(smem[i].x, scale), SCALE_VAL(-smem[i].y, scale));
    583         }
    584 #endif
    585 #endif
    586 #endif
    587 #else
    588         // pack row to CCS
    589         __local FT* smem_1cn = (__local FT*) smem;
    590         __global FT* dst = (__global FT*)(dst_ptr + mad24(y, dst_step, dst_offset));
    591         for (int i=x; i<dst_cols-1; i+=block_size)
    592             dst[i+1] = SCALE_VAL(smem_1cn[i+2], scale);
    593         if (x == 0)
    594             dst[0] = SCALE_VAL(smem_1cn[0], scale);
    595 #endif
    596     }
    597     else
    598     {
    599         // fill with zero other rows
    600 #ifdef COMPLEX_OUTPUT
    601         __global CT* dst = (__global CT*)(dst_ptr + mad24(y, dst_step, dst_offset));
    602 #else
    603         __global FT* dst = (__global FT*)(dst_ptr + mad24(y, dst_step, dst_offset));
    604 #endif
    605         #pragma unroll
    606         for (int i=x; i<dst_cols; i+=block_size)
    607             dst[i] = 0.f;
    608     }
    609 }
    610 
    611 __kernel void fft_multi_radix_cols(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,
    612                                    __global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
    613                                    __global CT* twiddles_ptr, int twiddles_step, int twiddles_offset, const int t, const int nz)
    614 {
    615     const int x = get_group_id(0);
    616     const int y = get_global_id(1);
    617 
    618     if (x < nz)
    619     {
    620         __local CT smem[LOCAL_SIZE];
    621         __global const uchar* src = src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(CT)), src_offset));
    622         __global const CT* twiddles = (__global const CT*)(twiddles_ptr + twiddles_offset);
    623         const int ind = y;
    624         const int block_size = LOCAL_SIZE/kercn;
    625         FT scale = 1.f/(dst_rows*dst_cols);
    626 
    627         #pragma unroll
    628         for (int i=0; i<kercn; i++)
    629             smem[y+i*block_size] = *((__global const CT*)(src + i*block_size*src_step));
    630 
    631         barrier(CLK_LOCAL_MEM_FENCE);
    632 
    633         RADIX_PROCESS;
    634 
    635 #ifdef COMPLEX_OUTPUT
    636         __global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(CT)), dst_offset));
    637         #pragma unroll
    638         for (int i=0; i<kercn; i++)
    639             *((__global CT*)(dst + i*block_size*dst_step)) = SCALE_VAL(smem[y + i*block_size], scale);
    640 #else
    641         if (x == 0)
    642         {
    643             // pack first column to CCS
    644             __local FT* smem_1cn = (__local FT*) smem;
    645             __global uchar* dst = dst_ptr + mad24(y+1, dst_step, dst_offset);
    646             for (int i=y; i<dst_rows-1; i+=block_size, dst+=dst_step*block_size)
    647                 *((__global FT*) dst) = SCALE_VAL(smem_1cn[i+2], scale);
    648             if (y == 0)
    649                 *((__global FT*) (dst_ptr + dst_offset)) = SCALE_VAL(smem_1cn[0], scale);
    650         }
    651         else if (x == (dst_cols+1)/2)
    652         {
    653             // pack last column to CCS (if needed)
    654             __local FT* smem_1cn = (__local FT*) smem;
    655             __global uchar* dst = dst_ptr + mad24(dst_cols-1, (int)sizeof(FT), mad24(y+1, dst_step, dst_offset));
    656             for (int i=y; i<dst_rows-1; i+=block_size, dst+=dst_step*block_size)
    657                 *((__global FT*) dst) = SCALE_VAL(smem_1cn[i+2], scale);
    658             if (y == 0)
    659                 *((__global FT*) (dst_ptr + mad24(dst_cols-1, (int)sizeof(FT), dst_offset))) = SCALE_VAL(smem_1cn[0], scale);
    660         }
    661         else
    662         {
    663             __global uchar* dst = dst_ptr + mad24(x, (int)sizeof(FT)*2, mad24(y, dst_step, dst_offset - (int)sizeof(FT)));
    664             #pragma unroll
    665             for (int i=y; i<dst_rows; i+=block_size, dst+=block_size*dst_step)
    666                 vstore2(SCALE_VAL(smem[i], scale), 0, (__global FT*) dst);
    667         }
    668 #endif
    669     }
    670 }
    671 
    672 __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,
    673                                     __global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
    674                                     __global CT* twiddles_ptr, int twiddles_step, int twiddles_offset, const int t, const int nz)
    675 {
    676     const int x = get_global_id(0);
    677     const int y = get_group_id(1);
    678     const int block_size = LOCAL_SIZE/kercn;
    679 #ifdef IS_1D
    680     const FT scale = (FT) 1/dst_cols;
    681 #else
    682     const FT scale = (FT) 1/(dst_cols*dst_rows);
    683 #endif
    684 
    685     if (y < nz)
    686     {
    687         __local CT smem[LOCAL_SIZE];
    688         __global const CT* twiddles = (__global const CT*)(twiddles_ptr + twiddles_offset);
    689         const int ind = x;
    690 
    691 #if defined(COMPLEX_INPUT) && !defined(NO_CONJUGATE)
    692         __global const CT* src = (__global const CT*)(src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(CT)), src_offset)));
    693         #pragma unroll
    694         for (int i=0; i<kercn; i++)
    695         {
    696             smem[x+i*block_size].x =  src[i*block_size].x;
    697             smem[x+i*block_size].y = -src[i*block_size].y;
    698         }
    699 #else
    700 
    701     #if !defined(REAL_INPUT) && defined(NO_CONJUGATE)
    702         __global const CT* src = (__global const CT*)(src_ptr + mad24(y, src_step, mad24(2, (int)sizeof(FT), src_offset)));
    703 
    704         #pragma unroll
    705         for (int i=x; i<(LOCAL_SIZE-1)/2; i+=block_size)
    706         {
    707             smem[i+1].x = src[i].x;
    708             smem[i+1].y = -src[i].y;
    709             smem[LOCAL_SIZE-i-1] = src[i];
    710         }
    711     #else
    712 
    713         #pragma unroll
    714         for (int i=x; i<(LOCAL_SIZE-1)/2; i+=block_size)
    715         {
    716             CT src = vload2(0, (__global const FT*)(src_ptr + mad24(y, src_step, mad24(2*i+1, (int)sizeof(FT), src_offset))));
    717 
    718             smem[i+1].x = src.x;
    719             smem[i+1].y = -src.y;
    720             smem[LOCAL_SIZE-i-1] = src;
    721         }
    722 
    723     #endif
    724 
    725         if (x==0)
    726         {
    727             smem[0].x = *(__global const FT*)(src_ptr + mad24(y, src_step, src_offset));
    728             smem[0].y = 0.f;
    729 
    730             if(LOCAL_SIZE % 2 ==0)
    731             {
    732                 #if !defined(REAL_INPUT) && defined(NO_CONJUGATE)
    733                 smem[LOCAL_SIZE/2].x = src[LOCAL_SIZE/2-1].x;
    734                 #else
    735                 smem[LOCAL_SIZE/2].x = *(__global const FT*)(src_ptr + mad24(y, src_step, mad24(LOCAL_SIZE-1, (int)sizeof(FT), src_offset)));
    736                 #endif
    737                 smem[LOCAL_SIZE/2].y = 0.f;
    738             }
    739         }
    740 #endif
    741 
    742         barrier(CLK_LOCAL_MEM_FENCE);
    743 
    744         RADIX_PROCESS;
    745 
    746         // copy data to dst
    747 #ifdef COMPLEX_OUTPUT
    748         __global CT* dst = (__global CT*)(dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(CT)), dst_offset)));
    749         #pragma unroll
    750         for (int i=0; i<kercn; i++)
    751         {
    752             dst[i*block_size].x = SCALE_VAL(smem[x + i*block_size].x, scale);
    753             dst[i*block_size].y = SCALE_VAL(-smem[x + i*block_size].y, scale);
    754         }
    755 #else
    756         __global FT* dst = (__global FT*)(dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(FT)), dst_offset)));
    757         #pragma unroll
    758         for (int i=0; i<kercn; i++)
    759         {
    760             dst[i*block_size] = SCALE_VAL(smem[x + i*block_size].x, scale);
    761         }
    762 #endif
    763     }
    764     else
    765     {
    766         // fill with zero other rows
    767 #ifdef COMPLEX_OUTPUT
    768         __global CT* dst = (__global CT*)(dst_ptr + mad24(y, dst_step, dst_offset));
    769 #else
    770         __global FT* dst = (__global FT*)(dst_ptr + mad24(y, dst_step, dst_offset));
    771 #endif
    772         #pragma unroll
    773         for (int i=x; i<dst_cols; i+=block_size)
    774             dst[i] = 0.f;
    775     }
    776 }
    777 
    778 __kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,
    779                               __global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
    780                               __global CT* twiddles_ptr, int twiddles_step, int twiddles_offset, const int t, const int nz)
    781 {
    782     const int x = get_group_id(0);
    783     const int y = get_global_id(1);
    784 
    785 #ifdef COMPLEX_INPUT
    786     if (x < nz)
    787     {
    788         __local CT smem[LOCAL_SIZE];
    789         __global const uchar* src = src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(CT)), src_offset));
    790         __global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(CT)), dst_offset));
    791         __global const CT* twiddles = (__global const CT*)(twiddles_ptr + twiddles_offset);
    792         const int ind = y;
    793         const int block_size = LOCAL_SIZE/kercn;
    794 
    795         #pragma unroll
    796         for (int i=0; i<kercn; i++)
    797         {
    798             CT temp = *((__global const CT*)(src + i*block_size*src_step));
    799             smem[y+i*block_size].x =  temp.x;
    800             smem[y+i*block_size].y =  -temp.y;
    801         }
    802 
    803         barrier(CLK_LOCAL_MEM_FENCE);
    804 
    805         RADIX_PROCESS;
    806 
    807         // copy data to dst
    808         #pragma unroll
    809         for (int i=0; i<kercn; i++)
    810         {
    811            __global CT* res = (__global CT*)(dst + i*block_size*dst_step);
    812             res[0].x = smem[y + i*block_size].x;
    813             res[0].y = -smem[y + i*block_size].y;
    814         }
    815     }
    816 #else
    817     if (x < nz)
    818     {
    819         __global const CT* twiddles = (__global const CT*)(twiddles_ptr + twiddles_offset);
    820         const int ind = y;
    821         const int block_size = LOCAL_SIZE/kercn;
    822 
    823         __local CT smem[LOCAL_SIZE];
    824 #ifdef EVEN
    825         if (x!=0 && (x!=(nz-1)))
    826 #else
    827         if (x!=0)
    828 #endif
    829         {
    830             __global const uchar* src = src_ptr + mad24(y, src_step, mad24(2*x-1, (int)sizeof(FT), src_offset));
    831             #pragma unroll
    832             for (int i=0; i<kercn; i++)
    833             {
    834                 CT temp = vload2(0, (__global const FT*)(src + i*block_size*src_step));
    835                 smem[y+i*block_size].x = temp.x;
    836                 smem[y+i*block_size].y = -temp.y;
    837             }
    838         }
    839         else
    840         {
    841             int ind = x==0 ? 0: 2*x-1;
    842             __global const FT* src = (__global const FT*)(src_ptr + mad24(1, src_step, mad24(ind, (int)sizeof(FT), src_offset)));
    843             int step = src_step/(int)sizeof(FT);
    844 
    845             #pragma unroll
    846             for (int i=y; i<(LOCAL_SIZE-1)/2; i+=block_size)
    847             {
    848                 smem[i+1].x = src[2*i*step];
    849                 smem[i+1].y = -src[(2*i+1)*step];
    850 
    851                 smem[LOCAL_SIZE-i-1].x = src[2*i*step];;
    852                 smem[LOCAL_SIZE-i-1].y = src[(2*i+1)*step];
    853             }
    854             if (y==0)
    855             {
    856                 smem[0].x = *(__global const FT*)(src_ptr + mad24(ind, (int)sizeof(FT), src_offset));
    857                 smem[0].y = 0.f;
    858 
    859                 if(LOCAL_SIZE % 2 ==0)
    860                 {
    861                     smem[LOCAL_SIZE/2].x = src[(LOCAL_SIZE-2)*step];
    862                     smem[LOCAL_SIZE/2].y = 0.f;
    863                 }
    864             }
    865         }
    866         barrier(CLK_LOCAL_MEM_FENCE);
    867 
    868         RADIX_PROCESS;
    869 
    870         // copy data to dst
    871         __global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(CT)), dst_offset));
    872 
    873         #pragma unroll
    874         for (int i=0; i<kercn; i++)
    875         {
    876             __global CT* res = (__global CT*)(dst + i*block_size*dst_step);
    877             res[0].x =  smem[y + i*block_size].x;
    878             res[0].y = -smem[y + i*block_size].y;
    879         }
    880     }
    881 #endif
    882 }