1 // This file is part of OpenCV project. 2 // It is subject to the license terms in the LICENSE file found in the top-level directory 3 // of this distribution and at http://opencv.org/license.html. 4 5 // Copyright (C) 2014, Itseez, Inc., all rights reserved. 6 // Third party copyrights are property of their respective owners. 7 8 #define SQRT_2 0.707106781188f 9 #define sin_120 0.866025403784f 10 #define fft5_2 0.559016994374f 11 #define fft5_3 -0.951056516295f 12 #define fft5_4 -1.538841768587f 13 #define fft5_5 0.363271264002f 14 15 #ifdef DOUBLE_SUPPORT 16 #ifdef cl_amd_fp64 17 #pragma OPENCL EXTENSION cl_amd_fp64:enable 18 #elif defined (cl_khr_fp64) 19 #pragma OPENCL EXTENSION cl_khr_fp64:enable 20 #endif 21 #endif 22 23 __attribute__((always_inline)) 24 CT mul_complex(CT a, CT b) { 25 return (CT)(fma(a.x, b.x, -a.y * b.y), fma(a.x, b.y, a.y * b.x)); 26 } 27 28 __attribute__((always_inline)) 29 CT twiddle(CT a) { 30 return (CT)(a.y, -a.x); 31 } 32 33 __attribute__((always_inline)) 34 void butterfly2(CT a0, CT a1, __local CT* smem, __global const CT* twiddles, 35 const int x, const int block_size) 36 { 37 const int k = x & (block_size - 1); 38 a1 = mul_complex(twiddles[k], a1); 39 const int dst_ind = (x << 1) - k; 40 41 smem[dst_ind] = a0 + a1; 42 smem[dst_ind+block_size] = a0 - a1; 43 } 44 45 __attribute__((always_inline)) 46 void butterfly4(CT a0, CT a1, CT a2, CT a3, __local CT* smem, __global const CT* twiddles, 47 const int x, const int block_size) 48 { 49 const int k = x & (block_size - 1); 50 a1 = mul_complex(twiddles[k], a1); 51 a2 = mul_complex(twiddles[k + block_size], a2); 52 a3 = mul_complex(twiddles[k + 2*block_size], a3); 53 54 const int dst_ind = ((x - k) << 2) + k; 55 56 CT b0 = a0 + a2; 57 a2 = a0 - a2; 58 CT b1 = a1 + a3; 59 a3 = twiddle(a1 - a3); 60 61 smem[dst_ind] = b0 + b1; 62 smem[dst_ind + block_size] = a2 + a3; 63 smem[dst_ind + 2*block_size] = b0 - b1; 64 smem[dst_ind + 3*block_size] = a2 - a3; 65 } 66 67 __attribute__((always_inline)) 68 void butterfly3(CT a0, CT a1, CT a2, __local CT* smem, __global const CT* twiddles, 69 const int x, const int block_size) 70 { 71 const int k = x % block_size; 72 a1 = mul_complex(twiddles[k], a1); 73 a2 = mul_complex(twiddles[k+block_size], a2); 74 const int dst_ind = ((x - k) * 3) + k; 75 76 CT b1 = a1 + a2; 77 a2 = twiddle(sin_120*(a1 - a2)); 78 CT b0 = a0 - (CT)(0.5f)*b1; 79 80 smem[dst_ind] = a0 + b1; 81 smem[dst_ind + block_size] = b0 + a2; 82 smem[dst_ind + 2*block_size] = b0 - a2; 83 } 84 85 __attribute__((always_inline)) 86 void butterfly5(CT a0, CT a1, CT a2, CT a3, CT a4, __local CT* smem, __global const CT* twiddles, 87 const int x, const int block_size) 88 { 89 const int k = x % block_size; 90 a1 = mul_complex(twiddles[k], a1); 91 a2 = mul_complex(twiddles[k + block_size], a2); 92 a3 = mul_complex(twiddles[k+2*block_size], a3); 93 a4 = mul_complex(twiddles[k+3*block_size], a4); 94 95 const int dst_ind = ((x - k) * 5) + k; 96 __local CT* dst = smem + dst_ind; 97 98 CT b0, b1, b5; 99 100 b1 = a1 + a4; 101 a1 -= a4; 102 103 a4 = a3 + a2; 104 a3 -= a2; 105 106 a2 = b1 + a4; 107 b0 = a0 - (CT)0.25f * a2; 108 109 b1 = fft5_2 * (b1 - a4); 110 a4 = fft5_3 * (CT)(-a1.y - a3.y, a1.x + a3.x); 111 b5 = (CT)(a4.x - fft5_5 * a1.y, a4.y + fft5_5 * a1.x); 112 113 a4.x += fft5_4 * a3.y; 114 a4.y -= fft5_4 * a3.x; 115 116 a1 = b0 + b1; 117 b0 -= b1; 118 119 dst[0] = a0 + a2; 120 dst[block_size] = a1 + a4; 121 dst[2 * block_size] = b0 + b5; 122 dst[3 * block_size] = b0 - b5; 123 dst[4 * block_size] = a1 - a4; 124 } 125 126 __attribute__((always_inline)) 127 void fft_radix2(__local CT* smem, __global const CT* twiddles, const int x, const int block_size, const int t) 128 { 129 CT a0, a1; 130 131 if (x < t) 132 { 133 a0 = smem[x]; 134 a1 = smem[x+t]; 135 } 136 137 barrier(CLK_LOCAL_MEM_FENCE); 138 139 if (x < t) 140 butterfly2(a0, a1, smem, twiddles, x, block_size); 141 142 barrier(CLK_LOCAL_MEM_FENCE); 143 } 144 145 __attribute__((always_inline)) 146 void fft_radix2_B2(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t) 147 { 148 const int x2 = x1 + t/2; 149 CT a0, a1, a2, a3; 150 151 if (x1 < t/2) 152 { 153 a0 = smem[x1]; a1 = smem[x1+t]; 154 a2 = smem[x2]; a3 = smem[x2+t]; 155 } 156 157 barrier(CLK_LOCAL_MEM_FENCE); 158 159 if (x1 < t/2) 160 { 161 butterfly2(a0, a1, smem, twiddles, x1, block_size); 162 butterfly2(a2, a3, smem, twiddles, x2, block_size); 163 } 164 165 barrier(CLK_LOCAL_MEM_FENCE); 166 } 167 168 __attribute__((always_inline)) 169 void fft_radix2_B3(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t) 170 { 171 const int x2 = x1 + t/3; 172 const int x3 = x1 + 2*t/3; 173 CT a0, a1, a2, a3, a4, a5; 174 175 if (x1 < t/3) 176 { 177 a0 = smem[x1]; a1 = smem[x1+t]; 178 a2 = smem[x2]; a3 = smem[x2+t]; 179 a4 = smem[x3]; a5 = smem[x3+t]; 180 } 181 182 barrier(CLK_LOCAL_MEM_FENCE); 183 184 if (x1 < t/3) 185 { 186 butterfly2(a0, a1, smem, twiddles, x1, block_size); 187 butterfly2(a2, a3, smem, twiddles, x2, block_size); 188 butterfly2(a4, a5, smem, twiddles, x3, block_size); 189 } 190 191 barrier(CLK_LOCAL_MEM_FENCE); 192 } 193 194 __attribute__((always_inline)) 195 void fft_radix2_B4(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t) 196 { 197 const int thread_block = t/4; 198 const int x2 = x1 + thread_block; 199 const int x3 = x1 + 2*thread_block; 200 const int x4 = x1 + 3*thread_block; 201 CT a0, a1, a2, a3, a4, a5, a6, a7; 202 203 if (x1 < t/4) 204 { 205 a0 = smem[x1]; a1 = smem[x1+t]; 206 a2 = smem[x2]; a3 = smem[x2+t]; 207 a4 = smem[x3]; a5 = smem[x3+t]; 208 a6 = smem[x4]; a7 = smem[x4+t]; 209 } 210 211 barrier(CLK_LOCAL_MEM_FENCE); 212 213 if (x1 < t/4) 214 { 215 butterfly2(a0, a1, smem, twiddles, x1, block_size); 216 butterfly2(a2, a3, smem, twiddles, x2, block_size); 217 butterfly2(a4, a5, smem, twiddles, x3, block_size); 218 butterfly2(a6, a7, smem, twiddles, x4, block_size); 219 } 220 221 barrier(CLK_LOCAL_MEM_FENCE); 222 } 223 224 __attribute__((always_inline)) 225 void fft_radix2_B5(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t) 226 { 227 const int thread_block = t/5; 228 const int x2 = x1 + thread_block; 229 const int x3 = x1 + 2*thread_block; 230 const int x4 = x1 + 3*thread_block; 231 const int x5 = x1 + 4*thread_block; 232 CT a0, a1, a2, a3, a4, a5, a6, a7, a8, a9; 233 234 if (x1 < t/5) 235 { 236 a0 = smem[x1]; a1 = smem[x1+t]; 237 a2 = smem[x2]; a3 = smem[x2+t]; 238 a4 = smem[x3]; a5 = smem[x3+t]; 239 a6 = smem[x4]; a7 = smem[x4+t]; 240 a8 = smem[x5]; a9 = smem[x5+t]; 241 } 242 243 barrier(CLK_LOCAL_MEM_FENCE); 244 245 if (x1 < t/5) 246 { 247 butterfly2(a0, a1, smem, twiddles, x1, block_size); 248 butterfly2(a2, a3, smem, twiddles, x2, block_size); 249 butterfly2(a4, a5, smem, twiddles, x3, block_size); 250 butterfly2(a6, a7, smem, twiddles, x4, block_size); 251 butterfly2(a8, a9, smem, twiddles, x5, block_size); 252 } 253 254 barrier(CLK_LOCAL_MEM_FENCE); 255 } 256 257 __attribute__((always_inline)) 258 void fft_radix4(__local CT* smem, __global const CT* twiddles, const int x, const int block_size, const int t) 259 { 260 CT a0, a1, a2, a3; 261 262 if (x < t) 263 { 264 a0 = smem[x]; a1 = smem[x+t]; a2 = smem[x+2*t]; a3 = smem[x+3*t]; 265 } 266 267 barrier(CLK_LOCAL_MEM_FENCE); 268 269 if (x < t) 270 butterfly4(a0, a1, a2, a3, smem, twiddles, x, block_size); 271 272 barrier(CLK_LOCAL_MEM_FENCE); 273 } 274 275 __attribute__((always_inline)) 276 void fft_radix4_B2(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t) 277 { 278 const int x2 = x1 + t/2; 279 CT a0, a1, a2, a3, a4, a5, a6, a7; 280 281 if (x1 < t/2) 282 { 283 a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t]; 284 a4 = smem[x2]; a5 = smem[x2+t]; a6 = smem[x2+2*t]; a7 = smem[x2+3*t]; 285 } 286 287 barrier(CLK_LOCAL_MEM_FENCE); 288 289 if (x1 < t/2) 290 { 291 butterfly4(a0, a1, a2, a3, smem, twiddles, x1, block_size); 292 butterfly4(a4, a5, a6, a7, smem, twiddles, x2, block_size); 293 } 294 295 barrier(CLK_LOCAL_MEM_FENCE); 296 } 297 298 __attribute__((always_inline)) 299 void fft_radix4_B3(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t) 300 { 301 const int x2 = x1 + t/3; 302 const int x3 = x2 + t/3; 303 CT a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11; 304 305 if (x1 < t/3) 306 { 307 a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t]; 308 a4 = smem[x2]; a5 = smem[x2+t]; a6 = smem[x2+2*t]; a7 = smem[x2+3*t]; 309 a8 = smem[x3]; a9 = smem[x3+t]; a10 = smem[x3+2*t]; a11 = smem[x3+3*t]; 310 } 311 312 barrier(CLK_LOCAL_MEM_FENCE); 313 314 if (x1 < t/3) 315 { 316 butterfly4(a0, a1, a2, a3, smem, twiddles, x1, block_size); 317 butterfly4(a4, a5, a6, a7, smem, twiddles, x2, block_size); 318 butterfly4(a8, a9, a10, a11, smem, twiddles, x3, block_size); 319 } 320 321 barrier(CLK_LOCAL_MEM_FENCE); 322 } 323 324 __attribute__((always_inline)) 325 void fft_radix8(__local CT* smem, __global const CT* twiddles, const int x, const int block_size, const int t) 326 { 327 const int k = x % block_size; 328 CT a0, a1, a2, a3, a4, a5, a6, a7; 329 330 if (x < t) 331 { 332 int tw_ind = block_size / 8; 333 334 a0 = smem[x]; 335 a1 = mul_complex(twiddles[k], smem[x + t]); 336 a2 = mul_complex(twiddles[k + block_size],smem[x+2*t]); 337 a3 = mul_complex(twiddles[k+2*block_size],smem[x+3*t]); 338 a4 = mul_complex(twiddles[k+3*block_size],smem[x+4*t]); 339 a5 = mul_complex(twiddles[k+4*block_size],smem[x+5*t]); 340 a6 = mul_complex(twiddles[k+5*block_size],smem[x+6*t]); 341 a7 = mul_complex(twiddles[k+6*block_size],smem[x+7*t]); 342 343 CT b0, b1, b6, b7; 344 345 b0 = a0 + a4; 346 a4 = a0 - a4; 347 b1 = a1 + a5; 348 a5 = a1 - a5; 349 a5 = (CT)(SQRT_2) * (CT)(a5.x + a5.y, -a5.x + a5.y); 350 b6 = twiddle(a2 - a6); 351 a2 = a2 + a6; 352 b7 = a3 - a7; 353 b7 = (CT)(SQRT_2) * (CT)(-b7.x + b7.y, -b7.x - b7.y); 354 a3 = a3 + a7; 355 356 a0 = b0 + a2; 357 a2 = b0 - a2; 358 a1 = b1 + a3; 359 a3 = twiddle(b1 - a3); 360 a6 = a4 - b6; 361 a4 = a4 + b6; 362 a7 = twiddle(a5 - b7); 363 a5 = a5 + b7; 364 365 } 366 367 barrier(CLK_LOCAL_MEM_FENCE); 368 369 if (x < t) 370 { 371 const int dst_ind = ((x - k) << 3) + k; 372 __local CT* dst = smem + dst_ind; 373 374 dst[0] = a0 + a1; 375 dst[block_size] = a4 + a5; 376 dst[2 * block_size] = a2 + a3; 377 dst[3 * block_size] = a6 + a7; 378 dst[4 * block_size] = a0 - a1; 379 dst[5 * block_size] = a4 - a5; 380 dst[6 * block_size] = a2 - a3; 381 dst[7 * block_size] = a6 - a7; 382 } 383 384 barrier(CLK_LOCAL_MEM_FENCE); 385 } 386 387 __attribute__((always_inline)) 388 void fft_radix3(__local CT* smem, __global const CT* twiddles, const int x, const int block_size, const int t) 389 { 390 CT a0, a1, a2; 391 392 if (x < t) 393 { 394 a0 = smem[x]; a1 = smem[x+t]; a2 = smem[x+2*t]; 395 } 396 397 barrier(CLK_LOCAL_MEM_FENCE); 398 399 if (x < t) 400 butterfly3(a0, a1, a2, smem, twiddles, x, block_size); 401 402 barrier(CLK_LOCAL_MEM_FENCE); 403 } 404 405 __attribute__((always_inline)) 406 void fft_radix3_B2(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t) 407 { 408 const int x2 = x1 + t/2; 409 CT a0, a1, a2, a3, a4, a5; 410 411 if (x1 < t/2) 412 { 413 a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; 414 a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t]; 415 } 416 417 barrier(CLK_LOCAL_MEM_FENCE); 418 419 if (x1 < t/2) 420 { 421 butterfly3(a0, a1, a2, smem, twiddles, x1, block_size); 422 butterfly3(a3, a4, a5, smem, twiddles, x2, block_size); 423 } 424 425 barrier(CLK_LOCAL_MEM_FENCE); 426 } 427 428 __attribute__((always_inline)) 429 void fft_radix3_B3(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t) 430 { 431 const int x2 = x1 + t/3; 432 const int x3 = x2 + t/3; 433 CT a0, a1, a2, a3, a4, a5, a6, a7, a8; 434 435 if (x1 < t/3) 436 { 437 a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; 438 a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t]; 439 a6 = smem[x3]; a7 = smem[x3+t]; a8 = smem[x3+2*t]; 440 } 441 442 barrier(CLK_LOCAL_MEM_FENCE); 443 444 if (x1 < t/3) 445 { 446 butterfly3(a0, a1, a2, smem, twiddles, x1, block_size); 447 butterfly3(a3, a4, a5, smem, twiddles, x2, block_size); 448 butterfly3(a6, a7, a8, smem, twiddles, x3, block_size); 449 } 450 451 barrier(CLK_LOCAL_MEM_FENCE); 452 } 453 454 __attribute__((always_inline)) 455 void fft_radix3_B4(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t) 456 { 457 const int thread_block = t/4; 458 const int x2 = x1 + thread_block; 459 const int x3 = x1 + 2*thread_block; 460 const int x4 = x1 + 3*thread_block; 461 CT a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11; 462 463 if (x1 < t/4) 464 { 465 a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; 466 a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t]; 467 a6 = smem[x3]; a7 = smem[x3+t]; a8 = smem[x3+2*t]; 468 a9 = smem[x4]; a10 = smem[x4+t]; a11 = smem[x4+2*t]; 469 } 470 471 barrier(CLK_LOCAL_MEM_FENCE); 472 473 if (x1 < t/4) 474 { 475 butterfly3(a0, a1, a2, smem, twiddles, x1, block_size); 476 butterfly3(a3, a4, a5, smem, twiddles, x2, block_size); 477 butterfly3(a6, a7, a8, smem, twiddles, x3, block_size); 478 butterfly3(a9, a10, a11, smem, twiddles, x4, block_size); 479 } 480 481 barrier(CLK_LOCAL_MEM_FENCE); 482 } 483 484 __attribute__((always_inline)) 485 void fft_radix5(__local CT* smem, __global const CT* twiddles, const int x, const int block_size, const int t) 486 { 487 const int k = x % block_size; 488 CT a0, a1, a2, a3, a4; 489 490 if (x < t) 491 { 492 a0 = smem[x]; a1 = smem[x + t]; a2 = smem[x+2*t]; a3 = smem[x+3*t]; a4 = smem[x+4*t]; 493 } 494 495 barrier(CLK_LOCAL_MEM_FENCE); 496 497 if (x < t) 498 butterfly5(a0, a1, a2, a3, a4, smem, twiddles, x, block_size); 499 500 barrier(CLK_LOCAL_MEM_FENCE); 501 } 502 503 __attribute__((always_inline)) 504 void fft_radix5_B2(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t) 505 { 506 const int x2 = x1+t/2; 507 CT a0, a1, a2, a3, a4, a5, a6, a7, a8, a9; 508 509 if (x1 < t/2) 510 { 511 a0 = smem[x1]; a1 = smem[x1 + t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t]; a4 = smem[x1+4*t]; 512 a5 = smem[x2]; a6 = smem[x2 + t]; a7 = smem[x2+2*t]; a8 = smem[x2+3*t]; a9 = smem[x2+4*t]; 513 } 514 515 barrier(CLK_LOCAL_MEM_FENCE); 516 517 if (x1 < t/2) 518 { 519 butterfly5(a0, a1, a2, a3, a4, smem, twiddles, x1, block_size); 520 butterfly5(a5, a6, a7, a8, a9, smem, twiddles, x2, block_size); 521 } 522 523 barrier(CLK_LOCAL_MEM_FENCE); 524 } 525 526 #ifdef DFT_SCALE 527 #define SCALE_VAL(x, scale) x*scale 528 #else 529 #define SCALE_VAL(x, scale) x 530 #endif 531 532 __kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols, 533 __global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, 534 __global CT* twiddles_ptr, int twiddles_step, int twiddles_offset, const int t, const int nz) 535 { 536 const int x = get_global_id(0); 537 const int y = get_group_id(1); 538 const int block_size = LOCAL_SIZE/kercn; 539 if (y < nz) 540 { 541 __local CT smem[LOCAL_SIZE]; 542 __global const CT* twiddles = (__global const CT*)(twiddles_ptr + twiddles_offset); 543 const int ind = x; 544 #ifdef IS_1D 545 FT scale = (FT) 1/dst_cols; 546 #else 547 FT scale = (FT) 1/(dst_cols*dst_rows); 548 #endif 549 550 #ifdef COMPLEX_INPUT 551 __global const CT* src = (__global const CT*)(src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(CT)), src_offset))); 552 #pragma unroll 553 for (int i=0; i<kercn; i++) 554 smem[x+i*block_size] = src[i*block_size]; 555 #else 556 __global const FT* src = (__global const FT*)(src_ptr + mad24(y, src_step, mad24(x, (int)sizeof(FT), src_offset))); 557 #pragma unroll 558 for (int i=0; i<kercn; i++) 559 smem[x+i*block_size] = (CT)(src[i*block_size], 0.f); 560 #endif 561 barrier(CLK_LOCAL_MEM_FENCE); 562 563 RADIX_PROCESS; 564 565 #ifdef COMPLEX_OUTPUT 566 #ifdef NO_CONJUGATE 567 // copy result without complex conjugate 568 const int cols = dst_cols/2 + 1; 569 #else 570 const int cols = dst_cols; 571 #endif 572 573 __global CT* dst = (__global CT*)(dst_ptr + mad24(y, dst_step, dst_offset)); 574 #pragma unroll 575 for (int i=x; i<cols; i+=block_size) 576 dst[i] = SCALE_VAL(smem[i], scale); 577 #ifdef REAL_INPUT 578 #ifdef COMPLEX_OUTPUT 579 #ifdef IS_1D 580 for(int i=x+1; i < (dst_cols+1)/2; i+=block_size) 581 { 582 dst[dst_cols-i] = (CT)(SCALE_VAL(smem[i].x, scale), SCALE_VAL(-smem[i].y, scale)); 583 } 584 #endif 585 #endif 586 #endif 587 #else 588 // pack row to CCS 589 __local FT* smem_1cn = (__local FT*) smem; 590 __global FT* dst = (__global FT*)(dst_ptr + mad24(y, dst_step, dst_offset)); 591 for (int i=x; i<dst_cols-1; i+=block_size) 592 dst[i+1] = SCALE_VAL(smem_1cn[i+2], scale); 593 if (x == 0) 594 dst[0] = SCALE_VAL(smem_1cn[0], scale); 595 #endif 596 } 597 else 598 { 599 // fill with zero other rows 600 #ifdef COMPLEX_OUTPUT 601 __global CT* dst = (__global CT*)(dst_ptr + mad24(y, dst_step, dst_offset)); 602 #else 603 __global FT* dst = (__global FT*)(dst_ptr + mad24(y, dst_step, dst_offset)); 604 #endif 605 #pragma unroll 606 for (int i=x; i<dst_cols; i+=block_size) 607 dst[i] = 0.f; 608 } 609 } 610 611 __kernel void fft_multi_radix_cols(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols, 612 __global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, 613 __global CT* twiddles_ptr, int twiddles_step, int twiddles_offset, const int t, const int nz) 614 { 615 const int x = get_group_id(0); 616 const int y = get_global_id(1); 617 618 if (x < nz) 619 { 620 __local CT smem[LOCAL_SIZE]; 621 __global const uchar* src = src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(CT)), src_offset)); 622 __global const CT* twiddles = (__global const CT*)(twiddles_ptr + twiddles_offset); 623 const int ind = y; 624 const int block_size = LOCAL_SIZE/kercn; 625 FT scale = 1.f/(dst_rows*dst_cols); 626 627 #pragma unroll 628 for (int i=0; i<kercn; i++) 629 smem[y+i*block_size] = *((__global const CT*)(src + i*block_size*src_step)); 630 631 barrier(CLK_LOCAL_MEM_FENCE); 632 633 RADIX_PROCESS; 634 635 #ifdef COMPLEX_OUTPUT 636 __global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(CT)), dst_offset)); 637 #pragma unroll 638 for (int i=0; i<kercn; i++) 639 *((__global CT*)(dst + i*block_size*dst_step)) = SCALE_VAL(smem[y + i*block_size], scale); 640 #else 641 if (x == 0) 642 { 643 // pack first column to CCS 644 __local FT* smem_1cn = (__local FT*) smem; 645 __global uchar* dst = dst_ptr + mad24(y+1, dst_step, dst_offset); 646 for (int i=y; i<dst_rows-1; i+=block_size, dst+=dst_step*block_size) 647 *((__global FT*) dst) = SCALE_VAL(smem_1cn[i+2], scale); 648 if (y == 0) 649 *((__global FT*) (dst_ptr + dst_offset)) = SCALE_VAL(smem_1cn[0], scale); 650 } 651 else if (x == (dst_cols+1)/2) 652 { 653 // pack last column to CCS (if needed) 654 __local FT* smem_1cn = (__local FT*) smem; 655 __global uchar* dst = dst_ptr + mad24(dst_cols-1, (int)sizeof(FT), mad24(y+1, dst_step, dst_offset)); 656 for (int i=y; i<dst_rows-1; i+=block_size, dst+=dst_step*block_size) 657 *((__global FT*) dst) = SCALE_VAL(smem_1cn[i+2], scale); 658 if (y == 0) 659 *((__global FT*) (dst_ptr + mad24(dst_cols-1, (int)sizeof(FT), dst_offset))) = SCALE_VAL(smem_1cn[0], scale); 660 } 661 else 662 { 663 __global uchar* dst = dst_ptr + mad24(x, (int)sizeof(FT)*2, mad24(y, dst_step, dst_offset - (int)sizeof(FT))); 664 #pragma unroll 665 for (int i=y; i<dst_rows; i+=block_size, dst+=block_size*dst_step) 666 vstore2(SCALE_VAL(smem[i], scale), 0, (__global FT*) dst); 667 } 668 #endif 669 } 670 } 671 672 __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols, 673 __global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, 674 __global CT* twiddles_ptr, int twiddles_step, int twiddles_offset, const int t, const int nz) 675 { 676 const int x = get_global_id(0); 677 const int y = get_group_id(1); 678 const int block_size = LOCAL_SIZE/kercn; 679 #ifdef IS_1D 680 const FT scale = (FT) 1/dst_cols; 681 #else 682 const FT scale = (FT) 1/(dst_cols*dst_rows); 683 #endif 684 685 if (y < nz) 686 { 687 __local CT smem[LOCAL_SIZE]; 688 __global const CT* twiddles = (__global const CT*)(twiddles_ptr + twiddles_offset); 689 const int ind = x; 690 691 #if defined(COMPLEX_INPUT) && !defined(NO_CONJUGATE) 692 __global const CT* src = (__global const CT*)(src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(CT)), src_offset))); 693 #pragma unroll 694 for (int i=0; i<kercn; i++) 695 { 696 smem[x+i*block_size].x = src[i*block_size].x; 697 smem[x+i*block_size].y = -src[i*block_size].y; 698 } 699 #else 700 701 #if !defined(REAL_INPUT) && defined(NO_CONJUGATE) 702 __global const CT* src = (__global const CT*)(src_ptr + mad24(y, src_step, mad24(2, (int)sizeof(FT), src_offset))); 703 704 #pragma unroll 705 for (int i=x; i<(LOCAL_SIZE-1)/2; i+=block_size) 706 { 707 smem[i+1].x = src[i].x; 708 smem[i+1].y = -src[i].y; 709 smem[LOCAL_SIZE-i-1] = src[i]; 710 } 711 #else 712 713 #pragma unroll 714 for (int i=x; i<(LOCAL_SIZE-1)/2; i+=block_size) 715 { 716 CT src = vload2(0, (__global const FT*)(src_ptr + mad24(y, src_step, mad24(2*i+1, (int)sizeof(FT), src_offset)))); 717 718 smem[i+1].x = src.x; 719 smem[i+1].y = -src.y; 720 smem[LOCAL_SIZE-i-1] = src; 721 } 722 723 #endif 724 725 if (x==0) 726 { 727 smem[0].x = *(__global const FT*)(src_ptr + mad24(y, src_step, src_offset)); 728 smem[0].y = 0.f; 729 730 if(LOCAL_SIZE % 2 ==0) 731 { 732 #if !defined(REAL_INPUT) && defined(NO_CONJUGATE) 733 smem[LOCAL_SIZE/2].x = src[LOCAL_SIZE/2-1].x; 734 #else 735 smem[LOCAL_SIZE/2].x = *(__global const FT*)(src_ptr + mad24(y, src_step, mad24(LOCAL_SIZE-1, (int)sizeof(FT), src_offset))); 736 #endif 737 smem[LOCAL_SIZE/2].y = 0.f; 738 } 739 } 740 #endif 741 742 barrier(CLK_LOCAL_MEM_FENCE); 743 744 RADIX_PROCESS; 745 746 // copy data to dst 747 #ifdef COMPLEX_OUTPUT 748 __global CT* dst = (__global CT*)(dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(CT)), dst_offset))); 749 #pragma unroll 750 for (int i=0; i<kercn; i++) 751 { 752 dst[i*block_size].x = SCALE_VAL(smem[x + i*block_size].x, scale); 753 dst[i*block_size].y = SCALE_VAL(-smem[x + i*block_size].y, scale); 754 } 755 #else 756 __global FT* dst = (__global FT*)(dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(FT)), dst_offset))); 757 #pragma unroll 758 for (int i=0; i<kercn; i++) 759 { 760 dst[i*block_size] = SCALE_VAL(smem[x + i*block_size].x, scale); 761 } 762 #endif 763 } 764 else 765 { 766 // fill with zero other rows 767 #ifdef COMPLEX_OUTPUT 768 __global CT* dst = (__global CT*)(dst_ptr + mad24(y, dst_step, dst_offset)); 769 #else 770 __global FT* dst = (__global FT*)(dst_ptr + mad24(y, dst_step, dst_offset)); 771 #endif 772 #pragma unroll 773 for (int i=x; i<dst_cols; i+=block_size) 774 dst[i] = 0.f; 775 } 776 } 777 778 __kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols, 779 __global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, 780 __global CT* twiddles_ptr, int twiddles_step, int twiddles_offset, const int t, const int nz) 781 { 782 const int x = get_group_id(0); 783 const int y = get_global_id(1); 784 785 #ifdef COMPLEX_INPUT 786 if (x < nz) 787 { 788 __local CT smem[LOCAL_SIZE]; 789 __global const uchar* src = src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(CT)), src_offset)); 790 __global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(CT)), dst_offset)); 791 __global const CT* twiddles = (__global const CT*)(twiddles_ptr + twiddles_offset); 792 const int ind = y; 793 const int block_size = LOCAL_SIZE/kercn; 794 795 #pragma unroll 796 for (int i=0; i<kercn; i++) 797 { 798 CT temp = *((__global const CT*)(src + i*block_size*src_step)); 799 smem[y+i*block_size].x = temp.x; 800 smem[y+i*block_size].y = -temp.y; 801 } 802 803 barrier(CLK_LOCAL_MEM_FENCE); 804 805 RADIX_PROCESS; 806 807 // copy data to dst 808 #pragma unroll 809 for (int i=0; i<kercn; i++) 810 { 811 __global CT* res = (__global CT*)(dst + i*block_size*dst_step); 812 res[0].x = smem[y + i*block_size].x; 813 res[0].y = -smem[y + i*block_size].y; 814 } 815 } 816 #else 817 if (x < nz) 818 { 819 __global const CT* twiddles = (__global const CT*)(twiddles_ptr + twiddles_offset); 820 const int ind = y; 821 const int block_size = LOCAL_SIZE/kercn; 822 823 __local CT smem[LOCAL_SIZE]; 824 #ifdef EVEN 825 if (x!=0 && (x!=(nz-1))) 826 #else 827 if (x!=0) 828 #endif 829 { 830 __global const uchar* src = src_ptr + mad24(y, src_step, mad24(2*x-1, (int)sizeof(FT), src_offset)); 831 #pragma unroll 832 for (int i=0; i<kercn; i++) 833 { 834 CT temp = vload2(0, (__global const FT*)(src + i*block_size*src_step)); 835 smem[y+i*block_size].x = temp.x; 836 smem[y+i*block_size].y = -temp.y; 837 } 838 } 839 else 840 { 841 int ind = x==0 ? 0: 2*x-1; 842 __global const FT* src = (__global const FT*)(src_ptr + mad24(1, src_step, mad24(ind, (int)sizeof(FT), src_offset))); 843 int step = src_step/(int)sizeof(FT); 844 845 #pragma unroll 846 for (int i=y; i<(LOCAL_SIZE-1)/2; i+=block_size) 847 { 848 smem[i+1].x = src[2*i*step]; 849 smem[i+1].y = -src[(2*i+1)*step]; 850 851 smem[LOCAL_SIZE-i-1].x = src[2*i*step];; 852 smem[LOCAL_SIZE-i-1].y = src[(2*i+1)*step]; 853 } 854 if (y==0) 855 { 856 smem[0].x = *(__global const FT*)(src_ptr + mad24(ind, (int)sizeof(FT), src_offset)); 857 smem[0].y = 0.f; 858 859 if(LOCAL_SIZE % 2 ==0) 860 { 861 smem[LOCAL_SIZE/2].x = src[(LOCAL_SIZE-2)*step]; 862 smem[LOCAL_SIZE/2].y = 0.f; 863 } 864 } 865 } 866 barrier(CLK_LOCAL_MEM_FENCE); 867 868 RADIX_PROCESS; 869 870 // copy data to dst 871 __global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(CT)), dst_offset)); 872 873 #pragma unroll 874 for (int i=0; i<kercn; i++) 875 { 876 __global CT* res = (__global CT*)(dst + i*block_size*dst_step); 877 res[0].x = smem[y + i*block_size].x; 878 res[0].y = -smem[y + i*block_size].y; 879 } 880 } 881 #endif 882 }