/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
//    Jia Haipeng, jiahaipeng95 (at) gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

// Pixel access helpers.
//
// The following names are not defined in this file and are expected to be
// supplied from outside (presumably as program build options -- confirm
// against the host code): cn, T, T1, TILE_DIM, BLOCK_ROWS, rowsPerWI, INPLACE.
//
//   loadpix(addr)       - read one pixel from the byte address `addr`
//   storepix(val, addr) - write one pixel `val` to the byte address `addr`
//   TSIZE               - size of one pixel in bytes
//
// When cn == 3 a pixel is accessed as three consecutive T1 elements through
// vload3/vstore3; otherwise it is accessed as a single value of type T.
#if cn != 3
#define loadpix(addr) *(__global const T *)(addr)
#define storepix(val, addr) *(__global T *)(addr) = val
#define TSIZE (int)sizeof(T)
#else
#define loadpix(addr) vload3(0, (__global const T1 *)(addr))
#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))
#define TSIZE ((int)sizeof(T1)*3)
#endif

#ifndef INPLACE

// Row pitch (in elements) of the local tile: one element wider than the tile.
// NOTE(review): the extra column is presumably padding against local-memory
// bank conflicts -- confirm.
#define LDS_STEP (TILE_DIM + 1)

// Out-of-place transpose: dst(c, r) = src(r, c).
//
// Each work-group stages a TILE_DIM x TILE_DIM tile of the source in local
// memory and then writes it out transposed.  Every work-item handles the tile
// rows local_y, local_y + BLOCK_ROWS, local_y + 2 * BLOCK_ROWS, ...
// NOTE(review): assumes the local work size is TILE_DIM x BLOCK_ROWS -- confirm
// against the host code.
//
//   srcptr, src_step, src_offset - source base pointer, row stride in bytes and
//                                  byte offset of the first element
//   src_rows, src_cols           - source dimensions in pixels
//   dstptr, dst_step, dst_offset - destination base pointer, row stride in
//                                  bytes and byte offset of the first element
__kernel void transpose(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
                        __global uchar * dstptr, int dst_step, int dst_offset)
{
    const int group_x = get_group_id(0);
    const int group_y = get_group_id(1);
    const int groups_x = get_num_groups(0);
    const int groups_y = get_num_groups(1);

    // Remap the hardware group id onto the tile this group processes.
    int tile_col, tile_row;

    if (src_rows != src_cols)
    {
        // Non-square input: linearise the group id, then re-split it.
        const int linear_id = mad24(groups_x, group_y, group_x);

        tile_row = linear_id % groups_y;
        tile_col = ((linear_id / groups_y) + tile_row) % groups_x;
    }
    else
    {
        // Square input.
        tile_row = group_x;
        tile_col = (group_x + group_y) % groups_x;
    }

    const int local_x = get_local_id(0);
    const int local_y = get_local_id(1);

    // Pixel coordinates read from the source ...
    const int src_x = mad24(tile_col, TILE_DIM, local_x);
    const int src_y = mad24(tile_row, TILE_DIM, local_y);

    // ... and pixel coordinates written to the destination (tile indices swapped).
    const int dst_x = mad24(tile_row, TILE_DIM, local_x);
    const int dst_y = mad24(tile_col, TILE_DIM, local_y);

    __local T tile[TILE_DIM * LDS_STEP];

    // Phase 1: copy this work-item's share of the source tile into local memory.
    if (src_x < src_cols && src_y < src_rows)
    {
        int src_pos = mad24(src_y, src_step, mad24(src_x, TSIZE, src_offset));

        #pragma unroll
        for (int row_off = 0; row_off < TILE_DIM; row_off += BLOCK_ROWS)
        {
            if (src_y + row_off < src_rows)
            {
                tile[mad24(local_y + row_off, LDS_STEP, local_x)] = loadpix(srcptr + src_pos);
                src_pos = mad24(BLOCK_ROWS, src_step, src_pos);
            }
        }
    }

    // Executed by every work-item of the group (it is outside the bounds
    // checks): the whole tile must be in local memory before it is read back.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Phase 2: write the tile to the destination with rows and columns swapped.
    // The destination has src_cols rows and src_rows columns.
    if (dst_x < src_rows && dst_y < src_cols)
    {
        int dst_pos = mad24(dst_y, dst_step, mad24(dst_x, TSIZE, dst_offset));

        #pragma unroll
        for (int row_off = 0; row_off < TILE_DIM; row_off += BLOCK_ROWS)
        {
            if ((dst_y + row_off) < src_cols)
            {
                storepix(tile[mad24(local_x, LDS_STEP, local_y + row_off)], dstptr + dst_pos);
                dst_pos = mad24(BLOCK_ROWS, dst_step, dst_pos);
            }
        }
    }
}

#else

// In-place transpose: swaps every element below the main diagonal with its
// mirror element above the diagonal.  Only src_rows is passed, so the matrix
// is treated as src_rows x src_rows.
//
// Each work-item owns column get_global_id(0) and rowsPerWI consecutive rows
// starting at get_global_id(1) * rowsPerWI.
//
//   srcptr, src_step, src_offset - base pointer, row stride in bytes and byte
//                                  offset of the first element
//   src_rows                     - number of rows in pixels
__kernel void transpose_inplace(__global uchar * srcptr, int src_step, int src_offset, int src_rows)
{
    const int col = get_global_id(0);
    int row = get_global_id(1) * rowsPerWI;

    // Coarse early-out; the exact per-element test is repeated inside the loop.
    if (col < row + rowsPerWI)
    {
        // Byte offsets of element (row, col) and of its mirror (col, row).
        int lower_pos = mad24(row, src_step, mad24(col, TSIZE, src_offset));
        int upper_pos = mad24(col, src_step, mad24(row, TSIZE, src_offset));

        #pragma unroll
        for (int i = 0; i < rowsPerWI; ++i)
        {
            // Only strictly-below-diagonal elements swap, so each pair is
            // exchanged once and the diagonal is left untouched.
            if (row < src_rows && col < row)
            {
                __global uchar * lower = srcptr + lower_pos;
                __global uchar * upper = srcptr + upper_pos;

                T saved = loadpix(upper);
                storepix(loadpix(lower), upper);
                storepix(saved, lower);
            }

            // Next row of the same column; the mirror moves one pixel along its row.
            ++row;
            lower_pos += src_step;
            upper_pos += TSIZE;
        }
    }
}

#endif // INPLACE