// -----------------------------------------------------------------------------
// Overview (reviewer-added documentation; the code below is unchanged).
//
// This translation unit defines, inside namespace cv::ocl::photo:
//   * nlmeans        - a ProgramEntry initialised with the name "nlmeans", the
//                      OpenCL C program text (one concatenated string literal)
//                      and a trailing hex string (presumably a digest of the
//                      program text - confirm against the generator).
//   * nlmeans_oclsrc - a ProgramSource constructed from nlmeans.programStr.
//
// The embedded program text has two mutually exclusive sections selected by
// preprocessor symbols:
//
//   OP_CALC_WEIGHTS
//     __kernel calcAlmostDist2Weight: for each almostDist = get_global_id(0)
//     below almostMaxDist it computes
//       dist = almostDist * almostDist2ActualDistMultiplier
//       w    = exp(-dist*dist * den) when ABS is defined, else exp(-dist * den)
//     replaces a NaN w by 1.0, scales by fixedPointMult, and stores 0 instead
//     when the result is below WEIGHT_THRESHOLD * fixedPointMult.
//
//   OP_CALC_FASTNLMEANS
//     calcDist / calcDistUpDown: per-pixel distance summed over the cn
//       channels (abs_diff when ABS is defined, squared difference otherwise);
//       calcDistUpDown returns dist(down) - dist(up).
//     calcFirstElementInRow: recomputes, for every search-window offset i
//       handled by this work-item (i = id, id + CTA_SIZE, ...), the full
//       template distance dists[i] and the TEMPLATE_SIZE per-column sums
//       stored at col_dists + i * TEMPLATE_SIZE.
//     calcElementInFirstRow: computes the new right-hand column sum, updates
//       dists[i] by (new column - column stored at index `first`), and
//       overwrites that column slot and up_col_dists[x0 * SEARCH_SIZE_SQ + i].
//     calcElement: derives the new column sum from the value stored in
//       up_col_dists for the same x0 plus calcDistUpDown of the entering
//       (down) and leaving (up) rows, then updates dists, col_dists and
//       up_col_dists.
//     convolveWindow: each work-item accumulates weight and weight * pixel
//       over its offsets, using almostDist2Weight[dists[i] >>
//       almostTemplateWindowSizeSqBinShift]; the partial sums are reduced in
//       local memory by halving while lsize > 2, after which work-item 0 adds
//       slots 0..3 and writes convert_pixel_t(weighted_sum / weights) to dst.
//     __kernel fastNlMeansDenoising: work-group (get_group_id(0),
//       get_group_id(1)) walks the BLOCK_COLS x BLOCK_ROWS tile starting at
//       (x0, y0), clipped to dst_cols / dst_rows, row by row. `first` is the
//       rotating column index, reset to 0 at x == x0 and advanced modulo
//       TEMPLATE_SIZE otherwise. Per-group scratch lives in `buffer` at int
//       offset block_data_start: SEARCH_SIZE_SQ * TEMPLATE_SIZE ints of
//       col_dists immediately followed by up_col_dists.
//
// Symbols used by the program text but not defined in it (presumably supplied
// as build options by the host code - confirm against the caller): FT, w_t,
// wlut_t, convert_wlut_t, pixel_t, int_t, convert_int_t, sum_t, weight_t,
// convert_sum_t, convert_weight_t, convert_pixel_t, cn, psz, SEARCH_SIZE,
// SEARCH_SIZE2, TEMPLATE_SIZE, TEMPLATE_SIZE2, CTA_SIZE, BLOCK_COLS,
// BLOCK_ROWS, and the optional switches ABS and DOUBLE_SUPPORT.
//
// NOTE(review): the header says this file is generated; any change to the
//   program text should presumably be made in the generator input so the text
//   and the trailing hex string stay in sync - confirm.
// NOTE(review): the first pragma is spelled "#pragma OPENCL_EXTENSION" (with
//   an underscore) whereas the two fp64 pragmas use "#pragma OPENCL EXTENSION";
//   this looks like a typo in the generator input - confirm.
// NOTE(review): in calcFirstElementInRow the store
//   up_col_dists[0 + i] = col_dists[TEMPLATE_SIZE - 1] reads an index that does
//   not depend on i, unlike the neighbouring accesses through
//   col_dists_current. Within this kernel calcElement is only reached with
//   x - x0 >= 1, so the x0 == 0 slot appears never to be read back here -
//   confirm before relying on its contents.
// NOTE(review): convolveWindow has no barrier after work-item 0 reads local
//   slots 1..3 for the final sum, while the next convolveWindow call starts
//   with every work-item storing into its own slot; whether this can race
//   depends on how work-items of a group are scheduled - confirm.
// NOTE(review): the reduction stops at lsize > 2 and then sums slots 0..3, so
//   it only covers every partial sum exactly once when CTA_SIZE is a power of
//   two and at least 4.
// -----------------------------------------------------------------------------
1 // This file is auto-generated. Do not edit! 2 3 #include "precomp.hpp" 4 #include "opencl_kernels_photo.hpp" 5 6 namespace cv 7 { 8 namespace ocl 9 { 10 namespace photo 11 { 12 13 const struct ProgramEntry nlmeans={"nlmeans", 14 "#ifdef cl_amd_printf\n" 15 "#pragma OPENCL_EXTENSION cl_amd_printf:enable\n" 16 "#endif\n" 17 "#ifdef DOUBLE_SUPPORT\n" 18 "#ifdef cl_amd_fp64\n" 19 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n" 20 "#elif defined cl_khr_fp64\n" 21 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n" 22 "#endif\n" 23 "#endif\n" 24 "#ifdef OP_CALC_WEIGHTS\n" 25 "__kernel void calcAlmostDist2Weight(__global wlut_t * almostDist2Weight, int almostMaxDist,\n" 26 "FT almostDist2ActualDistMultiplier, int fixedPointMult,\n" 27 "w_t den, FT WEIGHT_THRESHOLD)\n" 28 "{\n" 29 "int almostDist = get_global_id(0);\n" 30 "if (almostDist < almostMaxDist)\n" 31 "{\n" 32 "FT dist = almostDist * almostDist2ActualDistMultiplier;\n" 33 "#ifdef ABS\n" 34 "w_t w = exp((w_t)(-dist*dist) * den);\n" 35 "#else\n" 36 "w_t w = exp((w_t)(-dist) * den);\n" 37 "#endif\n" 38 "wlut_t weight = convert_wlut_t(fixedPointMult * (isnan(w) ? (w_t)1.0 : w));\n" 39 "almostDist2Weight[almostDist] =\n" 40 "weight < (wlut_t)(WEIGHT_THRESHOLD * fixedPointMult) ? 
(wlut_t)0 : weight;\n" 41 "}\n" 42 "}\n" 43 "#elif defined OP_CALC_FASTNLMEANS\n" 44 "#define noconvert\n" 45 "#define SEARCH_SIZE_SQ (SEARCH_SIZE * SEARCH_SIZE)\n" 46 "inline int calcDist(pixel_t a, pixel_t b)\n" 47 "{\n" 48 "#ifdef ABS\n" 49 "int_t retval = convert_int_t(abs_diff(a, b));\n" 50 "#else\n" 51 "int_t diff = convert_int_t(a) - convert_int_t(b);\n" 52 "int_t retval = diff * diff;\n" 53 "#endif\n" 54 "#if cn == 1\n" 55 "return retval;\n" 56 "#elif cn == 2\n" 57 "return retval.x + retval.y;\n" 58 "#elif cn == 3\n" 59 "return retval.x + retval.y + retval.z;\n" 60 "#elif cn == 4\n" 61 "return retval.x + retval.y + retval.z + retval.w;\n" 62 "#else\n" 63 "#error \"cn should be either 1, 2, 3 or 4\"\n" 64 "#endif\n" 65 "}\n" 66 "#ifdef ABS\n" 67 "inline int calcDistUpDown(pixel_t down_value, pixel_t down_value_t, pixel_t up_value, pixel_t up_value_t)\n" 68 "{\n" 69 "return calcDist(down_value, down_value_t) - calcDist(up_value, up_value_t);\n" 70 "}\n" 71 "#else\n" 72 "inline int calcDistUpDown(pixel_t down_value, pixel_t down_value_t, pixel_t up_value, pixel_t up_value_t)\n" 73 "{\n" 74 "int_t A = convert_int_t(down_value) - convert_int_t(down_value_t);\n" 75 "int_t B = convert_int_t(up_value) - convert_int_t(up_value_t);\n" 76 "int_t retval = (A - B) * (A + B);\n" 77 "#if cn == 1\n" 78 "return retval;\n" 79 "#elif cn == 2\n" 80 "return retval.x + retval.y;\n" 81 "#elif cn == 3\n" 82 "return retval.x + retval.y + retval.z;\n" 83 "#elif cn == 4\n" 84 "return retval.x + retval.y + retval.z + retval.w;\n" 85 "#else\n" 86 "#error \"cn should be either 1, 2, 3 or 4\"\n" 87 "#endif\n" 88 "}\n" 89 "#endif\n" 90 "#define COND if (x == 0 && y == 0)\n" 91 "inline void calcFirstElementInRow(__global const uchar * src, int src_step, int src_offset,\n" 92 "__local int * dists, int y, int x, int id,\n" 93 "__global int * col_dists, __global int * up_col_dists)\n" 94 "{\n" 95 "y -= TEMPLATE_SIZE2;\n" 96 "int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2;\n" 97 "int 
col_dists_current_private[TEMPLATE_SIZE];\n" 98 "for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)\n" 99 "{\n" 100 "int dist = 0, value;\n" 101 "__global const pixel_t * src_template = (__global const pixel_t *)(src +\n" 102 "mad24(sy + i / SEARCH_SIZE, src_step, mad24(psz, sx + i % SEARCH_SIZE, src_offset)));\n" 103 "__global const pixel_t * src_current = (__global const pixel_t *)(src + mad24(y, src_step, mad24(psz, x, src_offset)));\n" 104 "__global int * col_dists_current = col_dists + i * TEMPLATE_SIZE;\n" 105 "#pragma unroll\n" 106 "for (int j = 0; j < TEMPLATE_SIZE; ++j)\n" 107 "col_dists_current_private[j] = 0;\n" 108 "for (int ty = 0; ty < TEMPLATE_SIZE; ++ty)\n" 109 "{\n" 110 "#pragma unroll\n" 111 "for (int tx = -TEMPLATE_SIZE2; tx <= TEMPLATE_SIZE2; ++tx)\n" 112 "{\n" 113 "value = calcDist(src_template[tx], src_current[tx]);\n" 114 "col_dists_current_private[tx + TEMPLATE_SIZE2] += value;\n" 115 "dist += value;\n" 116 "}\n" 117 "src_current = (__global const pixel_t *)((__global const uchar *)src_current + src_step);\n" 118 "src_template = (__global const pixel_t *)((__global const uchar *)src_template + src_step);\n" 119 "}\n" 120 "#pragma unroll\n" 121 "for (int j = 0; j < TEMPLATE_SIZE; ++j)\n" 122 "col_dists_current[j] = col_dists_current_private[j];\n" 123 "dists[i] = dist;\n" 124 "up_col_dists[0 + i] = col_dists[TEMPLATE_SIZE - 1];\n" 125 "}\n" 126 "}\n" 127 "inline void calcElementInFirstRow(__global const uchar * src, int src_step, int src_offset,\n" 128 "__local int * dists, int y, int x0, int x, int id, int first,\n" 129 "__global int * col_dists, __global int * up_col_dists)\n" 130 "{\n" 131 "x += TEMPLATE_SIZE2;\n" 132 "y -= TEMPLATE_SIZE2;\n" 133 "int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2;\n" 134 "for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)\n" 135 "{\n" 136 "__global const pixel_t * src_current = (__global const pixel_t *)(src + mad24(y, src_step, mad24(psz, x, src_offset)));\n" 137 "__global const pixel_t * src_template = 
(__global const pixel_t *)(src +\n" 138 "mad24(sy + i / SEARCH_SIZE, src_step, mad24(psz, sx + i % SEARCH_SIZE, src_offset)));\n" 139 "__global int * col_dists_current = col_dists + TEMPLATE_SIZE * i;\n" 140 "int col_dist = 0;\n" 141 "#pragma unroll\n" 142 "for (int ty = 0; ty < TEMPLATE_SIZE; ++ty)\n" 143 "{\n" 144 "col_dist += calcDist(src_current[0], src_template[0]);\n" 145 "src_current = (__global const pixel_t *)((__global const uchar *)src_current + src_step);\n" 146 "src_template = (__global const pixel_t *)((__global const uchar *)src_template + src_step);\n" 147 "}\n" 148 "dists[i] += col_dist - col_dists_current[first];\n" 149 "col_dists_current[first] = col_dist;\n" 150 "up_col_dists[mad24(x0, SEARCH_SIZE_SQ, i)] = col_dist;\n" 151 "}\n" 152 "}\n" 153 "inline void calcElement(__global const uchar * src, int src_step, int src_offset,\n" 154 "__local int * dists, int y, int x0, int x, int id, int first,\n" 155 "__global int * col_dists, __global int * up_col_dists)\n" 156 "{\n" 157 "int sx = x + TEMPLATE_SIZE2;\n" 158 "int sy_up = y - TEMPLATE_SIZE2 - 1;\n" 159 "int sy_down = y + TEMPLATE_SIZE2;\n" 160 "pixel_t up_value = *(__global const pixel_t *)(src + mad24(sy_up, src_step, mad24(psz, sx, src_offset)));\n" 161 "pixel_t down_value = *(__global const pixel_t *)(src + mad24(sy_down, src_step, mad24(psz, sx, src_offset)));\n" 162 "sx -= SEARCH_SIZE2;\n" 163 "sy_up -= SEARCH_SIZE2;\n" 164 "sy_down -= SEARCH_SIZE2;\n" 165 "for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)\n" 166 "{\n" 167 "int wx = i % SEARCH_SIZE, wy = i / SEARCH_SIZE;\n" 168 "pixel_t up_value_t = *(__global const pixel_t *)(src + mad24(sy_up + wy, src_step, mad24(psz, sx + wx, src_offset)));\n" 169 "pixel_t down_value_t = *(__global const pixel_t *)(src + mad24(sy_down + wy, src_step, mad24(psz, sx + wx, src_offset)));\n" 170 "__global int * col_dists_current = col_dists + mad24(i, TEMPLATE_SIZE, first);\n" 171 "__global int * up_col_dists_current = up_col_dists + mad24(x0, 
SEARCH_SIZE_SQ, i);\n" 172 "int col_dist = up_col_dists_current[0] + calcDistUpDown(down_value, down_value_t, up_value, up_value_t);\n" 173 "dists[i] += col_dist - col_dists_current[0];\n" 174 "col_dists_current[0] = col_dist;\n" 175 "up_col_dists_current[0] = col_dist;\n" 176 "}\n" 177 "}\n" 178 "inline void convolveWindow(__global const uchar * src, int src_step, int src_offset,\n" 179 "__local int * dists, __global const wlut_t * almostDist2Weight,\n" 180 "__global uchar * dst, int dst_step, int dst_offset,\n" 181 "int y, int x, int id, __local weight_t * weights_local,\n" 182 "__local sum_t * weighted_sum_local, int almostTemplateWindowSizeSqBinShift)\n" 183 "{\n" 184 "int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2;\n" 185 "weight_t weights = (weight_t)0;\n" 186 "sum_t weighted_sum = (sum_t)0;\n" 187 "for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)\n" 188 "{\n" 189 "int src_index = mad24(sy + i / SEARCH_SIZE, src_step, mad24(i % SEARCH_SIZE + sx, psz, src_offset));\n" 190 "sum_t src_value = convert_sum_t(*(__global const pixel_t *)(src + src_index));\n" 191 "int almostAvgDist = dists[i] >> almostTemplateWindowSizeSqBinShift;\n" 192 "weight_t weight = convert_weight_t(almostDist2Weight[almostAvgDist]);\n" 193 "weights += weight;\n" 194 "weighted_sum += (sum_t)weight * src_value;\n" 195 "}\n" 196 "weights_local[id] = weights;\n" 197 "weighted_sum_local[id] = weighted_sum;\n" 198 "barrier(CLK_LOCAL_MEM_FENCE);\n" 199 "for (int lsize = CTA_SIZE >> 1; lsize > 2; lsize >>= 1)\n" 200 "{\n" 201 "if (id < lsize)\n" 202 "{\n" 203 "int id2 = lsize + id;\n" 204 "weights_local[id] += weights_local[id2];\n" 205 "weighted_sum_local[id] += weighted_sum_local[id2];\n" 206 "}\n" 207 "barrier(CLK_LOCAL_MEM_FENCE);\n" 208 "}\n" 209 "if (id == 0)\n" 210 "{\n" 211 "int dst_index = mad24(y, dst_step, mad24(psz, x, dst_offset));\n" 212 "sum_t weighted_sum_local_0 = weighted_sum_local[0] + weighted_sum_local[1] +\n" 213 "weighted_sum_local[2] + weighted_sum_local[3];\n" 214 
"weight_t weights_local_0 = weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3];\n" 215 "*(__global pixel_t *)(dst + dst_index) = convert_pixel_t(weighted_sum_local_0 / (sum_t)weights_local_0);\n" 216 "}\n" 217 "}\n" 218 "__kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int src_offset,\n" 219 "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n" 220 "__global const wlut_t * almostDist2Weight, __global uchar * buffer,\n" 221 "int almostTemplateWindowSizeSqBinShift)\n" 222 "{\n" 223 "int block_x = get_group_id(0), nblocks_x = get_num_groups(0);\n" 224 "int block_y = get_group_id(1);\n" 225 "int id = get_local_id(0), first;\n" 226 "__local int dists[SEARCH_SIZE_SQ];\n" 227 "__local weight_t weights[CTA_SIZE];\n" 228 "__local sum_t weighted_sum[CTA_SIZE];\n" 229 "int x0 = block_x * BLOCK_COLS, x1 = min(x0 + BLOCK_COLS, dst_cols);\n" 230 "int y0 = block_y * BLOCK_ROWS, y1 = min(y0 + BLOCK_ROWS, dst_rows);\n" 231 "int block_data_start = SEARCH_SIZE_SQ * (mad24(block_y, dst_cols, x0) + mad24(block_y, nblocks_x, block_x) * TEMPLATE_SIZE);\n" 232 "__global int * col_dists = (__global int *)(buffer + block_data_start * sizeof(int));\n" 233 "__global int * up_col_dists = col_dists + SEARCH_SIZE_SQ * TEMPLATE_SIZE;\n" 234 "for (int y = y0; y < y1; ++y)\n" 235 "for (int x = x0; x < x1; ++x)\n" 236 "{\n" 237 "if (x == x0)\n" 238 "{\n" 239 "calcFirstElementInRow(src, src_step, src_offset, dists, y, x, id, col_dists, up_col_dists);\n" 240 "first = 0;\n" 241 "}\n" 242 "else\n" 243 "{\n" 244 "if (y == y0)\n" 245 "calcElementInFirstRow(src, src_step, src_offset, dists, y, x - x0, x, id, first, col_dists, up_col_dists);\n" 246 "else\n" 247 "calcElement(src, src_step, src_offset, dists, y, x - x0, x, id, first, col_dists, up_col_dists);\n" 248 "first = (first + 1) % TEMPLATE_SIZE;\n" 249 "}\n" 250 "convolveWindow(src, src_step, src_offset, dists, almostDist2Weight, dst, dst_step, dst_offset,\n" 251 "y, x, 
id, weights, weighted_sum, almostTemplateWindowSizeSqBinShift);\n" 252 "}\n" 253 "}\n" 254 "#endif\n" 255 , "094aea838a917cea483f77e19dd39de3"}; 256 ProgramSource nlmeans_oclsrc(nlmeans.programStr); 257 } 258 }} 259