Home | History | Annotate | Download | only in cl_kernel
      1 /*
      2  * kernel_gauss_lap_pyramid.cl
      3  * input0
      4  * input1
      5  * output
      6  * window, pos_x, pos_y, width, height
      7  */
      8 
      9 #ifndef PYRAMID_UV
     10 #define PYRAMID_UV 0
     11 #endif
     12 
     13 #ifndef CL_PYRAMID_ENABLE_DUMP
     14 #define CL_PYRAMID_ENABLE_DUMP 0
     15 #endif
     16 
     17 #ifndef ENABLE_MASK_GAUSS_SCALE
     18 #define ENABLE_MASK_GAUSS_SCALE 0
     19 #endif
     20 
     21 #define fixed_pixels 8
     22 #define GAUSS_V_R 2
     23 #define GAUSS_H_R 1
     24 #define COEFF_MID 4
     25 
     26 #define zero8 (float8)(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f)
     27 
     28 __constant const float coeffs[9] = {0.0f, 0.0f, 0.152f, 0.222f, 0.252f, 0.222f, 0.152f, 0.0f, 0.0f};
     29 
     30 #define ARG_FORMAT4 "(%.1f,%.1f,%.1f,%.1f)"
     31 #define ARGS4(a) a.s0, a.s1, a.s2, a.s3
     32 
     33 #define ARG_FORMAT8 "(%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f)"
     34 #define ARGS8(a) a.s0, a.s1, a.s2, a.s3, a.s4, a.s5, a.s6, a.s7
     35 
     36 /*
     37  * input: RGBA-CL_UNSIGNED_INT16
     38  * output_gauss: RGBA-CL_UNSIGNED_INT8
     39  * output_lap:RGBA-CL_UNSIGNED_INT16
     40  * each work-item calc 2 lines
     41  */
     42 __kernel void
     43 kernel_gauss_scale_transform (
     44     __read_only image2d_t input, int in_offset_x,
     45     __write_only image2d_t output_gauss
     46 #if CL_PYRAMID_ENABLE_DUMP
     47     , __write_only image2d_t dump_orig
     48 #endif
     49 )
     50 {
     51     int g_x = get_global_id (0);
     52     int in_x = g_x + in_offset_x;
     53     int g_y = get_global_id (1) * 4;
     54     const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
     55 
     56     int g_out_x = get_global_id (0);
     57     int g_out_y = get_global_id (1) * 2;
     58 
     59 #if CL_PYRAMID_ENABLE_DUMP
     60     write_imageui (dump_orig, (int2)(g_x, g_y + 0), read_imageui(input, sampler, (int2)(in_x, g_y)));
     61     write_imageui (dump_orig, (int2)(g_x, g_y + 1), read_imageui(input, sampler, (int2)(in_x, g_y + 1)));
     62     write_imageui (dump_orig, (int2)(g_x, g_y + 2), read_imageui(input, sampler, (int2)(in_x, g_y + 2)));
     63     write_imageui (dump_orig, (int2)(g_x, g_y + 3), read_imageui(input, sampler, (int2)(in_x, g_y + 3)));
     64 #endif
     65 
     66     float8 result_pre[2] = {zero8, zero8};
     67     float8 result_next[2] = {zero8, zero8};
     68     float8 result_cur[2] = {zero8, zero8};
     69     float4 final_g[2];
     70 
     71     float8 tmp_data;
     72     int i_ver;
     73 
     74 #pragma unroll
     75     for (i_ver = -GAUSS_V_R; i_ver <= GAUSS_V_R + 2; i_ver++) {
     76         int cur_g_y = g_y + i_ver;
     77         float coeff0 = coeffs[i_ver + COEFF_MID];
     78         float coeff1 = coeffs[i_ver + COEFF_MID - 2];
     79         tmp_data = convert_float8(as_uchar8(convert_ushort4(read_imageui(input, sampler, (int2)(in_x - 1, cur_g_y)))));
     80         result_pre[0] += tmp_data * coeff0;
     81         result_pre[1] += tmp_data * coeff1;
     82         tmp_data = convert_float8(as_uchar8(convert_ushort4(read_imageui(input, sampler, (int2)(in_x, cur_g_y)))));
     83         result_cur[0] += tmp_data * coeff0;
     84         result_cur[1] += tmp_data * coeff1;
     85         tmp_data = convert_float8(as_uchar8(convert_ushort4(read_imageui(input, sampler, (int2)(in_x + 1, cur_g_y)))));
     86         result_next[1] += tmp_data * coeff1;
     87         result_next[0] += tmp_data * coeff0;
     88     }
     89 
     90     int i_line;
     91 #pragma unroll
     92     for (i_line = 0; i_line < 2; ++i_line) {
     93 #if !PYRAMID_UV
     94         final_g[i_line] = result_cur[i_line].even * coeffs[COEFF_MID] +
     95                           (float4)(result_pre[i_line].s7, result_cur[i_line].s135) * coeffs[COEFF_MID + 1] +
     96                           (float4)(result_pre[i_line].s6, result_cur[i_line].s024) * coeffs[COEFF_MID + 2] +
     97                           (float4)(result_cur[i_line].s1357) * coeffs[COEFF_MID + 1] +
     98                           (float4)(result_cur[i_line].s246, result_next[i_line].s0) * coeffs[COEFF_MID + 2];
     99 #else
    100         final_g[i_line] = result_cur[i_line].s0145 * coeffs[COEFF_MID] +
    101                           (float4)(result_pre[i_line].s67, result_cur[i_line].s23) * coeffs[COEFF_MID + 1] +
    102                           (float4)(result_pre[i_line].s45, result_cur[i_line].s01) * coeffs[COEFF_MID + 2] +
    103                           (float4)(result_cur[i_line].s2367) * coeffs[COEFF_MID + 1] +
    104                           (float4)(result_cur[i_line].s45, result_next[i_line].s01) * coeffs[COEFF_MID + 2];
    105 #endif
    106         final_g[i_line] = clamp (final_g[i_line] + 0.5f, 0.0f, 255.0f);
    107         write_imageui (output_gauss, (int2)(g_out_x, g_out_y + i_line), convert_uint4(final_g[i_line]));
    108     }
    109 
    110 }
    111 
    112 inline float8
    113 read_scale_y (__read_only image2d_t input, const sampler_t sampler, float2 pos_start, float step_x)
    114 {
    115     float8 data;
    116     data.s0 = read_imagef (input, sampler, pos_start).x;
    117     pos_start.x += step_x;
    118     data.s1 = read_imagef (input, sampler, pos_start).x;
    119     pos_start.x += step_x;
    120     data.s2 = read_imagef (input, sampler, pos_start).x;
    121     pos_start.x += step_x;
    122     data.s3 = read_imagef (input, sampler, pos_start).x;
    123     pos_start.x += step_x;
    124     data.s4 = read_imagef (input, sampler, pos_start).x;
    125     pos_start.x += step_x;
    126     data.s5 = read_imagef (input, sampler, pos_start).x;
    127     pos_start.x += step_x;
    128     data.s6 = read_imagef (input, sampler, pos_start).x;
    129     pos_start.x += step_x;
    130     data.s7 = read_imagef (input, sampler, pos_start).x;
    131     return data;
    132 }
    133 
    134 inline float8
    135 read_scale_uv (__read_only image2d_t input, const sampler_t sampler, float2 pos_start, float step_x)
    136 {
    137     float8 data;
    138     data.s01 = read_imagef (input, sampler, pos_start).xy;
    139     pos_start.x += step_x;
    140     data.s23 = read_imagef (input, sampler, pos_start).xy;
    141     pos_start.x += step_x;
    142     data.s45 = read_imagef (input, sampler, pos_start).xy;
    143     pos_start.x += step_x;
    144     data.s67 = read_imagef (input, sampler, pos_start).xy;
    145     return data;
    146 }
    147 
    148 /*
    149  * input_gauss: RGBA-CL_UNSIGNED_INT18
    150  * input_lap: RGBA-CL_UNSIGNED_INT16
    151  * output:     RGBA-CL_UNSIGNED_INT16
    152  * each work-item calc 2 lines
    153  */
    154 __kernel void
    155 kernel_gauss_lap_reconstruct (
    156     __read_only image2d_t input_gauss,
    157     float in_sampler_offset_x, float in_sampler_offset_y,
    158     __read_only image2d_t input_lap,
    159     __write_only image2d_t output, int out_offset_x, float out_width, float out_height
    160 #if CL_PYRAMID_ENABLE_DUMP
    161     , __write_only image2d_t dump_resize, __write_only image2d_t dump_final
    162 #endif
    163 )
    164 {
    165     int g_x = get_global_id (0);
    166     int g_y = get_global_id (1);
    167     const sampler_t lap_sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
    168     const sampler_t gauss_sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
    169 
    170     //if (g_x > out_width + 0.9f || g_y > out_height + 0.5f)
    171     //    return;
    172 
    173     float8 lap = convert_float8(as_uchar8(convert_ushort4(read_imageui(input_lap, lap_sampler, (int2)(g_x, g_y)))));
    174     lap = (lap - 128.0f) * 2.0f;
    175 
    176     float8 data_g;
    177     float2 input_gauss_pos;
    178     float step_x;
    179     input_gauss_pos.x = g_x / out_width + in_sampler_offset_x;
    180     input_gauss_pos.y = g_y / out_height + in_sampler_offset_y;
    181 #if !PYRAMID_UV
    182     step_x = 0.125f / out_width;
    183     data_g = read_scale_y (input_gauss, gauss_sampler, input_gauss_pos, step_x) * 256.0f;
    184 #else
    185     step_x = 0.25f / out_width;
    186     data_g = read_scale_uv (input_gauss, gauss_sampler, input_gauss_pos, step_x) * 256.0f;
    187 #endif
    188 
    189 #if CL_PYRAMID_ENABLE_DUMP
    190     write_imageui (dump_resize, (int2)(g_x, g_y), convert_uint4(as_ushort4(convert_uchar8(data_g))));
    191 #endif
    192 
    193     data_g += lap + 0.5f;
    194     data_g = clamp (data_g, 0.0f, 255.0f);
    195     write_imageui (output, (int2)(g_x + out_offset_x, g_y), convert_uint4(as_ushort4(convert_uchar8(data_g))));
    196 #if CL_PYRAMID_ENABLE_DUMP
    197     write_imageui (dump_final, (int2)(g_x, g_y), convert_uint4(as_ushort4(convert_uchar8(data_g))));
    198 #endif
    199 }
    200 
    201 __kernel void
    202 kernel_pyramid_blend (
    203     __read_only image2d_t input0, __read_only image2d_t input1,
    204 #if !PYRAMID_UV
    205     __global const float8 *input0_mask,
    206 #else
    207     __global const float4 *input0_mask,
    208 #endif
    209     __write_only image2d_t output)
    210 {
    211     sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
    212     const int g_x = get_global_id (0);
    213     const int g_y = get_global_id (1);
    214     int2 pos = (int2) (g_x, g_y);
    215 
    216     float8 data0 = convert_float8(as_uchar8(convert_ushort4(read_imageui(input0, sampler, pos))));
    217     float8 data1 = convert_float8(as_uchar8(convert_ushort4(read_imageui(input1, sampler, pos))));
    218     float8 out_data;
    219 
    220 #if !PYRAMID_UV
    221     out_data = (data0 - data1) * input0_mask[g_x] + data1;
    222 #else
    223     float8 coeff;
    224     coeff.even = input0_mask[g_x];
    225     coeff.odd = coeff.even;
    226     out_data = (data0 - data1) * coeff + data1;
    227 #endif
    228 
    229     out_data = clamp (out_data + 0.5f, 0.0f, 255.0f);
    230 
    231     write_imageui(output, pos, convert_uint4(as_ushort4(convert_uchar8(out_data))));
    232 }
    233 
    234 __kernel void
    235 kernel_pyramid_scale (
    236     __read_only image2d_t input, __write_only image2d_t output,
    237     int out_offset_x, int output_width, int output_height)
    238 {
    239     const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
    240     int g_x = get_global_id (0);
    241     int g_y = get_global_id (1);
    242 
    243     float2 normCoor = (float2)(g_x, g_y) / (float2)(output_width, output_height);
    244     float8 out_data;
    245     float step_x;
    246 
    247 #if !PYRAMID_UV
    248     step_x = 0.125f / output_width;
    249     out_data = read_scale_y (input, sampler, normCoor, step_x) * 255.0f;
    250 #else
    251     step_x = 0.25f / output_width;
    252     out_data = read_scale_uv (input, sampler, normCoor, step_x) * 255.0f;
    253 #endif
    254 
    255     out_data = clamp (out_data + 0.5f, 0.0f, 255.0f);
    256     write_imageui (output, (int2)(g_x + out_offset_x, g_y), convert_uint4(as_ushort4(convert_uchar8(out_data))));
    257 }
    258 
    259 __kernel void
    260 kernel_pyramid_copy (
    261     __read_only image2d_t input, int in_offset_x,
    262     __write_only image2d_t output, int out_offset_x,
    263     int max_g_x, int max_g_y)
    264 {
    265     sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
    266     const int g_x = get_global_id (0);
    267     const int g_y = get_global_id (1);
    268 
    269     if (g_x >= max_g_x || g_y >= max_g_y)
    270         return;
    271 
    272     uint4 data = read_imageui (input, sampler, (int2)(g_x + in_offset_x, g_y));
    273     write_imageui (output, (int2)(g_x + out_offset_x, g_y), data);
    274 }
    275 
    276 /*
    277  * input_gauss: RGBA-CL_UNSIGNED_INT18
    278  * input_lap: RGBA-CL_UNSIGNED_INT16
    279  * output:     RGBA-CL_UNSIGNED_INT16
    280  * each work-item calc 2 lines
    281  */
    282 __kernel void
    283 kernel_lap_transform (
    284     __read_only image2d_t input_gauss0, int gauss0_offset_x,
    285     __read_only image2d_t input_gauss1,
    286     float gauss1_sampler_offset_x, float gauss1_sampler_offset_y,
    287     __write_only image2d_t output, int lap_offset_x, float out_width, float out_height)
    288 {
    289     int g_x = get_global_id (0);
    290     int g_y = get_global_id (1);
    291     const sampler_t gauss0_sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
    292     const sampler_t gauss1_sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
    293 
    294     float8 orig = convert_float8(as_uchar8(convert_ushort4(
    295                       read_imageui(input_gauss0, gauss0_sampler, (int2)(g_x + gauss0_offset_x, g_y)))));
    296     float8 zoom_in;
    297     float2 gauss1_pos;
    298     float sampler_step;
    299     gauss1_pos.y = (g_y / out_height) + gauss1_sampler_offset_y;
    300     gauss1_pos.x = (g_x / out_width) + gauss1_sampler_offset_x;
    301 
    302 #if !PYRAMID_UV
    303     sampler_step = 0.125f / out_width;
    304     zoom_in.s0 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).x;
    305     gauss1_pos.x += sampler_step;
    306     zoom_in.s1 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).x;
    307     gauss1_pos.x += sampler_step;
    308     zoom_in.s2 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).x;
    309     gauss1_pos.x += sampler_step;
    310     zoom_in.s3 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).x;
    311     gauss1_pos.x += sampler_step;
    312     zoom_in.s4 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).x;
    313     gauss1_pos.x += sampler_step;
    314     zoom_in.s5 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).x;
    315     gauss1_pos.x += sampler_step;
    316     zoom_in.s6 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).x;
    317     gauss1_pos.x += sampler_step;
    318     zoom_in.s7 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).x;
    319 #else
    320     sampler_step = 0.25f / out_width;
    321     zoom_in.s01 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).xy;
    322     gauss1_pos.x += sampler_step;
    323     zoom_in.s23 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).xy;
    324     gauss1_pos.x += sampler_step;
    325     zoom_in.s45 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).xy;
    326     gauss1_pos.x += sampler_step;
    327     zoom_in.s67 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).xy;
    328 #endif
    329     float8 lap = (orig - zoom_in * 256.0f) * 0.5f + 128.0f + 0.5f;
    330     lap = clamp (lap, 0.0f, 255.0f);
    331     write_imageui (output, (int2)(g_x + lap_offset_x, g_y), convert_uint4(as_ushort4(convert_uchar8(lap))));
    332 }
    333 
    334 
    335 /*
    336  * input0: RGBA-CL_UNSIGNED_INT16
    337  * input1: RGBA-CL_UNSIGNED_INT16
    338  * out_diff:  RGBA-CL_UNSIGNED_INT16
    339  */
    340 __kernel void
    341 kernel_image_diff (
    342     __read_only image2d_t input0, int offset0,
    343     __read_only image2d_t input1, int offset1,
    344     __write_only image2d_t out_diff)
    345 {
    346     int g_x = get_global_id (0);
    347     int g_y = get_global_id (1);
    348     const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
    349 
    350     int8 data0 = convert_int8(as_uchar8(convert_ushort4(read_imageui(input0, sampler, (int2)(g_x + offset0, g_y)))));
    351     int8 data1 = convert_int8(as_uchar8(convert_ushort4(read_imageui(input1, sampler, (int2)(g_x + offset1, g_y)))));
    352     uint8 diff = abs_diff (data0, data1);
    353     write_imageui (out_diff, (int2)(g_x, g_y), convert_uint4(as_ushort4(convert_uchar8(diff))));
    354 }
    355 
    356 
    357 /*
    358  * input0: RGBA-CL_UNSIGNED_INT16
    359  */
    360 #define LEFT_POS (int)(-1)
    361 #define MID_POS (int)(0)
    362 #define RIGHT_POS (int)(1)
    363 
    364 __inline int pos_buf_index (int x, int y, int stride)
    365 {
    366     return mad24 (stride, y, x);
    367 }
    368 
    369 __kernel void
    370 kernel_seam_dp (
    371     __read_only image2d_t image,
    372     __global short *pos_buf, __global float *sum_buf, int offset_x, int valid_width,
    373     int max_pos, int seam_height, int seam_stride)
    374 {
    375     int l_x = get_local_id (0);
    376     int group_id = get_group_id (0);
    377     if (l_x >= valid_width)
    378         return;
    379 
    380     // group0 fill first half slice image curve y = [0, seam_height/2 - 1]
    381     // group1 fill send half slice image curve = [seam_height - 1, seam_height/2]
    382     int first_slice_h = seam_height / 2;
    383     int group_h = (group_id == 0 ? first_slice_h : seam_height - first_slice_h);
    384 
    385     __local float slm_sum[4096];
    386     float mid, left, right, cur;
    387     int slm_idx;
    388     int default_pos;
    389 
    390     int x = l_x + offset_x;
    391     const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
    392     int y = (group_id == 0 ? 0 : seam_height - 1);
    393     float sum = convert_float(read_imageui(image, sampler, (int2)(x, y)).x);
    394 
    395     default_pos = x;
    396     slm_sum[l_x] = sum;
    397     barrier (CLK_LOCAL_MEM_FENCE);
    398     pos_buf[pos_buf_index(x, y, seam_stride)] = convert_short(default_pos);
    399 
    400     for (int i = 0; i < group_h; ++i) {
    401         y = (group_id == 0 ? i : seam_height - i - 1);
    402         slm_idx = l_x - 1;
    403         slm_idx = (slm_idx > 0 ? slm_idx : 0);
    404         left = slm_sum[slm_idx];
    405         slm_idx = l_x + 1;
    406         slm_idx = (slm_idx < valid_width ? slm_idx : valid_width - 1);
    407         right = slm_sum[slm_idx];
    408 
    409         cur = convert_float(read_imageui(image, sampler, (int2)(x, y)).x);
    410 
    411         left = left + cur;
    412         right = right + cur;
    413         mid = sum + cur;
    414 
    415         int pos;
    416         pos = (left < mid) ? LEFT_POS : MID_POS;
    417         sum = min (left, mid);
    418         pos = (sum < right) ? pos : RIGHT_POS;
    419         sum = min (sum, right);
    420         slm_sum[l_x] = sum;
    421         barrier (CLK_LOCAL_MEM_FENCE);
    422 
    423         pos += default_pos;
    424         pos = clamp (pos, offset_x, max_pos);
    425         //if (l_x == 3)
    426         //    printf ("s:%f, pos:%d, mid:%f, offset_x:%d\n", sum.s0, pos.s0, mid.s0, offset_x);
    427         pos_buf[pos_buf_index(x, y, seam_stride)] = convert_short(pos);
    428     }
    429     sum_buf[group_id * seam_stride + x] = sum;
    430     //printf ("sum(x):%f(x:%d)\n", sum_buf[x].s0, x);
    431 }
    432 
    433 __kernel void
    434 kernel_seam_mask_blend (
    435     __read_only image2d_t input0, __read_only image2d_t input1,
    436     __read_only image2d_t seam_mask,
    437     __write_only image2d_t output)
    438 {
    439     sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
    440     const int g_x = get_global_id (0);
    441     const int g_y = get_global_id (1);
    442     int2 pos = (int2) (g_x, g_y);
    443 
    444     float8 data0 = convert_float8(as_uchar8(convert_ushort4(read_imageui(input0, sampler, pos))));
    445     float8 data1 = convert_float8(as_uchar8(convert_ushort4(read_imageui(input1, sampler, pos))));
    446     float8 coeff0 = convert_float8(as_uchar8(convert_ushort4(read_imageui(seam_mask, sampler, pos)))) / 255.0f;
    447     float8 out_data;
    448 
    449 #if !PYRAMID_UV
    450     out_data = (data0 - data1) * coeff0 + data1;
    451 #else
    452     coeff0.even = (coeff0.even + coeff0.odd) * 0.5f;
    453     coeff0.odd = coeff0.even;
    454     out_data = (data0 - data1) * coeff0 + data1;
    455 #endif
    456 
    457     out_data = clamp (out_data + 0.5f, 0.0f, 255.0f);
    458 
    459     write_imageui(output, pos, convert_uint4(as_ushort4(convert_uchar8(out_data))));
    460 }
    461 
    462 
    463 
    464 #define MASK_GAUSS_R 4
    465 #define MASK_COEFF_MID 7
    466 
    467 __constant const float mask_coeffs[] = {0.0f, 0.0f, 0.0f, 0.082f, 0.102f, 0.119f, 0.130f, 0.134f, 0.130f, 0.119f, 0.102f, 0.082f, 0.0f, 0.0f, 0.0f};
    468 
    469 /*
    470  * input: RGBA-CL_UNSIGNED_INT16
    471  * output_gauss: RGBA-CL_UNSIGNED_INT8 ?
    472  * output_lap:RGBA-CL_UNSIGNED_INT16
    473  * each work-item calc 2 lines
    474  */
    475 __kernel void
    476 kernel_mask_gauss_scale_slm (
    477     __read_only image2d_t input,
    478     __write_only image2d_t output_gauss,
    479     int image_width
    480 #if ENABLE_MASK_GAUSS_SCALE
    481     , __write_only image2d_t output_scale
    482 #endif
    483 )
    484 {
    485 #define WI_LINES 2
    486 // input image width MUST < MASK_GAUSS_SLM_WIDTH*4
    487 #define MASK_GAUSS_SLM_WIDTH  256
    488 #define CONV_COEFF 128.0f
    489 
    490     int g_x = get_global_id (0);
    491     int g_y = get_global_id (1) * WI_LINES;
    492     const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
    493     __local ushort4 slm_gauss_y[WI_LINES][MASK_GAUSS_SLM_WIDTH];
    494 
    495     float8 result_cur[WI_LINES] = {zero8, zero8};
    496     float8 tmp_data;
    497     int i_line;
    498     int cur_g_y;
    499 
    500 #pragma unroll
    501     for (i_line = -MASK_GAUSS_R; i_line <= MASK_GAUSS_R + 1; i_line++) {
    502         cur_g_y = g_y + i_line;
    503         tmp_data = convert_float8(as_uchar8(convert_ushort4(read_imageui(input, sampler, (int2)(g_x, cur_g_y)))));
    504         result_cur[0] += tmp_data * mask_coeffs[i_line + MASK_COEFF_MID];
    505         result_cur[1] += tmp_data * mask_coeffs[i_line + MASK_COEFF_MID - 1];
    506     }
    507     ((__local ushort8*)(slm_gauss_y[0]))[g_x] = convert_ushort8(result_cur[0] * CONV_COEFF);
    508     ((__local ushort8*)(slm_gauss_y[1]))[g_x] = convert_ushort8(result_cur[1] * CONV_COEFF);
    509     barrier (CLK_LOCAL_MEM_FENCE);
    510 
    511     float8 final_g[WI_LINES];
    512     float4 result_pre;
    513     float4 result_next;
    514 
    515 #pragma unroll
    516     for (i_line = 0; i_line < WI_LINES; ++i_line) {
    517         result_pre = convert_float4(slm_gauss_y[i_line][clamp (g_x * 2 - 1, 0, image_width * 2)]) / CONV_COEFF;
    518         result_next = convert_float4(slm_gauss_y[i_line][clamp (g_x * 2 + 2, 0, image_width * 2)]) / CONV_COEFF;
    519         final_g[i_line] = result_cur[i_line] * mask_coeffs[MASK_COEFF_MID] +
    520                           (float8)(result_pre.s3, result_cur[i_line].s0123, result_cur[i_line].s456) *
    521                                   mask_coeffs[MASK_COEFF_MID + 1] +
    522                           (float8)(result_cur[i_line].s1234, result_cur[i_line].s567, result_next.s0) *
    523                                   mask_coeffs[MASK_COEFF_MID + 1] +
    524                           (float8)(result_pre.s23, result_cur[i_line].s0123, result_cur[i_line].s45) *
    525                                   mask_coeffs[MASK_COEFF_MID + 2] +
    526                           (float8)(result_cur[i_line].s2345, result_cur[i_line].s67, result_next.s01) *
    527                                   mask_coeffs[MASK_COEFF_MID + 2] +
    528                           (float8)(result_pre.s123, result_cur[i_line].s0123, result_cur[i_line].s4) *
    529                                   mask_coeffs[MASK_COEFF_MID + 3] +
    530                           (float8)(result_cur[i_line].s3456, result_cur[i_line].s7, result_next.s012) *
    531                                   mask_coeffs[MASK_COEFF_MID + 3] +
    532                           (float8)(result_pre.s0123, result_cur[i_line].s0123) * mask_coeffs[MASK_COEFF_MID + 4] +
    533                           (float8)(result_cur[i_line].s4567, result_next.s0123) * mask_coeffs[MASK_COEFF_MID + 4];
    534         final_g[i_line] = clamp (final_g[i_line] + 0.5f, 0.0f, 255.0f);
    535         //if ((g_x == 9 || g_x == 8) && g_y == 0) {
    536         //    printf ("(x:%d, y:0), pre:" ARG_FORMAT4 "cur" ARG_FORMAT8 "next" ARG_FORMAT4 "final:" ARG_FORMAT8 "\n",
    537         //        g_x, ARGS4(result_pre), ARGS8(result_cur[i_line]), ARGS4(result_next), ARGS8(final_g[i_line]));
    538         //}
    539         write_imageui (output_gauss, (int2)(g_x, g_y + i_line), convert_uint4(as_ushort4(convert_uchar8(final_g[i_line]))));
    540     }
    541 
    542 #if ENABLE_MASK_GAUSS_SCALE
    543     write_imageui (output_scale, (int2)(g_x, get_global_id (1)), convert_uint4(final_g[0].even));
    544 #endif
    545 }
    546 
    547 __kernel void
    548 kernel_mask_gauss_scale (
    549     __read_only image2d_t input,
    550     __write_only image2d_t output_gauss
    551 #if ENABLE_MASK_GAUSS_SCALE
    552     , __write_only image2d_t output_scale
    553 #endif
    554 )
    555 {
    556     int g_x = get_global_id (0);
    557     int in_x = g_x;
    558     int g_y = get_global_id (1) * 2;
    559     const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
    560 
    561     float8 result_pre[2] = {zero8, zero8};
    562     float8 result_next[2] = {zero8, zero8};
    563     float8 result_cur[2] = {zero8, zero8};
    564     float8 final_g[2];
    565 
    566     float8 tmp_data;
    567     int i_line;
    568     int cur_g_y;
    569     float coeff0, coeff1;
    570 
    571 #pragma unroll
    572     for (i_line = -MASK_GAUSS_R; i_line <= MASK_GAUSS_R + 1; i_line++) {
    573         cur_g_y = g_y + i_line;
    574         coeff0 = mask_coeffs[i_line + MASK_COEFF_MID];
    575         coeff1 = mask_coeffs[i_line + MASK_COEFF_MID - 1];
    576         tmp_data = convert_float8(as_uchar8(convert_ushort4(read_imageui(input, sampler, (int2)(in_x - 1, cur_g_y)))));
    577         result_pre[0] += tmp_data * coeff0;
    578         result_pre[1] += tmp_data * coeff1;
    579 
    580         tmp_data = convert_float8(as_uchar8(convert_ushort4(read_imageui(input, sampler, (int2)(in_x, cur_g_y)))));
    581         result_cur[0] += tmp_data * coeff0;
    582         result_cur[1] += tmp_data * coeff1;
    583         tmp_data = convert_float8(as_uchar8(convert_ushort4(read_imageui(input, sampler, (int2)(in_x + 1, cur_g_y)))));
    584         result_next[1] += tmp_data * coeff1;
    585         result_next[0] += tmp_data * coeff0;
    586     }
    587 
    588 #pragma unroll
    589     for (i_line = 0; i_line < 2; ++i_line) {
    590         final_g[i_line] = result_cur[i_line] * mask_coeffs[MASK_COEFF_MID] +
    591                           (float8)(result_pre[i_line].s7, result_cur[i_line].s0123, result_cur[i_line].s456) *
    592                                   mask_coeffs[MASK_COEFF_MID + 1] +
    593                           (float8)(result_cur[i_line].s1234, result_cur[i_line].s567, result_next[i_line].s0) *
    594                                   mask_coeffs[MASK_COEFF_MID + 1] +
    595                           (float8)(result_pre[i_line].s67, result_cur[i_line].s0123, result_cur[i_line].s45) *
    596                                   mask_coeffs[MASK_COEFF_MID + 2] +
    597                           (float8)(result_cur[i_line].s2345, result_cur[i_line].s67, result_next[i_line].s01) *
    598                                   mask_coeffs[MASK_COEFF_MID + 2] +
    599                           (float8)(result_pre[i_line].s567, result_cur[i_line].s0123, result_cur[i_line].s4) *
    600                                   mask_coeffs[MASK_COEFF_MID + 3] +
    601                           (float8)(result_cur[i_line].s3456,result_cur[i_line].s7, result_next[i_line].s012) *
    602                                   mask_coeffs[MASK_COEFF_MID + 3] +
    603                           (float8)(result_pre[i_line].s4567, result_cur[i_line].s0123) * mask_coeffs[MASK_COEFF_MID + 4] +
    604                           (float8)(result_cur[i_line].s4567, result_next[i_line].s0123) * mask_coeffs[MASK_COEFF_MID + 4];
    605         final_g[i_line] = clamp (final_g[i_line] + 0.5f, 0.0f, 255.0f);
    606         write_imageui (output_gauss, (int2)(g_x, g_y + i_line), convert_uint4(as_ushort4(convert_uchar8(final_g[i_line]))));
    607     }
    608 
    609 #if ENABLE_MASK_GAUSS_SCALE
    610     write_imageui (output_scale, (int2)(g_x, get_global_id (1)), convert_uint4(final_g[0].even));
    611 #endif
    612 
    613 }
    614 
    615