Home | History | Annotate | Download | only in cl_kernel
      1 /*
      2  * function: kernel_bayer_copy
      3  *     sample code of default kernel arguments
      4  * input:    image2d_t as read only
      5  * output:   image2d_t as write only
      6  */
      7 
      8 //#define ENABLE_IMAGE_2D_INPUT 0
      9 
     10 #ifndef STATS_BITS
     11 #define STATS_BITS 8
     12 #endif
     13 
     14 /*
     15  * GROUP_PIXEL_X_SIZE = 2 * GROUP_CELL_X_SIZE
     16  * GROUP_PIXEL_Y_SIZE = 2 * GROUP_CELL_Y_SIZE
     17 */
     18 
     19 #define GROUP_CELL_X_SIZE 64
     20 #define GROUP_CELL_Y_SIZE 4
     21 
     22 //float4; 16
     23 #define SLM_X_SIZE (GROUP_CELL_X_SIZE / 4)
     24 #define SLM_Y_SIZE GROUP_CELL_Y_SIZE
     25 
     26 #define STATS_3A_CELL_X_SIZE 8
     27 #define STATS_3A_CELL_Y_SIZE GROUP_CELL_Y_SIZE
     28 
     29 typedef struct  {
     30     float  level_gr;  /* Black level for GR pixels */
     31     float  level_r;   /* Black level for R pixels */
     32     float  level_b;   /* Black level for B pixels */
     33     float  level_gb;  /* Black level for GB pixels */
     34     uint   color_bits;
     35 } CLBLCConfig;
     36 
     37 
     38 typedef struct
     39 {
     40     float r_gain;
     41     float gr_gain;
     42     float gb_gain;
     43     float b_gain;
     44 } CLWBConfig;
     45 
     46 inline int slm_pos (const int x, const int y)
     47 {
     48     return mad24 (y, SLM_X_SIZE, x);
     49 }
     50 
     51 inline void gamma_correct(float8 *in_out, __global float *table)
     52 {
     53     in_out->s0 = table[clamp(convert_int(in_out->s0 * 255.0f), 0, 255)];
     54     in_out->s1 = table[clamp(convert_int(in_out->s1 * 255.0f), 0, 255)];
     55     in_out->s2 = table[clamp(convert_int(in_out->s2 * 255.0f), 0, 255)];
     56     in_out->s3 = table[clamp(convert_int(in_out->s3 * 255.0f), 0, 255)];
     57     in_out->s4 = table[clamp(convert_int(in_out->s4 * 255.0f), 0, 255)];
     58     in_out->s5 = table[clamp(convert_int(in_out->s5 * 255.0f), 0, 255)];
     59     in_out->s6 = table[clamp(convert_int(in_out->s6 * 255.0f), 0, 255)];
     60     in_out->s7 = table[clamp(convert_int(in_out->s7 * 255.0f), 0, 255)];
     61 }
     62 
     63 inline float avg_float8 (float8 data)
     64 {
     65     return (data.s0 + data.s1 + data.s2 + data.s3 + data.s4 + data.s5 + data.s6 + data.s7) * 0.125f;
     66 }
     67 
     68 inline void stats_3a_calculate (
     69     __local float4 * slm_gr,
     70     __local float4 * slm_r,
     71     __local float4 * slm_b,
     72     __local float4 * slm_gb,
     73     __global ushort8 * stats_output,
     74     CLWBConfig *wb_config)
     75 {
     76     const int group_x_size = get_num_groups (0);
     77     const int group_id_x = get_group_id (0);
     78     const int group_id_y = get_group_id (1);
     79 
     80     const int l_id_x = get_local_id (0);
     81     const int l_id_y = get_local_id (1);
     82     const int l_size_x = get_local_size (0);
     83     const int stats_float4_x_count = STATS_3A_CELL_X_SIZE / 4;
     84     int count =  stats_float4_x_count * STATS_3A_CELL_Y_SIZE / 4;
     85 
     86     int index = mad24 (l_id_y, l_size_x, l_id_x);
     87     int index_x = index % SLM_X_SIZE;
     88     int index_y = index / SLM_X_SIZE;
     89 
     90     if (mad24 (index_y,  stats_float4_x_count, index_x % stats_float4_x_count) < count) {
     91         int pitch_count = count / stats_float4_x_count * SLM_X_SIZE;
     92         int index1 = index + pitch_count;
     93         int index2 = index1 + pitch_count;
     94         int index3 = index2 + pitch_count;
     95         slm_gr[index] = (slm_gr[index] + slm_gr[index1] + slm_gr[index2] + slm_gr[index3]) * 0.25f;
     96         slm_r[index] = (slm_r[index] + slm_r[index1] + slm_r[index2] + slm_r[index3]) * 0.25f;
     97         slm_b[index] = (slm_b[index] + slm_b[index1] + slm_b[index2] + slm_b[index3]) * 0.25f;
     98         slm_gb[index] = (slm_gb[index] + slm_gb[index1] + slm_gb[index2] + slm_gb[index3]) * 0.25f;
     99     }
    100     barrier (CLK_LOCAL_MEM_FENCE);
    101 
    102     if (index < SLM_X_SIZE / 2) {
    103         float result_gr, result_r, result_b, result_gb, avg_y;
    104         float8 tmp;
    105         tmp = ((__local float8*)slm_gr)[index];
    106         result_gr = avg_float8 (tmp);
    107 
    108         tmp = ((__local float8*)slm_r)[index];
    109         result_r = avg_float8 (tmp);
    110 
    111         tmp = ((__local float8*)slm_b)[index];
    112         result_b = avg_float8 (tmp);
    113 
    114         tmp = ((__local float8*)slm_gb)[index];
    115         result_gb = avg_float8 (tmp);
    116 
    117         int out_index = mad24 (mad24 (group_id_y, group_x_size, group_id_x),
    118                                (GROUP_CELL_X_SIZE / STATS_3A_CELL_X_SIZE) * (GROUP_CELL_Y_SIZE / STATS_3A_CELL_Y_SIZE),
    119                                index);
    120 
    121 #if STATS_BITS==8
    122         avg_y = mad ((result_gr * wb_config->gr_gain + result_gb * wb_config->gb_gain), 74.843f,
    123                      mad (result_r * wb_config->r_gain, 76.245f, result_b * 29.070f));
    124 
    125         //ushort avg_y; avg_r; avg_gr; avg_gb; avg_b; valid_wb_count; f_value1; f_value2;
    126         stats_output[out_index] = (ushort8) (
    127                                       convert_ushort (convert_uchar_sat (avg_y)),
    128                                       convert_ushort (convert_uchar_sat (result_r * 255.0f)),
    129                                       convert_ushort (convert_uchar_sat (result_gr * 255.0f)),
    130                                       convert_ushort (convert_uchar_sat (result_gb * 255.0f)),
    131                                       convert_ushort (convert_uchar_sat (result_b * 255.0f)),
    132                                       STATS_3A_CELL_X_SIZE * STATS_3A_CELL_Y_SIZE,
    133                                       0,
    134                                       0);
    135 #elif STATS_BITS==12
    136         avg_y = mad ((result_gr * wb_config->gr_gain + result_gb * wb_config->gb_gain), 1201.883f,
    137                      mad (result_r * wb_config->r_gain, 1224.405f, result_b * 466.830f));
    138 
    139         stats_output[out_index] = (ushort8) (
    140                                       convert_ushort (clamp (avg_y, 0.0f, 4095.0f)),
    141                                       convert_ushort (clamp (result_r * 4096.0f, 0.0f, 4095.0f)),
    142                                       convert_ushort (clamp (result_gr * 4096.0f, 0.0f, 4095.0f)),
    143                                       convert_ushort (clamp (result_gb * 4096.0f, 0.0f, 4095.0f)),
    144                                       convert_ushort (clamp (result_b * 4096.0f, 0.0f, 4095.0f)),
    145                                       STATS_3A_CELL_X_SIZE * STATS_3A_CELL_Y_SIZE,
    146                                       0,
    147                                       0);
    148 #else
    149         printf ("kernel 3a-stats error, wrong bit depth:%d\n", STATS_BITS);
    150 #endif
    151     }
    152 }
    153 
    154 
    155 __kernel void kernel_bayer_basic (
    156 #if ENABLE_IMAGE_2D_INPUT
    157     __read_only image2d_t input,
    158 #else
    159     __global const ushort8 *input,
    160 #endif
    161     uint input_aligned_width,
    162     __write_only image2d_t output,
    163     uint out_height,
    164     CLBLCConfig blc_config,
    165     CLWBConfig wb_config,
    166     __global float *gamma_table,
    167     __global ushort8 *stats_output
    168 )
    169 {
    170     int g_x = get_global_id (0);
    171     int g_y = get_global_id (1);
    172 
    173     const int l_x = get_local_id (0);
    174     const int l_y = get_local_id (1);
    175     const int l_x_size = get_local_size (0);
    176     const int l_y_size = get_local_size (1);
    177     const int group_id_x = get_group_id (0);
    178     const int group_id_y = get_group_id (1);
    179 
    180     sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
    181 
    182     int index = mad24 (l_y, l_x_size, l_x);
    183     int x_cell_start = (GROUP_CELL_X_SIZE / 4) * group_id_x;
    184     int y_cell_start = GROUP_CELL_Y_SIZE * group_id_y;
    185     int x, y;
    186 
    187     float blc_multiplier = (float)(1 << (16 - blc_config.color_bits));
    188 
    189     __local float4 slm_gr[SLM_X_SIZE * SLM_Y_SIZE], slm_r[SLM_X_SIZE * SLM_Y_SIZE], slm_b[SLM_X_SIZE * SLM_Y_SIZE], slm_gb[SLM_X_SIZE * SLM_Y_SIZE];
    190 
    191     for (; index < SLM_X_SIZE * SLM_Y_SIZE; index += l_x_size * l_y_size) {
    192         float8 line1;
    193         float8 line2;
    194 
    195         x = index % SLM_X_SIZE + x_cell_start;
    196         y = index / SLM_X_SIZE + y_cell_start;
    197 
    198 #if ENABLE_IMAGE_2D_INPUT
    199         line1 = convert_float8 (as_ushort8 (read_imageui(input, sampler, (int2)(x, y * 2)))) / 65536.0f;
    200         line2 = convert_float8 (as_ushort8 (read_imageui(input, sampler, (int2)(x, y * 2 + 1)))) / 65536.0f;
    201 #else
    202         line1 = convert_float8 (input [y * 2 * input_aligned_width + x]) / 65536.0f;
    203         line2 = convert_float8 (input [(y * 2 + 1) * input_aligned_width + x]) / 65536.0f;
    204 #endif
    205 
    206         float4 gr = mad (line1.even, blc_multiplier, - blc_config.level_gr);
    207         float4 r = mad (line1.odd, blc_multiplier, - blc_config.level_r);
    208         float4 b = mad (line2.even, blc_multiplier, - blc_config.level_b);
    209         float4 gb = mad (line2.odd, blc_multiplier, - blc_config.level_gb);
    210 
    211         slm_gr[index] = gr;
    212         slm_r[index] =  r;
    213         slm_b[index] =  b;
    214         slm_gb[index] = gb;
    215     }
    216     barrier(CLK_LOCAL_MEM_FENCE);
    217 
    218     float8 data_gr, data_r, data_b, data_gb;
    219     index = mad24 (l_y, l_x_size, l_x);
    220     x = mad24 (GROUP_CELL_X_SIZE / 8, group_id_x,  index % (SLM_X_SIZE / 2));
    221     y = mad24 (GROUP_CELL_Y_SIZE, group_id_y,  index / (SLM_X_SIZE / 2));
    222 
    223     data_gr = ((__local float8*)slm_gr)[index];
    224     data_gr = data_gr * wb_config.gr_gain;
    225 
    226     data_r = ((__local float8*)slm_r)[index];
    227     data_r  = data_r * wb_config.r_gain;
    228 
    229     data_b = ((__local float8*)slm_b)[index];
    230     data_b = data_b * wb_config.b_gain;
    231 
    232     data_gb = ((__local float8*)slm_gb)[index];
    233     data_gb = data_gb * wb_config.gb_gain;
    234 
    235 #if ENABLE_GAMMA
    236     gamma_correct (&data_gr, gamma_table);
    237     gamma_correct (&data_r, gamma_table);
    238     gamma_correct (&data_b, gamma_table);
    239     gamma_correct (&data_gb, gamma_table);
    240 #endif
    241 
    242 #if 0
    243     if (x % 16 == 0 && y % 16 == 0) {
    244         uint8 value = convert_uint8(convert_uchar8_sat(data_gr * 255.0f));
    245         printf ("(x:%d, y:%d) (blc.bit:%d, level:%d) (wb.gr:%f)=> (%d, %d, %d, %d, %d, %d, %d, %d)\n",
    246                 x * 8, y,
    247                 blc_config.color_bits, convert_uint(blc_config.level_gr * 255.0f),
    248                 wb_config.gr_gain,
    249                 value.s0, value.s1, value.s2, value.s3, value.s4, value.s5, value.s6, value.s7);
    250     }
    251 #endif
    252 
    253     write_imageui (output, (int2)(x, y), as_uint4 (convert_ushort8 (data_gr * 65536.0f)));
    254     write_imageui (output, (int2)(x, y + out_height), as_uint4 (convert_ushort8 (data_r * 65536.0f)));
    255     write_imageui (output, (int2)(x, y + out_height * 2), as_uint4 (convert_ushort8 (data_b * 65536.0f)));
    256     write_imageui (output, (int2)(x, y + out_height * 3), as_uint4 (convert_ushort8 (data_gb * 65536.0f)));
    257 
    258     stats_3a_calculate (slm_gr, slm_r, slm_b, slm_gb, stats_output, &wb_config);
    259 }
    260 
    261