Home | History | Annotate | Download | only in arm
      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 @*******************************************************************************
     20 @* @file
     21 @*  ihevc_sao_band_offset_chroma.s
     22 @*
     23 @* @brief
     24 @*  Contains the function definition for SAO band offset of interleaved
     25 @* chroma. The function is coded using NEON instructions and can be compiled
     26 @* using ARM RVCT
     27 @*
     28 @* @author
     29 @*  Parthiban V
     30 @*
     31 @* @par List of Functions:
     32 @*  - ihevc_sao_band_offset_chroma_a9q()
     33 @*
     34 @* @remarks
     35 @*  None
     36 @*
     37 @*******************************************************************************
     38 @*/
     39 @void ihevc_sao_band_offset_chroma(UWORD8 *pu1_src,
     40 @                           WORD32 src_strd,
     41 @                           UWORD8 *pu1_src_left,
     42 @                           UWORD8 *pu1_src_top,
     43 @                           UWORD8 *pu1_src_top_left,
     44 @                           WORD32 sao_band_pos_u,
     45 @                           WORD32 sao_band_pos_v,
     46 @                           WORD8 *pi1_sao_offset_u,
     47 @                           WORD8 *pi1_sao_offset_v,
     48 @                           WORD32 wd,
     49 @                           WORD32 ht)
     50 @
     51 @**************Variables Vs Registers*****************************************
     52 @r0 =>  *pu1_src
     53 @r1 =>  src_strd
     54 @r2 =>  *pu1_src_left
     55 @r3 =>  *pu1_src_top
     56 @r4 =>  *pu1_src_top_left
     57 @r5 =>  sao_band_pos_u
     58 @r6 =>  sao_band_pos_v
     59 @r7 =>  *pi1_sao_offset_u
     60 @r8 =>  *pi1_sao_offset_v
     61 @r9 =>  wd
     62 @r10=>  ht
     63 
     64 .text
     65 .p2align 2
     66 
     67 .extern gu1_table_band_idx
     68 .globl ihevc_sao_band_offset_chroma_a9q
     69 
     70 gu1_table_band_idx_addr_1:
     71 .long gu1_table_band_idx - ulbl1 - 8
     72 
     73 gu1_table_band_idx_addr_2:
     74 .long gu1_table_band_idx - ulbl2 - 8
     75 
     76 ihevc_sao_band_offset_chroma_a9q:
     77 
     78     STMFD       sp!, {r4-r12, r14}          @stack stores the values of the arguments
     79     LDR         r4,[sp,#40]                 @Loads pu1_src_top_left
     80     LDR         r10,[sp,#64]                @Loads ht
     81 
     82     LDR         r9,[sp,#60]                 @Loads wd
     83     MOV         r11,r10                     @Move the ht to r9 for loop counter
     84 
     85     ADD         r12,r0,r9                   @pu1_src[row * src_strd + (wd)]
     86     LDR         r14, gu1_table_band_idx_addr_1
     87 ulbl1:
     88     add         r14,r14,pc
     89     SUB         r12,r12,#2                  @wd-2
     90 
     91 SRC_LEFT_LOOP:
     92     LDRH        r5,[r12],r1                 @Load the value
     93     SUBS        r11,r11,#1                  @Decrement the loop counter
     94     STRH        r5,[r2],#2                  @Store the value in pu1_src_left pointer
     95     BNE         SRC_LEFT_LOOP
     96 
     97     LDR         r5,[sp,#44]                 @Loads sao_band_pos_u
     98     VLD1.8      D1,[r14]!                   @band_table_u.val[0]
     99     ADD         r12,r3,r9                   @pu1_src_top[wd]
    100 
    101     LDRH        r11,[r12,#-2]
    102     VLD1.8      D2,[r14]!                   @band_table_u.val[1]
    103     LSL         r6,r5,#3                    @sao_band_pos_u
    104 
    105     STRH        r11,[r4]                    @store to pu1_src_top_left[0]
    106     VLD1.8      D3,[r14]!                   @band_table_u.val[2]
    107     LDR         r7,[sp,#52]                 @Loads pi1_sao_offset_u
    108 
    109     SUB         r4,r10,#1                   @ht-1
    110     VDUP.8      D31,r6                      @band_pos_u
    111     MUL         r4,r4,r1                    @ht-1 * src_strd
    112 
    113     ADD         r4,r4,r0                    @pu1_src[(ht - 1) * src_strd]
    114     VLD1.8      D4,[r14]!                   @band_table_u.val[3]
    115     MOV         r11,r9                      @Move the wd to r9 for loop counter
    116 
    117 SRC_TOP_LOOP:                               @wd is always multiple of 8
    118     VLD1.8      D0,[r4]!                    @Load pu1_src[(ht - 1) * src_strd + col]
    119     SUBS        r11,r11,#8                  @Decrement the loop counter by 8
    120     VST1.8      D0,[r3]!                    @Store to pu1_src_top[col]
    121     BNE         SRC_TOP_LOOP
    122 
    123     VLD1.8      D30,[r7]                    @pi1_sao_offset_u load
    124     VADD.I8     D5,D1,D31                   @band_table_u.val[0] = vadd_u8(band_table_u.val[0], sao_band_pos_u)
    125 
    126     VDUP.8      D29,D30[1]                  @vdup_n_u8(pi1_sao_offset_u[1])
    127     VADD.I8     D6,D2,D31                   @band_table_u.val[1] = vadd_u8(band_table_u.val[1], sao_band_pos_u)
    128 
    129     VDUP.8      D28,D30[2]                  @vdup_n_u8(pi1_sao_offset_u[2])
    130     VADD.I8     D7,D3,D31                   @band_table_u.val[2] = vadd_u8(band_table_u.val[2], sao_band_pos_u)
    131 
    132     VDUP.8      D27,D30[3]                  @vdup_n_u8(pi1_sao_offset_u[3])
    133     VADD.I8     D8,D4,D31                   @band_table_u.val[3] = vadd_u8(band_table_u.val[3], sao_band_pos_u)
    134 
    135     CMP         r5,#28
    136     VDUP.8      D26,D30[4]                  @vdup_n_u8(pi1_sao_offset_u[4])
    137     LDR         r14, gu1_table_band_idx_addr_2
    138 ulbl2:
    139     add         r14,r14,pc
    140 
    141     VMOV.I8     D30,#16                     @vdup_n_u8(16)
    142     VADD.I8     D1,D5,D29                   @band_table_u.val[0] = vadd_u8(band_table_u.val[0], vdup_n_u8(pi1_sao_offset_u[1]))
    143 
    144     VLD1.8      D9,[r14]!                   @band_table_v.val[0]
    145     VADD.I8     D2,D6,D28                   @band_table_u.val[1] = vadd_u8(band_table_u.val[1], vdup_n_u8(pi1_sao_offset_u[2]))
    146 
    147     VLD1.8      D10,[r14]!                  @band_table_v.val[1]
    148     VADD.I8     D3,D7,D27                   @band_table_u.val[2] = vadd_u8(band_table_u.val[2], vdup_n_u8(pi1_sao_offset_u[3]))
    149 
    150     LDR         r6,[sp,#48]                 @Loads sao_band_pos_v
    151     VADD.I8     D4,D8,D26                   @band_table_u.val[3] = vadd_u8(band_table_u.val[3], vdup_n_u8(pi1_sao_offset_u[4]))
    152     LSL         r11,r6,#3                   @sao_band_pos_v
    153 
    154     BLT         SAO_BAND_POS_U_0
    155 
    156 SAO_BAND_POS_U_28:                          @case 28
    157     VCLE.U8     D13,D4,D30                  @vcle_u8(band_table.val[3], vdup_n_u8(16))
    158     BNE         SAO_BAND_POS_U_29
    159 
    160     VORR.U8     D4,D4,D13                   @band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp)
    161     B           SWITCH_BREAK_U
    162 
    163 SAO_BAND_POS_U_29:                          @case 29
    164     CMP         r5,#29
    165 
    166     VCLE.U8     D14,D3,D30                  @vcle_u8(band_table.val[2], vdup_n_u8(16))
    167     BNE         SAO_BAND_POS_U_30
    168     VORR.U8     D3,D3,D14                   @band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp)
    169 
    170     VAND.U8     D4,D4,D13                   @band_table.val[3] = vand_u8(band_table.val[3], au1_cmp)
    171     B           SWITCH_BREAK_U
    172 
    173 SAO_BAND_POS_U_30:                          @case 30
    174     CMP         r5,#30
    175 
    176     VCLE.U8     D15,D2,D30                  @vcle_u8(band_table.val[1], vdup_n_u8(16))
    177     BNE         SAO_BAND_POS_U_31
    178     VORR.U8     D2,D2,D15                   @band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp)
    179 
    180     VAND.U8     D3,D3,D14                   @band_table.val[2] = vand_u8(band_table.val[2], au1_cmp)
    181 
    182 SAO_BAND_POS_U_31:                          @case 31
    183     CMP         r5,#31
    184     BNE         SWITCH_BREAK_U
    185 
    186     VCLE.U8     D16,D1,D30                  @vcle_u8(band_table.val[0], vdup_n_u8(16))
    187     VORR.U8     D1,D1,D16                   @band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp)
    188 
    189     VAND.U8     D2,D2,D15                   @band_table.val[1] = vand_u8(band_table.val[1], au1_cmp)
    190     B           SWITCH_BREAK_U
    191 
    192 SAO_BAND_POS_U_0:
    193     CMP         r5,#0                       @case 0
    194     BNE         SWITCH_BREAK_U
    195 
    196     VCLE.U8     D16,D1,D30                  @vcle_u8(band_table.val[0], vdup_n_u8(16))
    197     VAND.U8     D1,D1,D16                   @band_table.val[0] = vand_u8(band_table.val[0], au1_cmp)
    198 
    199 SWITCH_BREAK_U:
    200     VDUP.8      D30,r11                     @band_pos_v
    201     LDR         r8,[sp,#56]                 @Loads pi1_sao_offset_v
    202 
    203     VLD1.8      D11,[r14]!                  @band_table_v.val[2]
    204     VADD.I8     D13,D9,D30                  @band_table_v.val[0] = vadd_u8(band_table_v.val[0], band_pos_v)
    205 
    206     VLD1.8      D12,[r14]!                  @band_table_v.val[3]
    207     VADD.I8     D14,D10,D30                 @band_table_v.val[1] = vadd_u8(band_table_v.val[1], band_pos_v)
    208 
    209     VLD1.8      D25,[r8]                    @pi1_sao_offset_v load
    210     VADD.I8     D15,D11,D30                 @band_table_v.val[2] = vadd_u8(band_table_v.val[2], band_pos_v)
    211 
    212     VDUP.8      D29,D25[1]                  @vdup_n_u8(pi1_sao_offset_v[1])
    213     VADD.I8     D16,D12,D30                 @band_table_v.val[3] = vadd_u8(band_table_v.val[3], band_pos_v)
    214 
    215     VDUP.8      D28,D25[2]                  @vdup_n_u8(pi1_sao_offset_v[2])
    216     VADD.I8     D9,D13,D29                  @band_table_v.val[0] = vadd_u8(band_table_v.val[0], vdup_n_u8(pi1_sao_offset_v[1]))
    217 
    218     VDUP.8      D27,D25[3]                  @vdup_n_u8(pi1_sao_offset_v[3])
    219     VADD.I8     D10,D14,D28                 @band_table_v.val[1] = vadd_u8(band_table_v.val[1], vdup_n_u8(pi1_sao_offset_v[2]))
    220 
    221     VDUP.8      D26,D25[4]                  @vdup_n_u8(pi1_sao_offset_v[4])
    222     VADD.I8     D11,D15,D27                 @band_table_v.val[2] = vadd_u8(band_table_v.val[2], vdup_n_u8(pi1_sao_offset_v[3]))
    223 
    224     VMOV.I8     D29,#16                     @vdup_n_u8(16)
    225     VADD.I8     D12,D16,D26                 @band_table_v.val[3] = vadd_u8(band_table_v.val[3], vdup_n_u8(pi1_sao_offset_v[4]))
    226     AND         r12,r9,#0xf
    227 
    228     CMP         r6,#28
    229     BLT         SAO_BAND_POS_V_0
    230 
    231 SAO_BAND_POS_V_28:                          @case 28
    232     VCLE.U8     D17,D12,D29                 @vcle_u8(band_table.val[3], vdup_n_u8(16))
    233     BNE         SAO_BAND_POS_V_29
    234     VORR.U8     D12,D12,D17                 @band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp)
    235     B           SWITCH_BREAK_V
    236 
    237 SAO_BAND_POS_V_29:                          @case 29
    238     CMP         r6,#29
    239 
    240     VCLE.U8     D18,D11,D29                 @vcle_u8(band_table.val[2], vdup_n_u8(16))
    241     BNE         SAO_BAND_POS_V_30
    242     VORR.U8     D11,D11,D18                 @band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp)
    243 
    244     VAND.U8     D12,D12,D17                 @band_table.val[3] = vand_u8(band_table.val[3], au1_cmp)
    245     B           SWITCH_BREAK_V
    246 
    247 SAO_BAND_POS_V_30:                          @case 30
    248     CMP         r6,#30
    249 
    250     VCLE.U8     D19,D10,D29                 @vcle_u8(band_table.val[1], vdup_n_u8(16))
    251     BNE         SAO_BAND_POS_V_31
    252     VORR.U8     D10,D10,D19                 @band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp)
    253 
    254     VAND.U8     D11,D11,D18                 @band_table.val[2] = vand_u8(band_table.val[2], au1_cmp)
    255     B           SWITCH_BREAK_V
    256 
    257 SAO_BAND_POS_V_31:                          @case 31
    258     CMP         r6,#31
    259     BNE         SWITCH_BREAK_V
    260 
    261     VCLE.U8     D20,D9,D29                  @vcle_u8(band_table.val[0], vdup_n_u8(16))
    262     VORR.U8     D9,D9,D20                   @band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp)
    263 
    264     VAND.U8     D10,D10,D19                 @band_table.val[1] = vand_u8(band_table.val[1], au1_cmp)
    265     B           SWITCH_BREAK_V
    266 
    267 SAO_BAND_POS_V_0:
    268     CMP         r6,#0                       @case 0
    269     BNE         SWITCH_BREAK_V
    270 
    271     VCLE.U8     D20,D9,D29                  @vcle_u8(band_table.val[0], vdup_n_u8(16))
    272     VAND.U8     D9,D9,D20                   @band_table.val[0] = vand_u8(band_table.val[0], au1_cmp)
    273 
    274 SWITCH_BREAK_V:
    275     CMP         r9,#16
    276     MOV         r4,r0                       @pu1_src_cpy
    277     BLT         WIDTH_RESIDUE
    278 
    279 WIDTH_LOOP:                                 @Width is assigned to be multiple of 16
    280     MOV         r4,r0                       @pu1_src_cpy
    281     MOV         r11,r10                     @move ht
    282     ADD         r5,r4,r1
    283 
    284 HEIGHT_LOOP:                                @unrolled for 4 rows
    285     ADD         r6,r5,r1
    286     VLD2.8      {D5,D6},[r4]                @vld1q_u8(pu1_src_cpy)
    287     ADD         r7,r6,r1
    288 
    289     VLD2.8      {D13,D14},[r5]              @vld1q_u8(pu1_src_cpy)
    290     VSUB.I8     D7,D5,D31                   @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
    291 
    292     VLD2.8      {D17,D18},[r6]              @vld1q_u8(pu1_src_cpy)
    293     VSUB.I8     D8,D6,D30                   @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
    294 
    295     VLD2.8      {D21,D22},[r7]              @vld1q_u8(pu1_src_cpy)
    296     VSUB.I8     D15,D13,D31                 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
    297 
    298     VTBX.8      D5,{D1-D4},D7               @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
    299     VSUB.I8     D16,D14,D30                 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
    300 
    301     VTBX.8      D6,{D9-D12},D8              @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
    302     VSUB.I8     D19,D17,D31                 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
    303 
    304     VTBX.8      D13,{D1-D4},D15             @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
    305     VSUB.I8     D20,D18,D30                 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
    306 
    307     VTBX.8      D14,{D9-D12},D16            @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
    308     VSUB.I8     D23,D21,D31                 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
    309 
    310     VST2.8      {D5,D6},[r4]                @vst1q_u8(pu1_src_cpy, au1_cur_row)
    311     VSUB.I8     D24,D22,D30                 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
    312 
    313     SUBS        r11,r11,#4                  @Decrement the ht loop count by 4
    314     VTBX.8      D17,{D1-D4},D19             @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
    315 
    316     VST2.8      {D13,D14},[r5]              @vst1q_u8(pu1_src_cpy, au1_cur_row)
    317 
    318     VTBX.8      D18,{D9-D12},D20            @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
    319     VTBX.8      D21,{D1-D4},D23             @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
    320     VTBX.8      D22,{D9-D12},D24            @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
    321 
    322     VST2.8      {D17,D18},[r6],r1           @vst1q_u8(pu1_src_cpy, au1_cur_row)
    323 
    324     ADD         r4,r6,r1
    325     VST2.8      {D21,D22},[r7]              @vst1q_u8(pu1_src_cpy, au1_cur_row)
    326     ADD         r5,r4,r1
    327 
    328     BNE         HEIGHT_LOOP
    329 
    330     SUB         r9,r9,#16                   @Decrement the width loop by 16
    331     ADD         r0,r0,#16
    332     CMP         r9,#8
    333     BGT         WIDTH_LOOP
    334     BLT         END_LOOP
    335     MOV         r4,r0                       @pu1_src_cpy
    336 
    337 WIDTH_RESIDUE:                              @If width is not multiple of 16
    338     ADD         r5,r4,r1
    339     VLD2.8      {D5,D6},[r4]                @vld1q_u8(pu1_src_cpy)
    340     ADD         r6,r5,r1
    341 
    342     ADD         r7,r6,r1
    343     VLD2.8      {D13,D14},[r5]              @vld1q_u8(pu1_src_cpy)
    344     VSUB.I8     D7,D5,D31                   @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
    345 
    346     VLD2.8      {D17,D18},[r6]              @vld1q_u8(pu1_src_cpy)
    347     VSUB.I8     D8,D6,D30                   @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
    348 
    349     VTBX.8      D5,{D1-D4},D7               @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
    350     VSUB.I8     D15,D13,D31                 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
    351 
    352     VTBX.8      D6,{D9-D12},D8              @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
    353     VSUB.I8     D16,D14,D30                 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
    354 
    355     VLD2.8      {D21,D22},[r7]              @vld1q_u8(pu1_src_cpy)
    356     VSUB.I8     D19,D17,D31                 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
    357 
    358     VTBX.8      D13,{D1-D4},D15             @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
    359     VSUB.I8     D20,D18,D30                 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
    360 
    361     VTBX.8      D14,{D9-D12},D16            @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
    362     VZIP.8      D5,D6
    363 
    364     VTBX.8      D17,{D1-D4},D19             @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
    365     VSUB.I8     D23,D21,D31                 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
    366 
    367     VST1.8      {D5},[r4]                   @vst1q_u8(pu1_src_cpy, au1_cur_row)
    368     VZIP.8      D13,D14
    369 
    370     VTBX.8      D18,{D9-D12},D20            @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
    371     VSUB.I8     D24,D22,D30                 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
    372 
    373     VST1.8      {D13},[r5]                  @vst1q_u8(pu1_src_cpy, au1_cur_row)
    374     SUBS        r10,r10,#4                  @Decrement the ht loop count by 4
    375 
    376     VTBX.8      D21,{D1-D4},D23             @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
    377     VZIP.8      D17,D18
    378 
    379     VTBX.8      D22,{D9-D12},D24            @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
    380     VST1.8      {D17},[r6],r1               @vst1q_u8(pu1_src_cpy, au1_cur_row)
    381     VZIP.8      D21,D22
    382 
    383     ADD         r4,r6,r1
    384     VST1.8      {D21},[r7]                  @vst1q_u8(pu1_src_cpy, au1_cur_row)
    385     ADD         r5,r4,r1
    386 
    387     BNE         WIDTH_RESIDUE
    388 
    389 END_LOOP:
    390     LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP
    391 
    392 
    393 
    394