Home | History | Annotate | Download | only in arm
      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 @*******************************************************************************
      20 @* @file
      21 @*  ihevc_sao_band_offset_luma.s
      22 @*
      23 @* @brief
      24 @*  Contains the function definition for the SAO (sample adaptive offset)
      25 @* band offset filter for luma. The function is written in ARM NEON
      26 @* assembly using GNU assembler syntax.
      27 @*
      28 @* @author
      29 @*  Parthiban V
      30 @*
      31 @* @par List of Functions:
      32 @*  - ihevc_sao_band_offset_luma_a9q()
      33 @*
      34 @* @remarks
      35 @*  None
     36 @*
     37 @*******************************************************************************
     38 @*/
     39 @void ihevc_sao_band_offset_luma(UWORD8 *pu1_src,
     40 @                           WORD32 src_strd,
     41 @                           UWORD8 *pu1_src_left,
     42 @                           UWORD8 *pu1_src_top,
     43 @                           UWORD8 *pu1_src_top_left,
     44 @                           WORD32 sao_band_pos,
     45 @                           WORD8 *pi1_sao_offset,
     46 @                           WORD32 wd,
     47 @                           WORD32 ht)
     48 @
     49 @**************Variables Vs Registers*****************************************
     50 @r0 =>  *pu1_src
     51 @r1 =>  src_strd
     52 @r2 =>  *pu1_src_left
     53 @r3 =>  *pu1_src_top
     54 @r4 =>  *pu1_src_top_left
     55 @r5 =>  sao_band_pos
     56 @r6 =>  *pi1_sao_offset
     57 @r7 =>  wd
     58 @r8 =>  ht
     59 
     60 .text
     61 .p2align 2
     62 
     63 .extern gu1_table_band_idx
     64 .globl ihevc_sao_band_offset_luma_a9q
     65 
     66 gu1_table_band_idx_addr:
     67 .long gu1_table_band_idx - ulbl1 - 8
     68 
     69 ihevc_sao_band_offset_luma_a9q:
     70 
     71     STMFD       sp!, {r4-r12, r14}          @stack stores the values of the arguments
     72 
     73     LDR         r8,[sp,#56]                 @Loads ht
     74     LDR         r7,[sp,#52]                 @Loads wd
     75 
     76     MOV         r9,r8                       @Move the ht to r9 for loop counter
     77     LDR         r5,[sp,#44]                 @Loads sao_band_pos
     78     ADD         r10,r0,r7                   @pu1_src[row * src_strd + (wd)]
     79 
     80     LDR         r4,[sp,#40]                 @Loads pu1_src_top_left
     81     SUB         r10,r10,#1                  @wd-1
     82     LDR         r14, gu1_table_band_idx_addr
     83 ulbl1:
     84     add         r14,r14,pc
     85 
     86 SRC_LEFT_LOOP:
     87     LDRB        r11,[r10],r1                @Load the value
     88     SUBS        r9,r9,#1                    @Decrement the loop counter
     89     STRB        r11,[r2],#1                 @Store the value in pu1_src_left pointer
     90     BNE         SRC_LEFT_LOOP
     91 
     92     ADD         r9,r3,r7                    @pu1_src_top[wd]
     93     VLD1.8      D1,[r14]!                   @band_table.val[0]
     94     LDR         r6,[sp,#48]                 @Loads pi1_sao_offset
     95 
     96     LSL         r11,r5,#3
     97     VLD1.8      D2,[r14]!                   @band_table.val[1]
     98 
     99     LDRB        r10,[r9,#-1]
    100     VDUP.8      D31,r11                     @band_pos
    101     SUB         r12,r8,#1                   @ht-1
    102 
    103     STRB        r10,[r4]                    @store to pu1_src_top_left[0]
    104     VLD1.8      D3,[r14]!                   @band_table.val[2]
    105     MUL         r12,r12,r1                  @ht-1 * src_strd
    106 
    107     ADD         r4,r12,r0                   @pu1_src[(ht - 1) * src_strd]
    108     VLD1.8      D4,[r14]!                   @band_table.val[3]
    109     MOV         r9,r7                       @Move the wd to r9 for loop counter
    110 
    111 SRC_TOP_LOOP:                               @wd is always multiple of 8
    112     VLD1.8      D0,[r4]!                    @Load pu1_src[(ht - 1) * src_strd + col]
    113     SUBS        r9,r9,#8                    @Decrement the loop counter by 8
    114     VST1.8      D0,[r3]!                    @Store to pu1_src_top[col]
    115     BNE         SRC_TOP_LOOP
    116 
    117     VLD1.8      D30,[r6]                    @pi1_sao_offset load
    118     VADD.I8     D5,D1,D31                   @band_table.val[0] = vadd_u8(band_table.val[0], band_pos)
    119 
    120     VDUP.8      D29,D30[1]                  @vdup_n_u8(pi1_sao_offset[1])
    121     VADD.I8     D6,D2,D31                   @band_table.val[1] = vadd_u8(band_table.val[1], band_pos)
    122 
    123     VDUP.8      D28,D30[2]                  @vdup_n_u8(pi1_sao_offset[2])
    124     VADD.I8     D7,D3,D31                   @band_table.val[2] = vadd_u8(band_table.val[2], band_pos)
    125 
    126     VDUP.8      D27,D30[3]                  @vdup_n_u8(pi1_sao_offset[3])
    127     VADD.I8     D8,D4,D31                   @band_table.val[3] = vadd_u8(band_table.val[3], band_pos)
    128 
    129     VDUP.8      D26,D30[4]                  @vdup_n_u8(pi1_sao_offset[4])
    130     VADD.I8     D1,D5,D29                   @band_table.val[0] = vadd_u8(band_table.val[0], vdup_n_u8(pi1_sao_offset[1]))
    131 
    132     VMOV.I8     D29,#16                     @vdup_n_u8(16)
    133     VADD.I8     D2,D6,D28                   @band_table.val[1] = vadd_u8(band_table.val[1], vdup_n_u8(pi1_sao_offset[2]))
    134 
    135     CMP         r5,#28
    136     VADD.I8     D3,D7,D27                   @band_table.val[2] = vadd_u8(band_table.val[2], vdup_n_u8(pi1_sao_offset[3]))
    137 
    138     VADD.I8     D4,D8,D26                   @band_table.val[3] = vadd_u8(band_table.val[3], vdup_n_u8(pi1_sao_offset[4]))
    139     BLT         SAO_BAND_POS_0
    140 
    141 SAO_BAND_POS_28:                            @case 28
    142 
    143     VCLE.U8     D12,D4,D29                  @vcle_u8(band_table.val[3], vdup_n_u8(16))
    144 
    145     BNE         SAO_BAND_POS_29
    146     VORR.U8     D4,D4,D12                   @band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp)
    147     B           SWITCH_BREAK
    148 
    149 SAO_BAND_POS_29:                            @case 29
    150     CMP         r5,#29
    151     VCLE.U8     D11,D3,D29                  @vcle_u8(band_table.val[2], vdup_n_u8(16))
    152 
    153     BNE         SAO_BAND_POS_30
    154     VORR.U8     D3,D3,D11                   @band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp)
    155 
    156     VAND.U8     D4,D4,D12                   @band_table.val[3] = vand_u8(band_table.val[3], au1_cmp)
    157     B           SWITCH_BREAK
    158 
    159 SAO_BAND_POS_30:                            @case 30
    160     CMP         r5,#30
    161     VCLE.U8     D10,D2,D29                  @vcle_u8(band_table.val[1], vdup_n_u8(16))
    162 
    163     BNE         SAO_BAND_POS_31
    164     VORR.U8     D2,D2,D10                   @band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp)
    165 
    166     VAND.U8     D3,D3,D11                   @band_table.val[2] = vand_u8(band_table.val[2], au1_cmp)
    167     B           SWITCH_BREAK
    168 
    169 SAO_BAND_POS_31:                            @case 31
    170     CMP         r5,#31
    171     BNE         SWITCH_BREAK
    172 
    173     VCLE.U8     D9,D1,D29                   @vcle_u8(band_table.val[0], vdup_n_u8(16))
    174     VORR.U8     D1,D1,D9                    @band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp)
    175 
    176     VAND.U8     D2,D2,D10                   @band_table.val[1] = vand_u8(band_table.val[1], au1_cmp)
    177 
    178 SAO_BAND_POS_0:
    179     CMP         r5,#0                       @case 0
    180     BNE         SWITCH_BREAK
    181 
    182     VCLE.U8     D9,D1,D29                   @vcle_u8(band_table.val[0], vdup_n_u8(16))
    183     VAND.U8     D1,D1,D9                    @band_table.val[0] = vand_u8(band_table.val[0], au1_cmp)
    184 
    185 SWITCH_BREAK:
    186     MOV         r4,r0                       @pu1_src_cpy
    187     MOV         r11,r8                      @move ht
    188     ADD         r5,r4,r1
    189 
    190 HEIGHT_LOOP:
    191     ADD         r6,r5,r1
    192     VLD1.8      D13,[r4]                    @au1_cur_row = vld1_u8(pu1_src_cpy)
    193 
    194     ADD         r10,r6,r1
    195     VLD1.8      D15,[r5]                    @au1_cur_row = vld1_u8(pu1_src_cpy)
    196 
    197     VLD1.8      D17,[r6]                    @au1_cur_row = vld1_u8(pu1_src_cpy)
    198 
    199     VLD1.8      D19,[r10]                   @au1_cur_row = vld1_u8(pu1_src_cpy)
    200     VSUB.I8     D14,D13,D31                 @vsub_u8(au1_cur_row, band_pos)
    201 
    202     VTBX.8      D13,{D1-D4},D14             @vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
    203     VSUB.I8     D16,D15,D31                 @vsub_u8(au1_cur_row, band_pos)
    204 
    205     VTBX.8      D15,{D1-D4},D16             @vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
    206     VSUB.I8     D18,D17,D31                 @vsub_u8(au1_cur_row, band_pos)
    207 
    208     VTBX.8      D17,{D1-D4},D18             @vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
    209     VSUB.I8     D20,D19,D31                 @vsub_u8(au1_cur_row, band_pos)
    210 
    211     VTBX.8      D19,{D1-D4},D20             @vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
    212     VST1.8      D13,[r4],r1                 @vst1_u8(pu1_src_cpy, au1_cur_row)
    213 
    214     VST1.8      D15,[r5]                    @vst1_u8(pu1_src_cpy, au1_cur_row)
    215     SUBS        r11,r11,#4                  @Decrement the ht loop count by 4
    216 
    217     VST1.8      D17,[r6],r1                 @vst1_u8(pu1_src_cpy, au1_cur_row)
    218 
    219     ADD         r4,r6,r1
    220     VST1.8      D19,[r10]                   @vst1_u8(pu1_src_cpy, au1_cur_row)
    221     ADD         r5,r4,r1
    222 
    223     BNE         HEIGHT_LOOP
    224 
    225     SUBS        r7,r7,#8                    @Decrement the width loop by 8
    226     ADD         r0,r0,#8
    227     BNE         SWITCH_BREAK
    228 
    229     LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP
    230 
    231 
    232 
    233 
    234