Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 //*******************************************************************************
     20 //* ,:file
     21 //*  ihevc_sao_band_offset_luma.s
     22 //*
     23 //* ,:brief
      24 //*  Contains the function definition for SAO band offset filtering of luma.
      25 //* The function is hand-written in AArch64 (ARMv8) NEON assembly using
      26 //* GNU-style assembler syntax.
     27 //*
     28 //* ,:author
     29 //*  Parthiban V
     30 //*
     31 //* ,:par List of Functions:
     32 //*
     33 //*
     34 //* ,:remarks
     35 //*  None
     36 //*
     37 //*******************************************************************************
     38 //*/
     39 //void ihevc_sao_band_offset_luma(UWORD8 *pu1_src,
     40 //                           WORD32 src_strd,
     41 //                           UWORD8 *pu1_src_left,
     42 //                           UWORD8 *pu1_src_top,
     43 //                           UWORD8 *pu1_src_top_left,
     44 //                           WORD32 sao_band_pos,
     45 //                           WORD8 *pi1_sao_offset,
     46 //                           WORD32 wd,
     47 //                           WORD32 ht)
     48 //
     49 //**************Variables Vs Registers*****************************************
     50 //x0 =>    *pu1_src
     51 //x1 =>    src_strd
     52 //x2 =>    *pu1_src_left
     53 //x3 =>    *pu1_src_top
     54 //x4    =>    *pu1_src_top_left
     55 //x5    =>    sao_band_pos
     56 //x6    =>    *pi1_sao_offset
     57 //x7    =>    wd
      58 //x8    =>    ht (not a register argument: passed on the stack and loaded into w8 at entry)
     59 
     60 
     61 .set WIDE_REFERENCE, 0
     62 .set ARCHITECTURE, 5
     63 .set DO1STROUNDING, 0
     64 
     65 .include "ihevc_neon_macros.s"
     66 
     67 .text
     68 .p2align 2
     69 
     70 .globl gu1_table_band_idx
     71 .globl ihevc_sao_band_offset_luma_av8
     72 
     73 ihevc_sao_band_offset_luma_av8:
     74 
     75     // STMFD sp!, {x4-x12, x14}            //stack stores the values of the arguments
     76 
     77     LDR         w8,[sp]                     //Loads ht
     78 
     79 
     80     stp         d13,d14,[sp,#-16]!
     81     stp         d8,d15,[sp,#-16]!           // Storing d15 using { sub sp,sp,#8; str d15,[sp] } is giving bus error.
     82                                             // d8 is used as dummy register and stored along with d15 using stp. d8 is not used in the function.
     83     stp         x19, x20,[sp,#-16]!
     84 
     85     MOV         x9,x8                       //Move the ht to x9 for loop counter
     86     ADD         x10,x0,x7                   //pu1_src[row * src_strd + (wd)]
     87 
     88     SUB         x10,x10,#1                  //wd-1
     89     ADRP        x14, :got:gu1_table_band_idx
     90     LDR         x14, [x14, #:got_lo12:gu1_table_band_idx]
     91 
     92 SRC_LEFT_LOOP:
     93     LDRB        w11,[x10]
     94     add         x10, x10, x1                //Load the value
     95     SUBS        x9,x9,#1                    //Decrement the loop counter
     96     STRB        w11,[x2],#1                 //Store the value in pu1_src_left pointer
     97     BNE         SRC_LEFT_LOOP
     98 
     99     ADD         x9,x3,x7                    //pu1_src_top[wd]
    100     LD1         {v1.8b},[x14],#8            //band_table.val[0]
    101 
    102     LSL         x11,x5,#3
    103     LD1         {v2.8b},[x14],#8            //band_table.val[1]
    104 
    105     LDRB        w10,[x9,#-1]
    106     dup         v31.8b,w11                  //band_pos
    107     SUB         x12,x8,#1                   //ht-1
    108 
    109     STRB        w10,[x4]                    //store to pu1_src_top_left[0]
    110     LD1         {v3.8b},[x14],#8            //band_table.val[2]
    111     mul         x12, x12, x1                //ht-1 * src_strd
    112 
    113     ADD         x4,x12,x0                   //pu1_src[(ht - 1) * src_strd]
    114     LD1         {v4.8b},[x14],#8            //band_table.val[3]
    115     MOV         x9,x7                       //Move the wd to x9 for loop counter
    116 
    117 SRC_TOP_LOOP:                               //wd is always multiple of 8
    118     LD1         {v0.8b},[x4],#8             //Load pu1_src[(ht - 1) * src_strd + col]
    119     SUBS        x9,x9,#8                    //Decrement the loop counter by 8
    120     ST1         {v0.8b},[x3],#8             //Store to pu1_src_top[col]
    121     BNE         SRC_TOP_LOOP
    122 
    123     LD1         {v30.8b},[x6]               //pi1_sao_offset load
    124     ADD         v5.8b,  v1.8b ,  v31.8b     //band_table.val[0] = vadd_u8(band_table.val[0], band_pos)
    125 
    126     dup         v29.8b, v30.b[1]            //vdup_n_u8(pi1_sao_offset[1])
    127     ADD         v6.8b,  v2.8b ,  v31.8b     //band_table.val[1] = vadd_u8(band_table.val[1], band_pos)
    128 
    129     dup         v28.8b, v30.b[2]            //vdup_n_u8(pi1_sao_offset[2])
    130     ADD         v7.8b,  v3.8b ,  v31.8b     //band_table.val[2] = vadd_u8(band_table.val[2], band_pos)
    131 
    132     dup         v27.8b, v30.b[3]            //vdup_n_u8(pi1_sao_offset[3])
    133     ADD         v21.8b,  v4.8b ,  v31.8b    //band_table.val[3] = vadd_u8(band_table.val[3], band_pos)
    134 
    135     dup         v26.8b, v30.b[4]            //vdup_n_u8(pi1_sao_offset[4])
    136     ADD         v1.8b,  v5.8b ,  v29.8b     //band_table.val[0] = vadd_u8(band_table.val[0], vdup_n_u8(pi1_sao_offset[1]))
    137 
    138     movi        v29.8b, #16                 //vdup_n_u8(16)
    139     ADD         v2.8b,  v6.8b ,  v28.8b     //band_table.val[1] = vadd_u8(band_table.val[1], vdup_n_u8(pi1_sao_offset[2]))
    140 
    141     CMP         x5,#28
    142     ADD         v3.8b,  v7.8b ,  v27.8b     //band_table.val[2] = vadd_u8(band_table.val[2], vdup_n_u8(pi1_sao_offset[3]))
    143 
    144     ADD         v4.8b,  v21.8b ,  v26.8b    //band_table.val[3] = vadd_u8(band_table.val[3], vdup_n_u8(pi1_sao_offset[4]))
    145     BLT         SAO_BAND_POS_0
    146 
    147 SAO_BAND_POS_28:                            //case 28
    148 
    149     cmhs        v25.8b,  v29.8b ,  v4.8b    //vcle_u8(band_table.val[3], vdup_n_u8(16))
    150 
    151     BNE         SAO_BAND_POS_29
    152     ORR         v4.8b,  v4.8b ,  v25.8b     //band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp)
    153     B           SWITCH_BREAK
    154 
    155 SAO_BAND_POS_29:                            //case 29
    156     CMP         x5,#29
    157     cmhs        v24.8b,  v29.8b ,  v3.8b    //vcle_u8(band_table.val[2], vdup_n_u8(16))
    158 
    159     BNE         SAO_BAND_POS_30
    160     ORR         v3.8b,  v3.8b ,  v24.8b     //band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp)
    161 
    162     AND         v4.8b,  v4.8b ,  v25.8b     //band_table.val[3] = vand_u8(band_table.val[3], au1_cmp)
    163     B           SWITCH_BREAK
    164 
    165 SAO_BAND_POS_30:                            //case 30
    166     CMP         x5,#30
    167     cmhs        v23.8b,  v29.8b ,  v2.8b    //vcle_u8(band_table.val[1], vdup_n_u8(16))
    168 
    169     BNE         SAO_BAND_POS_31
    170     ORR         v2.8b,  v2.8b ,  v23.8b     //band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp)
    171 
    172     AND         v3.8b,  v3.8b ,  v24.8b     //band_table.val[2] = vand_u8(band_table.val[2], au1_cmp)
    173     B           SWITCH_BREAK
    174 
    175 SAO_BAND_POS_31:                            //case 31
    176     CMP         x5,#31
    177     BNE         SWITCH_BREAK
    178 
    179     cmhs        v22.8b,  v29.8b ,  v1.8b    //vcle_u8(band_table.val[0], vdup_n_u8(16))
    180     ORR         v1.8b,  v1.8b ,  v22.8b     //band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp)
    181 
    182     AND         v2.8b,  v2.8b ,  v23.8b     //band_table.val[1] = vand_u8(band_table.val[1], au1_cmp)
    183 
    184 SAO_BAND_POS_0:
    185     CMP         x5,#0                       //case 0
    186     BNE         SWITCH_BREAK
    187 
    188     cmhs        v22.8b,  v29.8b ,  v1.8b    //vcle_u8(band_table.val[0], vdup_n_u8(16))
    189     AND         v1.8b,  v1.8b ,  v22.8b     //band_table.val[0] = vand_u8(band_table.val[0], au1_cmp)
    190 
    191 SWITCH_BREAK:
    192 
    193     mov         v1.d[1],v2.d[0]
    194     mov         v2.d[0],v3.d[0]
    195     mov         v2.d[1],v4.d[0]
    196 
    197 SWITCH_BREAK_1:
    198 
    199     MOV         x4,x0                       //pu1_src_cpy
    200     MOV         x11,x8                      //move ht
    201     ADD         x5,x4,x1
    202 
    203 HEIGHT_LOOP:
    204     ADD         x6,x5,x1
    205     LD1         {v13.8b},[x4]               //au1_cur_row = vld1_u8(pu1_src_cpy)
    206 
    207     ADD         x10,x6,x1
    208     LD1         {v15.8b},[x5]               //au1_cur_row = vld1_u8(pu1_src_cpy)
    209 
    210     LD1         {v17.8b},[x6]               //au1_cur_row = vld1_u8(pu1_src_cpy)
    211 
    212     LD1         {v19.8b},[x10]              //au1_cur_row = vld1_u8(pu1_src_cpy)
    213     SUB         v14.8b,  v13.8b ,  v31.8b   //vsub_u8(au1_cur_row, band_pos)
    214 
    215     TBX         v13.8b, {v1.16b- v2.16b},v14.8b //vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
    216     SUB         v16.8b,  v15.8b ,  v31.8b   //vsub_u8(au1_cur_row, band_pos)
    217 
    218     TBX         v15.8b, {v1.16b- v2.16b},v16.8b //vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
    219     SUB         v18.8b,  v17.8b ,  v31.8b   //vsub_u8(au1_cur_row, band_pos)
    220 
    221     TBX         v17.8b, {v1.16b- v2.16b},v18.8b //vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
    222     SUB         v20.8b,  v19.8b ,  v31.8b   //vsub_u8(au1_cur_row, band_pos)
    223 
    224     TBX         v19.8b, {v1.16b- v2.16b},v20.8b //vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
    225     ST1         {v13.8b},[x4],x1            //vst1_u8(pu1_src_cpy, au1_cur_row)
    226 
    227     ST1         {v15.8b},[x5]               //vst1_u8(pu1_src_cpy, au1_cur_row)
    228     SUBS        x11,x11,#4                  //Decrement the ht loop count by 4
    229 
    230     ST1         {v17.8b},[x6],x1            //vst1_u8(pu1_src_cpy, au1_cur_row)
    231 
    232     ADD         x4,x6,x1
    233     ST1         {v19.8b},[x10]              //vst1_u8(pu1_src_cpy, au1_cur_row)
    234     ADD         x5,x4,x1
    235 
    236     BNE         HEIGHT_LOOP
    237 
    238     SUBS        x7,x7,#8                    //Decrement the width loop by 8
    239     ADD         x0,x0,#8
    240     BNE         SWITCH_BREAK_1
    241 
    242     // LDMFD sp!,{x4-x12,x15}               //Reload the registers from SP
    243     ldp         x19, x20,[sp], #16
    244     ldp         d8,d15,[sp],#16             // Loading d15 using { ldr d15,[sp]; add sp,sp,#8 } is giving bus error.
    245                                             // d8 is used as dummy register and loaded along with d15 using ldp. d8 is not used in the function.
    246     ldp         d13,d14,[sp],#16
    247     ret
    248 
    249 
    250 
    251