@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_luma_horz_neon.s
@*
@* @brief
@*  contains function definitions for the intra prediction interpolation filters
@*
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*  - ihevc_intra_pred_luma_horz()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@
@/**
@*******************************************************************************
@*
@* @brief
@*     intra prediction interpolation filter for the luma horizontal mode
@*
@* @par description:
@*      horizontal intra prediction (mode 10) from the reference samples
@*      pointed to by 'pu1_ref' into the tu block pointed to by 'pu1_dst'.
@*      refer to section 8.4.4.2.6 in the standard (special case)
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source (reference samples)
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] nt
@*  integer transform block size
@*
@* @param[in] mode
@*  integer intraprediction mode
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@void ihevc_intra_pred_luma_horz(uword8 *pu1_ref,
@                                word32 src_strd,
@                                uword8 *pu1_dst,
@                                word32 dst_strd,
@                                word32 nt,
@                                word32 mode)
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 =>  src_strd
@r2 => *pu1_dst
@r3 =>  dst_strd
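@
@ the block below is a minimal c reference sketch (not part of the original
@ source) of what this routine computes, assuming the mode-10 behaviour and
@ reference-sample layout described above; 'clip_u8' is an illustrative helper
@ that clamps to [0, 255], 'two_nt' is simply 2 * nt, and the typedefs match
@ the commented prototype above.
@
@ void intra_pred_horz_ref(uword8 *pu1_ref, uword8 *pu1_dst,
@                          word32 dst_strd, word32 nt)
@ {
@     word32 two_nt = 2 * nt;
@     word32 row, col;
@
@     /* replicate the left neighbour of each row across the whole row */
@     for(row = 0; row < nt; row++)
@         for(col = 0; col < nt; col++)
@             pu1_dst[row * dst_strd + col] = pu1_ref[two_nt - 1 - row];
@
@     /* boundary filtering of the first row, applied only for nt < 32 */
@     if(nt < 32)
@         for(col = 0; col < nt; col++)
@             pu1_dst[col] = clip_u8(pu1_ref[two_nt - 1] +
@                 ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1));
@ }
@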

.text
.align 4




.globl ihevc_intra_pred_luma_horz_a9q

.type ihevc_intra_pred_luma_horz_a9q, %function

ihevc_intra_pred_luma_horz_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments

    ldr         r4,[sp,#40]                 @loads nt
    @ldr        r5,[sp,#44]                 @loads mode

    lsl         r6,r4,#1                    @two_nt

    add         r12,r0,r6                   @*pu1_ref[two_nt]
    cmp         r4,#4                       @if nt == 4
    beq         core_loop_4

    cmp         r4,#8                       @if nt == 8
    beq         core_loop_8

    cmp         r4,#16                      @if nt == 16
    beq         core_loop_16
    sub         r12,r12,#16                 @move to 16th value pointer
    add         r9,r2,#16

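@ nt == 32: no first-row boundary filtering (it applies only for nt < 32).
@ each iteration loads 16 left neighbours ending at pu1_ref[two_nt - 1],
@ duplicates each across a full 32-byte row and stores every row as two
@ 16-byte halves through r2 (columns 0-15) and r9 (columns 16-31).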
core_loop_32:
    vld1.8      {q0},[r12]                  @load 16 values. d1[7] will have the 1st value.

    vdup.8      q1,d1[7]                    @duplicate the 1st value.

    vdup.8      q2,d1[6]                    @duplicate the 2nd value.
    vdup.8      q3,d1[5]                    @duplicate the 3rd value.
    vst1.8      {q1},[r2],r3                @store 1st row, columns 0-15
    vst1.8      {q1},[r9],r3                @store 1st row, columns 16-31

    vdup.8      q4,d1[4]
    vst1.8      {q2},[r2],r3
    vst1.8      {q2},[r9],r3

    vdup.8      q1,d1[3]
    vst1.8      {q3},[r2],r3
    vst1.8      {q3},[r9],r3

    vdup.8      q2,d1[2]
    vst1.8      {q4},[r2],r3
    vst1.8      {q4},[r9],r3

    vdup.8      q3,d1[1]
    vst1.8      {q1},[r2],r3
    vst1.8      {q1},[r9],r3

    vdup.8      q4,d1[0]
    vst1.8      {q2},[r2],r3
    vst1.8      {q2},[r9],r3

    vdup.8      q1,d0[7]
    vst1.8      {q3},[r2],r3
    vst1.8      {q3},[r9],r3

    vdup.8      q2,d0[6]
    vst1.8      {q4},[r2],r3
    vst1.8      {q4},[r9],r3

    vdup.8      q3,d0[5]
    vst1.8      {q1},[r2],r3
    vst1.8      {q1},[r9],r3

    vdup.8      q4,d0[4]
    vst1.8      {q2},[r2],r3
    vst1.8      {q2},[r9],r3

    vdup.8      q1,d0[3]
    vst1.8      {q3},[r2],r3
    vst1.8      {q3},[r9],r3

    vdup.8      q2,d0[2]
    vst1.8      {q4},[r2],r3
    vst1.8      {q4},[r9],r3

    vdup.8      q3,d0[1]
    vst1.8      {q1},[r2],r3
    vst1.8      {q1},[r9],r3
    sub         r12,r12,#16                 @move to 16th value pointer

    vdup.8      q4,d0[0]
    vst1.8      {q2},[r2],r3
    vst1.8      {q2},[r9],r3

    subs        r4,r4,#16                   @decrement the loop count by 16
    vst1.8      {q3},[r2],r3
    vst1.8      {q3},[r9],r3

    vst1.8      {q4},[r2],r3
    vst1.8      {q4},[r9],r3
    bgt         core_loop_32
    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
    b           end_func

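@ nt == 16: the first row is boundary filtered as per section 8.4.4.2.6 -
@ dst[0][col] = clip_u8(pu1_ref[two_nt - 1]
@                       + ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1))
@ via vsubl/vshr/vqadd/vqmovun; the remaining rows are plain duplication of
@ the left neighbours held in q0.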
core_loop_16:
    ldrb        lr,[r12],#1                 @pu1_ref[two_nt]
    vld1.8      {q15},[r12]                 @pu1_ref[two_nt + 1 + col]

    vdup.8      d28,lr
    sub         r12,r12,#17
    vld1.8      {q0},[r12]
    vdup.8      d26,d1[7]
    vmovl.u8    q13,d26

    vdup.8      q1,d1[6]
    vsubl.u8    q12,d30,d28

    vdup.8      q2,d1[5]
    vshr.s16    q12,q12,#1

    vdup.8      q3,d1[4]
    vqadd.s16   q11,q13,q12

    vdup.8      q4,d1[3]
    vqmovun.s16 d22,q11

    vst1.8      {d22},[r2]!

    vdup.8      q5,d1[2]
    vsubl.u8    q12,d31,d28

    vdup.8      q6,d1[1]
    vshr.s16    q12,q12,#1

    vdup.8      q7,d1[0]
    vqadd.s16   q11,q13,q12

    vdup.8      q8,d0[7]
    vqmovun.s16 d22,q11

    vst1.8      {d22},[r2],r3
    sub         r2,r2,#8

    vst1.8      {q1},[r2],r3

    vst1.8      {q2},[r2],r3
    vst1.8      {q3},[r2],r3
    vst1.8      {q4},[r2],r3

    vdup.8      q1,d0[6]
    vst1.8      {q5},[r2],r3

    vdup.8      q2,d0[5]
    vst1.8      {q6},[r2],r3

    vdup.8      q3,d0[4]
    vst1.8      {q7},[r2],r3

    vdup.8      q4,d0[3]
    vst1.8      {q8},[r2],r3

    vdup.8      q5,d0[2]
    vst1.8      {q1},[r2],r3

    vdup.8      q6,d0[1]
    vst1.8      {q2},[r2],r3

    vdup.8      q7,d0[0]
    vst1.8      {q3},[r2],r3

    vst1.8      {q4},[r2],r3
    vst1.8      {q5},[r2],r3
    vst1.8      {q6},[r2],r3
    vst1.8      {q7},[r2],r3

    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
    b           end_func


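@ nt == 8: same first-row boundary filtering as the 16x16 case, followed by
@ plain duplication of the remaining 7 left neighbours.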
core_loop_8:
    ldrb        lr,[r12]                    @pu1_ref[two_nt]
    add         r12,r12,#1                  @pu1_ref[two_nt + 1]
    vld1.8      {d30},[r12]                 @pu1_ref[two_nt + 1 + col]

    sub         r12,r12,#9
    vld1.8      {d0},[r12]
    vdup.8      d26,d0[7]
    vdup.8      d28,lr

    vdup.8      d3,d0[6]
    vmovl.u8    q13,d26

    vdup.8      d4,d0[5]
    vsubl.u8    q12,d30,d28

    vdup.8      d5,d0[4]
    vshr.s16    q12,q12,#1

    vdup.8      d6,d0[3]
    vqadd.s16   q11,q13,q12

    vdup.8      d7,d0[2]
    vqmovun.s16 d22,q11

    vst1.8      {d22},[r2],r3
    vst1.8      {d3},[r2],r3

    vdup.8      d8,d0[1]
    vst1.8      {d4},[r2],r3
    vst1.8      {d5},[r2],r3

    vdup.8      d9,d0[0]
    vst1.8      {d6},[r2],r3
    vst1.8      {d7},[r2],r3

    vst1.8      {d8},[r2],r3
    vst1.8      {d9},[r2],r3
    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
    b           end_func


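@ nt == 4: same first-row boundary filtering; each 4-byte row is stored as a
@ single 32-bit lane.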
core_loop_4:
    ldrb        lr,[r12]                    @pu1_ref[two_nt]
    add         r12,r12,#1                  @pu1_ref[two_nt + 1]
    vld1.8      {d30},[r12]                 @pu1_ref[two_nt + 1 + col]

    sub         r12,r12,#5
    vld1.8      {d0},[r12]
    vdup.8      d28,lr
    vdup.8      d26,d0[3]
    vmovl.u8    q13,d26

    vdup.8      d3,d0[2]
    vsubl.u8    q12,d30,d28

    vdup.8      d4,d0[1]
    vshr.s16    q12,q12,#1

    vdup.8      d5,d0[0]
    vqadd.s16   q11,q13,q12

    vqmovun.s16 d22,q11

    vst1.32     {d22[0]},[r2],r3
    vst1.32     {d3[0]},[r2],r3
    vst1.32     {d4[0]},[r2],r3
    vst1.32     {d5[0]},[r2],r3

    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
end_func:

