Home | History | Annotate | Download | only in arm
      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 @*******************************************************************************
     20 @* @file
     21 @*  ihevc_intra_pred_chroma_horz_neon.s
     22 @*
     23 @* @brief
     24 @*  contains the function definition for horizontal chroma intra prediction
     25 @*
     26 @*
     27 @* @author
     28 @*  parthiban v
     29 @*
     30 @* @par list of functions:
     31 @*  - ihevc_intra_pred_chroma_horz()
     32 @*
     33 @* @remarks
     34 @*  none
     35 @*
     36 @*******************************************************************************
     37 @*/
     38 @
     39 @/**
     40 @*******************************************************************************
     41 @*
     42 @* @brief
     43 @*     horizontal intra prediction for chroma.
     44 @*
     45 @* @par description:
     46 @*      horizontal intra prediction (mode 10) from the reference samples
     47 @*      pointed to by 'pu1_ref' into the tu block pointed to by 'pu1_dst'; refer
     48 @*      to section 8.4.4.2.6 in the standard (special case)
     49 @*
     50 @* @param[in] pu1_ref
     51 @*  uword8 pointer to the reference samples
     52 @*
     53 @* @param[out] pu1_dst
     54 @*  uword8 pointer to the destination
     55 @*
     56 @* @param[in] src_strd
     57 @*  integer source stride
     58 @*
     59 @* @param[in] dst_strd
     60 @*  integer destination stride
     61 @*
     62 @* @param[in] nt
     63 @*  integer transform block size
     64 @*
     65 @* @param[in] mode
     66 @*  integer intraprediction mode
     67 @*
     68 @* @returns
     69 @*
     70 @* @remarks
     71 @*  none
     72 @*
     73 @*******************************************************************************
     74 @*/
     75 @void ihevc_intra_pred_chroma_horz(uword8 *pu1_ref,
     76 @                                  word32 src_strd,
     77 @                                  uword8 *pu1_dst,
     78 @                                  word32 dst_strd,
     79 @                                  word32 nt,
     80 @                                  word32 mode)
     81 @**************variables vs registers*****************************************
     82 @r0 => *pu1_ref
     83 @r1 =>  src_strd
     84 @r2 => *pu1_dst
     85 @r3 =>  dst_strd
     86 
     87 .equ    nt_offset,      104         @sp offset of the nt argument after the prologue: r4-r12,r14 (40 bytes) + d8-d15 (64 bytes)
     88 
     89 .text                               @the function below is assembled into the code section
     90 .align 4
     91 
     92 
     93 
     94 
     95 .globl ihevc_intra_pred_chroma_horz_a9q      @export the entry point
     96 
     97 .type ihevc_intra_pred_chroma_horz_a9q, %function        @mark the symbol as a function
     98 
     98 
     99 ihevc_intra_pred_chroma_horz_a9q:
    100 @ fills every row of the destination block with a single 16-bit reference
    101 @ element (presumably one interleaved chroma pair) replicated across the row;
    102 @ successive rows take successively lower-addressed elements, starting with
    103 @ the one just below pu1_ref + 4*nt.
    104 @   r0 = pu1_ref, r2 = pu1_dst, r3 = dst_strd (byte step between rows)
    105 @   nt is read from the stack; r1 (src_strd) and mode are never read
    106 @   r4 = nt / rows left, r9 = pu1_dst + 16, r12 = reference read pointer
    107 
    108     stmfd       sp!, {r4-r12, r14}          @save core registers and the return address
    109     vpush       {d8 - d15}                  @save d8-d15; q4-q7 are overwritten below
    110 
    111     ldr         r4,[sp,#nt_offset]          @r4 = nt (stack argument above the saved registers)
    112 
    113     add         r12,r0,r4,lsl #2            @r12 = pu1_ref + 4*nt
    114     cmp         r4,#4                       @if nt == 4
    115     beq         core_loop_4
    116 
    117     cmp         r4,#8                       @if nt == 8
    118     beq         core_loop_8
    119 
    120     @every other nt takes the 16-rows-per-pass path
    121     sub         r12,r12,#16                 @r12 -> the 8 elements just below pu1_ref + 4*nt
    122     add         r9,r2,#16                   @r9 -> bytes 16-31 of the first row
    123 
    124 core_loop_16:
    125     @one pass writes 16 rows of 32 bytes each (r2: bytes 0-15, r9: bytes 16-31)
    126     @NOTE(review): the row width is fixed at 32 bytes, so this presumably expects nt == 16
    127     vld1.16     {q0},[r12]                  @8 elements; d1[3] is the highest-addressed one (row 0)
    128     sub         r12,r12,#16
    129     vld1.16     {q5},[r12]                  @the 8 elements below those (rows 8-15)
    130 
    131     vdup.16     q1,d1[3]                    @row 0
    132     vdup.16     q2,d1[2]                    @row 1
    133     vdup.16     q3,d1[1]                    @row 2
    134     vst1.16     {q1},[r2],r3                @row 0, bytes 0-15
    135     vst1.16     {q1},[r9],r3                @row 0, bytes 16-31
    136 
    137     vdup.16     q4,d1[0]                    @row 3
    138     vst1.16     {q2},[r2],r3
    139     vst1.16     {q2},[r9],r3
    140 
    141     vdup.16     q1,d0[3]                    @row 4
    142     vst1.16     {q3},[r2],r3
    143     vst1.16     {q3},[r9],r3
    144 
    145     vdup.16     q2,d0[2]                    @row 5
    146     vst1.16     {q4},[r2],r3
    147     vst1.16     {q4},[r9],r3
    148 
    149     vdup.16     q3,d0[1]                    @row 6
    150     vst1.16     {q1},[r2],r3
    151     vst1.16     {q1},[r9],r3
    152 
    153     vdup.16     q4,d0[0]                    @row 7
    154     vst1.16     {q2},[r2],r3
    155     vst1.16     {q2},[r9],r3
    156 
    157     vdup.16     q1,d11[3]                   @row 8
    158     vst1.16     {q3},[r2],r3
    159     vst1.16     {q3},[r9],r3
    160 
    161     vdup.16     q2,d11[2]                   @row 9
    162     vst1.16     {q4},[r2],r3
    163     vst1.16     {q4},[r9],r3
    164 
    165     vdup.16     q3,d11[1]                   @row 10
    166     vst1.16     {q1},[r2],r3
    167     vst1.16     {q1},[r9],r3
    168 
    169     vdup.16     q4,d11[0]                   @row 11
    170     vst1.16     {q2},[r2],r3
    171     vst1.16     {q2},[r9],r3
    172 
    173     vdup.16     q1,d10[3]                   @row 12
    174     vst1.16     {q3},[r2],r3
    175     vst1.16     {q3},[r9],r3
    176 
    177     vdup.16     q2,d10[2]                   @row 13
    178     vst1.16     {q4},[r2],r3
    179     vst1.16     {q4},[r9],r3
    180 
    181     vdup.16     q3,d10[1]                   @row 14
    182     vst1.16     {q1},[r2],r3
    183     vst1.16     {q1},[r9],r3
    184     sub         r12,r12,#16                 @step to the next 8 elements for a further pass
    185 
    186     vdup.16     q4,d10[0]                   @row 15
    187     vst1.16     {q2},[r2],r3
    188     vst1.16     {q2},[r9],r3
    189 
    190     subs        r4,r4,#16                   @16 rows done; flags are consumed by the bgt below
    191     vst1.16     {q3},[r2],r3
    192     vst1.16     {q3},[r9],r3
    193 
    194     vst1.16     {q4},[r2],r3
    195     vst1.16     {q4},[r9],r3
    196     bgt         core_loop_16
    197     vpop        {d8 - d15}
    198     ldmfd       sp!,{r4-r12,r15}            @restore registers; loading pc returns to the caller
    199 
    200 core_loop_8:
    201     @8 rows of 16 bytes each
    202     sub         r12,r12,#16                 @r12 -> the 8 elements just below pu1_ref + 4*nt
    203     vld1.8      {q0},[r12]                  @d1[3] is the highest-addressed element (row 0)
    204 
    205     vdup.16     q5,d1[3]                    @row 0
    206     vdup.16     q1,d1[2]                    @row 1
    207     vdup.16     q2,d1[1]                    @row 2
    208     vdup.16     q3,d1[0]                    @row 3
    209     vdup.16     q4,d0[3]                    @row 4
    210     vst1.16     {q5},[r2],r3                @row 0 is stored first so that q5 can be reused
    211 
    212     vdup.16     q5,d0[2]                    @row 5
    213     vdup.16     q6,d0[1]                    @row 6
    214     vdup.16     q7,d0[0]                    @row 7
    215 
    216     vst1.16     {q1},[r2],r3
    217     vst1.16     {q2},[r2],r3
    218     vst1.16     {q3},[r2],r3
    219     vst1.16     {q4},[r2],r3
    220     vst1.16     {q5},[r2],r3
    221     vst1.16     {q6},[r2],r3
    222     vst1.16     {q7},[r2],r3
    223 
    224     vpop        {d8 - d15}
    225     ldmfd       sp!,{r4-r12,r15}            @restore registers; loading pc returns to the caller
    226 
    227 core_loop_4:
    228     @4 rows of 8 bytes each
    229     sub         r12,r12,#8                  @r12 -> the 4 elements just below pu1_ref + 4*nt
    230     vld1.8      {d0},[r12]                  @d0[3] is the highest-addressed element (row 0)
    231 
    232     vdup.16     d6,d0[3]                    @row 0
    233     vdup.16     d3,d0[2]                    @row 1
    234     vdup.16     d4,d0[1]                    @row 2
    235     vdup.16     d5,d0[0]                    @row 3
    236 
    237     vst1.8      {d6},[r2],r3
    238     vst1.8      {d3},[r2],r3
    239     vst1.8      {d4},[r2],r3
    240     vst1.8      {d5},[r2],r3
    241 
    242     vpop        {d8 - d15}
    243     ldmfd       sp!,{r4-r12,r15}            @restore registers; loading pc returns to the caller
    244 
    245 endloop:                                    @label retained only; never branched to, every path above has already returned
    352 
    353 
    354