Home | History | Annotate | Download | only in arm
      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 @*******************************************************************************
     20 @* @file
     21 @*  ihevc_intra_pred_chroma_horz_neon.s
     22 @*
     23 @* @brief
     24 @*  contains function definition for intra prediction  interpolation filters
     25 @*
     26 @*
     27 @* @author
     28 @*  parthiban v
     29 @*
     30 @* @par list of functions:
     31 @*  - ihevc_intra_pred_chroma_horz_a9q()
     32 @*
     33 @* @remarks
     34 @*  none
     35 @*
     36 @*******************************************************************************
     37 @*/
     38 @
     39 @/**
     40 @*******************************************************************************
     41 @*
     42 @* @brief
     43 @*     intra prediction filter for the horizontal chroma mode.
     44 @*
     45 @* @par description:
     46 @*      horizontal intraprediction (mode 10) with reference samples location
     47 @*      pointed by 'pu1_ref' to the tu block location pointed by 'pu1_dst'; refer
     48 @*      to section 8.4.4.2.6 in the standard (special case)
     49 @*
     50 @* @param[in] pu1_ref
     51 @*  uword8 pointer to the reference samples
     52 @*
     53 @* @param[out] pu1_dst
     54 @*  uword8 pointer to the destination
     55 @*
     56 @* @param[in] src_strd
     57 @*  integer source stride
     58 @*
     59 @* @param[in] dst_strd
     60 @*  integer destination stride
     61 @*
     62 @* @param[in] nt
     63 @*  integer transform block size
     64 @*
     65 @* @param[in] mode
     66 @*  integer intraprediction mode
     67 @*
     68 @* @returns
     69 @*
     70 @* @remarks
     71 @*  none
     72 @*
     73 @*******************************************************************************
     74 @*/
     75 @void ihevc_intra_pred_chroma_horz(uword8 *pu1_ref,
     76 @                                  word32 src_strd,
     77 @                                  uword8 *pu1_dst,
     78 @                                  word32 dst_strd,
     79 @                                  word32 nt,
     80 @                                  word32 mode)
     81 @**************variables vs registers*****************************************
     82 @r0 => *pu1_ref
     83 @r1 =>  src_strd
     84 @r2 => *pu1_dst
     85 @r3 =>  dst_strd
     86 
     87 .text
     88 .align 4
     89 
     90 @/**
     91 @*******************************************************************************
     92 @*
     93 @* @brief
     94 @*  horizontal intra prediction for chroma: every destination row is filled
     95 @*  with one 16-bit element of the reference array
     96 @*
     97 @* @par description:
     98 @*  row 0 replicates the 16-bit element located immediately below
     99 @*  pu1_ref[4 * nt]; each following row uses the element 2 bytes lower in
    100 @*  memory. a row is 8 bytes wide for nt == 4, 16 bytes for nt == 8 and
    101 @*  32 bytes for every other nt (16 rows are written per loop pass there).
    102 @*  each 16-bit element is presumably one interleaved chroma sample pair -
    103 @*  confirm against the c reference implementation.
    104 @*
    105 @* @par register usage:
    106 @*  r0      => *pu1_ref
    107 @*  r1      =>  src_strd (not read)
    108 @*  r2      => *pu1_dst, advanced by dst_strd after every row
    109 @*  r3      =>  dst_strd
    110 @*  [sp]    =>  nt   (fifth argument, on the stack at entry)
    111 @*  [sp,#4] =>  mode (not read)
    112 @*  r4      =>  nt, counted down in the 32 byte wide path
    113 @*  r9      =>  pu1_dst + 16, right half of a 32 byte row
    114 @*  r12     =>  read pointer into the reference array
    115 @*
    116 @* @remarks
    117 @*  only q0-q3 and q8-q12 are written. d8-d15 (q4-q7) must be preserved by
    118 @*  the callee under the arm procedure call standard, so they are not used
    119 @*  and no vpush/vpop is needed.
    120 @*
    121 @*******************************************************************************
    122 @*/
    123 
    124 .equ    nt_offset,  40                  @10 core registers pushed * 4 bytes
    125 
    126 .globl ihevc_intra_pred_chroma_horz_a9q
    127 
    128 .type ihevc_intra_pred_chroma_horz_a9q, %function
    129 
    130 ihevc_intra_pred_chroma_horz_a9q:
    131 
    132     stmfd       sp!, {r4-r12, r14}          @save core registers (10 words)
    133 
    134     ldr         r4,[sp,#nt_offset]          @r4 = nt
    135     add         r12,r0,r4,lsl #2            @r12 = &pu1_ref[4 * nt]
    136 
    137     cmp         r4,#4                       @if nt == 4
    138     beq         core_loop_4
    139 
    140     cmp         r4,#8                       @if nt == 8
    141     beq         core_loop_8
    142 
    143     @every other nt uses the 32 byte wide rows
    144     sub         r12,r12,#16                 @8 elements just below pu1_ref[4 * nt]
    145     add         r9,r2,#16                   @second 16 bytes of each row
    146 
    147 core_loop_16:
    148     vld1.16     {q0},[r12]                  @rows 0-7, d1[3] belongs to row 0
    149     sub         r12,r12,#16
    150     vld1.16     {q8},[r12]                  @rows 8-15, d17[3] belongs to row 8
    151 
    152     @dups run ahead of the stores: q1, q2, q3, q9 rotate, one per row
    153     vdup.16     q1,d1[3]                    @row 0
    154     vdup.16     q2,d1[2]                    @row 1
    155     vdup.16     q3,d1[1]                    @row 2
    156     vst1.16     {q1},[r2],r3                @row 0, bytes 0-15
    157     vst1.16     {q1},[r9],r3                @row 0, bytes 16-31
    158 
    159     vdup.16     q9,d1[0]                    @row 3
    160     vst1.16     {q2},[r2],r3                @row 1
    161     vst1.16     {q2},[r9],r3
    162 
    163     vdup.16     q1,d0[3]                    @row 4
    164     vst1.16     {q3},[r2],r3                @row 2
    165     vst1.16     {q3},[r9],r3
    166 
    167     vdup.16     q2,d0[2]                    @row 5
    168     vst1.16     {q9},[r2],r3                @row 3
    169     vst1.16     {q9},[r9],r3
    170 
    171     vdup.16     q3,d0[1]                    @row 6
    172     vst1.16     {q1},[r2],r3                @row 4
    173     vst1.16     {q1},[r9],r3
    174 
    175     vdup.16     q9,d0[0]                    @row 7
    176     vst1.16     {q2},[r2],r3                @row 5
    177     vst1.16     {q2},[r9],r3
    178 
    179     vdup.16     q1,d17[3]                   @row 8
    180     vst1.16     {q3},[r2],r3                @row 6
    181     vst1.16     {q3},[r9],r3
    182 
    183     vdup.16     q2,d17[2]                   @row 9
    184     vst1.16     {q9},[r2],r3                @row 7
    185     vst1.16     {q9},[r9],r3
    186 
    187     vdup.16     q3,d17[1]                   @row 10
    188     vst1.16     {q1},[r2],r3                @row 8
    189     vst1.16     {q1},[r9],r3
    190 
    191     vdup.16     q9,d17[0]                   @row 11
    192     vst1.16     {q2},[r2],r3                @row 9
    193     vst1.16     {q2},[r9],r3
    194 
    195     vdup.16     q1,d16[3]                   @row 12
    196     vst1.16     {q3},[r2],r3                @row 10
    197     vst1.16     {q3},[r9],r3
    198 
    199     vdup.16     q2,d16[2]                   @row 13
    200     vst1.16     {q9},[r2],r3                @row 11
    201     vst1.16     {q9},[r9],r3
    202 
    203     vdup.16     q3,d16[1]                   @row 14
    204     vst1.16     {q1},[r2],r3                @row 12
    205     vst1.16     {q1},[r9],r3
    206     sub         r12,r12,#16                 @reference elements for the next pass
    207 
    208     vdup.16     q9,d16[0]                   @row 15
    209     vst1.16     {q2},[r2],r3                @row 13
    210     vst1.16     {q2},[r9],r3
    211 
    212     subs        r4,r4,#16                   @16 rows done
    213     vst1.16     {q3},[r2],r3                @row 14
    214     vst1.16     {q3},[r9],r3
    215 
    216     vst1.16     {q9},[r2],r3                @row 15
    217     vst1.16     {q9},[r9],r3
    218     bgt         core_loop_16                @neon stores leave the flags of subs intact
    219     ldmfd       sp!,{r4-r12,r15}            @restore core registers and return (pc popped)
    220 
    221 core_loop_8:
    222     sub         r12,r12,#16                 @8 elements just below pu1_ref[4 * nt]
    223     vld1.8      {q0},[r12]                  @d1[3] belongs to row 0
    224 
    225     vdup.16     q1,d1[3]                    @row 0
    226     vdup.16     q2,d1[2]                    @row 1
    227     vdup.16     q3,d1[1]                    @row 2
    228     vdup.16     q8,d1[0]                    @row 3
    229     vst1.16     {q1},[r2],r3
    230 
    231     vdup.16     q9,d0[3]                    @row 4
    232     vst1.16     {q2},[r2],r3
    233 
    234     vdup.16     q10,d0[2]                   @row 5
    235     vst1.16     {q3},[r2],r3
    236 
    237     vdup.16     q11,d0[1]                   @row 6
    238     vst1.16     {q8},[r2],r3
    239 
    240     vdup.16     q12,d0[0]                   @row 7
    241     vst1.16     {q9},[r2],r3
    242     vst1.16     {q10},[r2],r3
    243     vst1.16     {q11},[r2],r3
    244     vst1.16     {q12},[r2],r3
    245 
    246     ldmfd       sp!,{r4-r12,r15}            @restore core registers and return
    247 
    248 core_loop_4:
    249     sub         r12,r12,#8                  @4 elements just below pu1_ref[4 * nt]
    250     vld1.8      {d0},[r12]                  @d0[3] belongs to row 0
    251 
    252     vdup.16     d1,d0[3]                    @row 0
    253     vdup.16     d2,d0[2]                    @row 1
    254     vdup.16     d3,d0[1]                    @row 2
    255     vdup.16     d4,d0[0]                    @row 3
    256 
    257     vst1.8      {d1},[r2],r3
    258     vst1.8      {d2},[r2],r3
    259     vst1.8      {d3},[r2],r3
    260     vst1.8      {d4},[r2],r3
    261 
    262     ldmfd       sp!,{r4-r12,r15}            @restore core registers and return
    263 