Home | History | Annotate | Download | only in arm
      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 @*******************************************************************************
     20 @* @file
     21 @*  ihevc_inter_pred_chroma_vert_neon_w16inp_w16out_neon.s
     22 @*
     23 @* @brief
     24 @*  contains function definitions for inter prediction  interpolation.
     25 @* functions are coded using neon  intrinsics and can be compiled using
     26 
     27 @* rvct
     28 @*
     29 @* @author
     30 @*  yogeswaran rs / parthiban
     31 @*
     32 @* @par list of functions:
     33 @*
     34 @*
     35 @* @remarks
     36 @*  none
     37 @*
     38 @*******************************************************************************
     39 @*/
     40 @/**
     41 @/**
     42 @*******************************************************************************
     43 @*
     44 @* @brief
     45 @*    chroma interprediction filter for 16bit vertical input and output.
     46 @*
     47 @* @par description:
     48 @*    applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
     49 @*    the elements pointed to by 'pi2_src' and writes to the location pointed to
     50 @*    by 'pi2_dst'. input is 16 bits. the filter output is downshifted by 6 and
     51 @*    8192 is subtracted to store it as a 16 bit number. the output is used as
     52 @*    an input to weighted prediction. assumptions: the function is optimized
     53 @*    considering the fact that width and height are multiples of 2.
     54 @*
     55 @* @param[in] pi2_src
     56 @*  word16 pointer to the source
     57 @*
     58 @* @param[out] pi2_dst
     59 @*  word16 pointer to the destination
     60 @*
     61 @* @param[in] src_strd
     62 @*  integer source stride
     63 @*
     64 @* @param[in] dst_strd
     65 @*  integer destination stride
     66 @*
     67 @* @param[in] pi1_coeff
     68 @*  word8 pointer to the filter coefficients
     69 @*
     70 @* @param[in] ht
     71 @*  integer height of the array
     72 @*
     73 @* @param[in] wd
     74 @*  integer width of the array
     75 @*
     76 @* @returns
     77 @*
     78 @* @remarks
     79 @*  none
     80 @*
     81 @*******************************************************************************
     82 @*/
     83 @void ihevc_inter_pred_chroma_vert_w16inp_w16out(word16 *pi2_src,
     84 @                                                 word16 *pi2_dst,
     85 @                                                 word32 src_strd,
     86 @                                                 word32 dst_strd,
     87 @                                                 word8 *pi1_coeff,
     88 @                                                 word32 ht,
     89 @                                                 word32 wd)
     90 @**************variables vs registers*****************************************
     91 @r0 => *pi2_src
     92 @r1 => *pi2_dst
     93 @r2 =>  src_strd
     94 @r3 =>  dst_strd
     95 .text
     96 .align 4
     97 
     98 
     99 
    100 
    .globl      ihevc_inter_pred_chroma_vert_w16inp_w16out_a9q
    .type       ihevc_inter_pred_chroma_vert_w16inp_w16out_a9q, %function

@-------------------------------------------------------------------------------
@ void ihevc_inter_pred_chroma_vert_w16inp_w16out_a9q(word16 *pi2_src,
@                                                     word16 *pi2_dst,
@                                                     word32 src_strd,
@                                                     word32 dst_strd,
@                                                     word8  *pi1_coeff,
@                                                     word32 ht,
@                                                     word32 wd)
@
@ 4-tap vertical filter on 16-bit input producing 16-bit output.
@ For every output row r and every column c in [0, 2*wd):
@     dst[r*dst_strd + c] = sat16((c0*src[(r-1)*src_strd + c] +
@                                  c1*src[(r  )*src_strd + c] +
@                                  c2*src[(r+1)*src_strd + c] +
@                                  c3*src[(r+2)*src_strd + c]) >> 6)
@ The shift is a truncating, saturating narrow (vqshrn); no rounding and no
@ offset is applied in this routine.
@
@ ABI:   AAPCS (ARMv7-A + NEON)
@ In:    r0 = pi2_src, r1 = pi2_dst, r2 = src_strd, r3 = dst_strd (strides are
@        in 16-bit elements; both are doubled to bytes below)
@        stack: pi1_coeff, ht, wd
@ Reads: 8 bytes from pi1_coeff (only the first 4 coefficients are used);
@        one source row above the block and two rows below it.
@ Saved: r4-r12, lr and d8-d15 are pushed on entry and restored on exit.
@        d8/d9 (q4) and d12-d15 are written by this routine and AAPCS requires
@        the callee to preserve d8-d15.
@ Clobb: r0-r3, d0-d6, q12-q15, flags.
@
@ Two code paths:
@   core_loop_ht_2 : processes 2 rows x 4 elements per iteration; used when
@                    wd is not a multiple of 4 or ht is not a multiple of 4.
@   core_loop_ht_4 : software-pipelined (prolog / kernel_4 / epilog), 4 rows x
@                    4 elements per stage; used when wd and ht are both
@                    multiples of 4.
@-------------------------------------------------------------------------------

@ Byte offsets of the stack arguments from sp once the prologue has run:
@ 10 core registers (40 bytes) + d8-d15 (64 bytes) = 104 bytes pushed.
    .equ        coeff_offset,   104
    .equ        ht_offset,      108
    .equ        wd_offset,      112

ihevc_inter_pred_chroma_vert_w16inp_w16out_a9q:

    stmfd       sp!, {r4-r12, r14}          @save callee-saved core registers and lr
    vpush       {d8-d15}                    @save callee-saved NEON registers (q4, d12-d15 are used below)

    ldr         r4, [sp,#coeff_offset]      @r4 = pi1_coeff
    ldr         r6, [sp,#wd_offset]         @r6 = wd
    lsl         r2,r2,#1                    @r2 = src_strd in bytes (16-bit elements)
    ldr         r5,[sp,#ht_offset]          @r5 = ht
    vld1.8      {d0},[r4]                   @load 8 coefficient bytes (first 4 are used)
    sub         r4,r0,r2                    @r4 = pi2_src - one row: first tap reads the row above
    vmovl.s8    q0,d0                       @sign-extend coefficients to 16 bit (d0 = c0..c3)

    tst         r6,#3                       @wd multiple of 4 ?
    vdup.16     d12,d0[0]                   @d12 = coeff_0 in every lane
    vdup.16     d13,d0[1]                   @d13 = coeff_1 in every lane
    vdup.16     d14,d0[2]                   @d14 = coeff_2 in every lane
    vdup.16     d15,d0[3]                   @d15 = coeff_3 in every lane

    @ bne, not bgt: tst leaves V untouched, so a GT test (Z==0 && N==V) would
    @ depend on whatever V value the caller happened to leave in the flags.
    bne         core_loop_ht_2              @wd not a multiple of 4 -> 2-row loop

    tst         r5,#3                       @ht multiple of 4 ?
    beq         core_loop_ht_4              @yes -> pipelined 4-row loop, else fall through

@-------------------------------------------------------------------------------
@ 2 rows x 4 elements per inner iteration.
@   r2  = src row pitch in bytes          r3  = dst row pitch in bytes
@   r9  = 4*wd = bytes in one row of 2*wd 16-bit elements
@   r12 = bytes of the current row still to process
@   r6  = dst_pitch_bytes - 2*wd (applied with lsl #1 when advancing r1)
@   r8  = 2*src_pitch_bytes - 4*wd (advance of r4 to the next row pair)
@-------------------------------------------------------------------------------
core_loop_ht_2:
    lsl         r7,r2,#1                    @r7 = 2 source rows in bytes
    lsl         r3,r3,#1                    @r3 = dst_strd in bytes
    lsl         r9,r6,#2                    @r9 = 4*wd (row width in bytes)
    sub         r6,r3,r6,lsl #1             @r6 = dst_pitch_bytes - 2*wd
    sub         r8,r7,r9                    @r8 = 2*src_pitch_bytes - 4*wd
    mov         r12,r9                      @r12 = bytes left in this row

inner_loop_ht_2:
    add         r0,r4,r2                    @r0 = row below the one r4 points at
    vld1.16     {d0},[r4]!                  @d0 = row -1, r4 moves 4 elements right
    vmull.s16   q0,d0,d12                   @q0 = row(-1) * coeff_0   (output row 0)
    subs        r12,r12,#8                  @8 bytes (4 elements) of the row consumed; flags used by bgt below
    vld1.16     {d2},[r0],r2                @d2 = row 0
    vmull.s16   q4,d2,d12                   @q4 = row(0) * coeff_0    (output row 1)
    vld1.16     {d3},[r0],r2                @d3 = row 1
    vmlal.s16   q0,d2,d13                   @q0 += row(0) * coeff_1
    vld1.16     {d6},[r0],r2                @d6 = row 2
    vmlal.s16   q4,d3,d13                   @q4 += row(1) * coeff_1
    vld1.16     {d2},[r0]                   @d2 = row 3 (row 0 no longer needed)
    add         r7,r1,r3                    @r7 = dst address of output row 1
    vmlal.s16   q0,d3,d14                   @q0 += row(1) * coeff_2
    vmlal.s16   q4,d6,d14                   @q4 += row(2) * coeff_2
    vmlal.s16   q0,d6,d15                   @q0 += row(2) * coeff_3
    vmlal.s16   q4,d2,d15                   @q4 += row(3) * coeff_3
    vqshrn.s32  d0,q0,#6                    @>> 6, saturate to 16 bit
    vqshrn.s32  d30,q4,#6                   @>> 6, saturate to 16 bit
    vst1.32     {d0},[r1]!                  @store output row 0, r1 moves 4 elements right
    vst1.32     {d30},[r7]                  @store output row 1
    bgt         inner_loop_ht_2             @more columns in this row pair

    @row pair finished
    subs        r5,r5,#2                    @two rows done
    add         r1,r1,r6,lsl #1             @r1 += 2*dst_pitch_bytes - 4*wd
    mov         r12,r9                      @reload bytes-per-row counter
    add         r4,r4,r8                    @r4 += 2*src_pitch_bytes - 4*wd
    bgt         inner_loop_ht_2             @more row pairs

    b           end_loops                   @done

@-------------------------------------------------------------------------------
@ Pipelined 4 rows x 4 elements per stage.
@   r2  = src row pitch in bytes          r3  = dst row pitch in bytes
@   r11 = 16-bit elements left in the current band of 4 rows (reloaded to 2*wd)
@   r12 = ht*wd/2 - 4, decremented by 4 per 4x4 tile; the final tile is
@         finished by the epilog
@   r8  = 4*src_pitch_bytes - 4*wd (advance of r4 to the next band)
@   lr  = 4*dst_strd - 2*wd (applied with lsl #1 when advancing r1)
@   r9  = running dst pointer for output rows 1..3 of the current tile
@-------------------------------------------------------------------------------
core_loop_ht_4:
    lsl         r7,r2,#2                    @r7 = 4 source rows in bytes
    lsl         r10,r3,#2                   @r10 = 4*dst_strd (elements)
    mov         r11,r6,lsr #1               @r11 = wd/2
    sub         lr,r10,r6,lsl #1            @lr = 4*dst_strd - 2*wd
    sub         r8,r7,r6,lsl #2             @r8 = 4*src_pitch_bytes - 4*wd

    mul         r12,r5,r11                  @r12 = ht * wd/2
    sub         r12,#4                      @reserve one tile for the epilog
    mov         r11,r6,lsl #1               @r11 = 2*wd elements per row
    lsl         r3,r3,#1                    @r3 = dst_strd in bytes

prolog:
    add         r0,r4,r2                    @r0 = row below the one r4 points at
    vld1.16     {d0},[r4]!                  @d0 = row -1, r4 moves 4 elements right
    vld1.16     {d1},[r0],r2                @d1 = row 0
    subs        r11,r11,#4                  @4 elements of the band consumed; flags drive the 'le' ops below
    vld1.16     {d2},[r0],r2                @d2 = row 1
    vmull.s16   q15,d0,d12                  @q15 = row(-1) * coeff_0  (output row 0)
    vld1.16     {d3},[r0],r2                @d3 = row 2
    vmlal.s16   q15,d1,d13
    vmlal.s16   q15,d2,d14
    add         r9,r1,r3                    @r9 = dst address of output row 1
    vmlal.s16   q15,d3,d15

    vld1.16     {d4},[r0],r2                @d4 = row 3
    vmull.s16   q14,d1,d12                  @q14 = row(0) * coeff_0   (output row 1)
    addle       r4,r4,r8                    @band finished: r4 -> start of next band
    movle       r11,r6,lsl #1               @band finished: reload elements-per-row
    vmlal.s16   q14,d2,d13
    vmlal.s16   q14,d3,d14
    vld1.s16    {d5},[r0],r2                @d5 = row 4
    vmlal.s16   q14,d4,d15

    vqshrn.s32  d30,q15,#6                  @output row 0: >> 6, saturate

    vld1.s16    {d6},[r0],r2                @d6 = row 5
    vmull.s16   q13,d2,d12                  @q13 = row(1) * coeff_0   (output row 2)
    vmlal.s16   q13,d3,d13
    vmlal.s16   q13,d4,d14
    add         r0,r4,r2                    @start preloading the next tile
    vld1.16     {d0},[r4]!                  @next tile: row -1
    vmlal.s16   q13,d5,d15

    vqshrn.s32  d28,q14,#6                  @output row 1: >> 6, saturate

    vld1.16     {d1},[r0],r2                @next tile: row 0
    vmull.s16   q12,d3,d12                  @q12 = row(2) * coeff_0   (output row 3)
    vst1.32     {d30},[r1]!                 @store output row 0, r1 moves 4 elements right
    vmlal.s16   q12,d4,d13
    vld1.16     {d2},[r0],r2                @next tile: row 1
    vmlal.s16   q12,d5,d14
    vld1.16     {d3},[r0],r2                @next tile: row 2
    vmlal.s16   q12,d6,d15
    addle       r1,r1,lr,lsl #1             @band finished: r1 -> start of next dst band

    vqshrn.s32  d26,q13,#6                  @output row 2: >> 6, saturate
    subs        r12,r12,#4                  @one tile issued

    beq         epilog                      @only the reserved tile is left

kernel_4:
    vmull.s16   q15,d0,d12                  @q15 = row(-1) * coeff_0  (output row 0)
    subs        r11,r11,#4                  @4 elements of the band consumed; flags drive the 'le' ops below
    vmlal.s16   q15,d1,d13
    vst1.32     {d28},[r9],r3               @previous tile: store output row 1
    vmlal.s16   q15,d2,d14
    vmlal.s16   q15,d3,d15

    vqshrn.s32  d24,q12,#6                  @previous tile: output row 3 >> 6, saturate

    vld1.16     {d4},[r0],r2                @d4 = row 3
    vmull.s16   q14,d1,d12                  @q14 = row(0) * coeff_0   (output row 1)
    vmlal.s16   q14,d2,d13
    vmlal.s16   q14,d3,d14
    vmlal.s16   q14,d4,d15
    vst1.32     {d26},[r9],r3               @previous tile: store output row 2
    addle       r4,r4,r8                    @band finished: r4 -> start of next band
    movle       r11,r6,lsl #1               @band finished: reload elements-per-row

    vqshrn.s32  d30,q15,#6                  @output row 0: >> 6, saturate

    vld1.s16    {d5},[r0],r2                @d5 = row 4
    vmull.s16   q13,d2,d12                  @q13 = row(1) * coeff_0   (output row 2)
    vld1.s16    {d6},[r0],r2                @d6 = row 5
    vmlal.s16   q13,d3,d13
    vst1.32     {d24},[r9]                  @previous tile: store output row 3
    add         r0,r4,r2                    @start preloading the next tile
    vmlal.s16   q13,d4,d14
    vld1.16     {d0},[r4]!                  @next tile: row -1
    vmlal.s16   q13,d5,d15

    vqshrn.s32  d28,q14,#6                  @output row 1: >> 6, saturate

    vld1.16     {d1},[r0],r2                @next tile: row 0
    vmull.s16   q12,d3,d12                  @q12 = row(2) * coeff_0   (output row 3)
    vld1.16     {d2},[r0],r2                @next tile: row 1
    vmlal.s16   q12,d4,d13
    add         r9,r1,r3                    @r9 = dst address of output row 1 of this tile
    vld1.16     {d3},[r0],r2                @next tile: row 2
    vmlal.s16   q12,d5,d14

    vst1.32     {d30},[r1]!                 @store output row 0, r1 moves 4 elements right
    vmlal.s16   q12,d6,d15

    vqshrn.s32  d26,q13,#6                  @output row 2: >> 6, saturate
    addle       r1,r1,lr,lsl #1             @band finished: r1 -> start of next dst band

    subs        r12,r12,#4                  @one tile issued

    bgt         kernel_4                    @more tiles before the reserved one

epilog:
    vmull.s16   q15,d0,d12                  @q15 = row(-1) * coeff_0  (output row 0)
    vst1.32     {d28},[r9],r3               @previous tile: store output row 1
    vmlal.s16   q15,d1,d13
    vmlal.s16   q15,d2,d14
    vmlal.s16   q15,d3,d15

    vqshrn.s32  d24,q12,#6                  @previous tile: output row 3 >> 6, saturate

    vmull.s16   q14,d1,d12                  @q14 = row(0) * coeff_0   (output row 1)
    vld1.16     {d4},[r0],r2                @d4 = row 3
    vmlal.s16   q14,d2,d13
    vst1.32     {d26},[r9],r3               @previous tile: store output row 2
    vmlal.s16   q14,d3,d14
    vmlal.s16   q14,d4,d15

    vqshrn.s32  d30,q15,#6                  @output row 0: >> 6, saturate

    vmull.s16   q13,d2,d12                  @q13 = row(1) * coeff_0   (output row 2)
    vld1.s16    {d5},[r0],r2                @d5 = row 4
    vmlal.s16   q13,d3,d13
    vmlal.s16   q13,d4,d14
    vmlal.s16   q13,d5,d15

    vqshrn.s32  d28,q14,#6                  @output row 1: >> 6, saturate

    vst1.32     {d24},[r9]                  @previous tile: store output row 3
    vmull.s16   q12,d3,d12                  @q12 = row(2) * coeff_0   (output row 3)
    vmlal.s16   q12,d4,d13
    add         r9,r1,r3                    @r9 = dst address of output row 1 of the last tile
    vld1.s16    {d6},[r0],r2                @d6 = row 5
    vmlal.s16   q12,d5,d14
    vmlal.s16   q12,d6,d15
    vst1.32     {d30},[r1]!                 @store output row 0

    vqshrn.s32  d26,q13,#6                  @output row 2: >> 6, saturate

    vst1.32     {d28},[r9],r3               @store output row 1

    vqshrn.s32  d24,q12,#6                  @output row 3: >> 6, saturate
    vst1.32     {d26},[r9],r3               @store output row 2

    vst1.32     {d24},[r9]                  @store output row 3

end_loops:
    vpop        {d8-d15}                    @restore callee-saved NEON registers
    ldmfd       sp!,{r4-r12,r15}            @restore core registers and return (pc <- saved lr)
    326 
    327 
    328 
    329 
    330