@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@******************************************************************************
@* @file
@*  ihevc_inter_pred_filters_luma_vert_w16inp.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@* functions are coded using neon intrinsics and can be compiled using
@* rvct
@*
@* @author
@*  yogeswaran rs
@*
@* @par list of functions:
@*
@*  - ihevc_inter_pred_filters_luma_vert_w16inp()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@/* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
@/* include reconstruction */
@

@/**
@*******************************************************************************
@*
@* @brief
@*    luma vertical filter for 16bit input.
@*
@* @par description:
@*     applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
@*     the elements pointed to by 'pi2_src' and writes to the location pointed
@*     to by 'pu1_dst'. the input is 16 bits; the filter output is downshifted
@*     by 12 and clipped to lie between 0 and 255. assumptions: the function is
@*     optimized assuming the width is a multiple of 4 and the height a
@*     multiple of 2.
@*
@* @param[in] pi2_src
@*  word16 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_luma_vert_w16inp(word16 *pi2_src,
@                                    uword8 *pu1_dst,
@                                    word32 src_strd,
@                                    word32 dst_strd,
@                                    word8 *pi1_coeff,
@                                    word32 ht,
@                                    word32 wd)

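@/* a minimal scalar c sketch (editor's addition, not part of the original
@ * source) of what the neon code below implements, assuming the behaviour
@ * described in the header: an 8-tap vertical filter whose taps start 3
@ * rows above the current sample, a rounded downshift of 12 (the neon
@ * path splits this into a #6 shift plus a rounding #6 shift, with extra
@ * intermediate saturation not modelled here), and a clip to 0..255.
@ *
@ * void inter_pred_luma_vert_w16inp_sketch(word16 *pi2_src, uword8 *pu1_dst,
@ *                                         word32 src_strd, word32 dst_strd,
@ *                                         word8 *pi1_coeff, word32 ht, word32 wd)
@ * {
@ *     for(word32 row = 0; row < ht; row++)
@ *         for(word32 col = 0; col < wd; col++)
@ *         {
@ *             word32 sum = 0;
@ *             for(word32 i = 0; i < 8; i++)   // 8-tap filter, taps at rows -3..+4
@ *                 sum += pi1_coeff[i] * pi2_src[(row + i - 3) * src_strd + col];
@ *             sum = (sum + (1 << 11)) >> 12;                  // downshift by 12, rounded
@ *             sum = sum < 0 ? 0 : (sum > 255 ? 255 : sum);    // clip to 0..255
@ *             pu1_dst[row * dst_strd + col] = (uword8)sum;
@ *         }
@ * }
@ */
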
.equ    coeff_offset,   104
.equ    ht_offset,      108
.equ    wd_offset,      112
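@ the offsets above follow from the prologue below: stmfd saves r4-r12,r14
@ (10 words = 40 bytes) and vpush saves d8-d15 (64 bytes), so the fifth
@ argument pi1_coeff sits at sp + 104 once both pushes are done.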

.text
.align 4


.globl ihevc_inter_pred_luma_vert_w16inp_a9q

.type ihevc_inter_pred_luma_vert_w16inp_a9q, %function

ihevc_inter_pred_luma_vert_w16inp_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    vpush       {d8 - d15}

    ldr         r12,[sp,#coeff_offset]      @load pi1_coeff
    mov         r6,r3                       @r6 = dst_strd
    ldr         r5,[sp,#wd_offset]          @load wd
    vld1.8      {d0},[r12]                  @coeff = vld1_s8(pi1_coeff)
    mov         r2, r2, lsl #1              @src_strd in bytes (input is 16 bit)
    sub         r12,r2,r2,lsl #2            @r12 = -3*src_strd (taps start 3 rows above)
    @vabs.s8    d0,d0               @vabs_s8(coeff)
    add         r0,r0,r12                   @r0 = pi2_src - 3*src_strd
    ldr         r3,[sp,#ht_offset]          @load ht
    subs        r7,r3,#0                    @r7 = ht
    @ble        end_loops           @end loop jump
    vmovl.s8    q0,d0                       @sign-extend coefficients to 16 bit
    vdup.16     d22,d0[0]                   @coeffabs_0 = vdup_lane_s16(coeffabs, 0)@
    vdup.16     d23,d0[1]                   @coeffabs_1 = vdup_lane_s16(coeffabs, 1)@
    vdup.16     d24,d0[2]                   @coeffabs_2 = vdup_lane_s16(coeffabs, 2)@
    vdup.16     d25,d0[3]                   @coeffabs_3 = vdup_lane_s16(coeffabs, 3)@
    vdup.16     d26,d1[0]                   @coeffabs_4 = vdup_lane_s16(coeffabs, 4)@
    vdup.16     d27,d1[1]                   @coeffabs_5 = vdup_lane_s16(coeffabs, 5)@
    vdup.16     d28,d1[2]                   @coeffabs_6 = vdup_lane_s16(coeffabs, 6)@
    vdup.16     d29,d1[3]                   @coeffabs_7 = vdup_lane_s16(coeffabs, 7)@
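
@ register map for the code below (summary comment added for readability):
@   r0 -> pi2_src (3 rows above the current row)    r1 -> pu1_dst
@   r2 -> src_strd in bytes                         r6 -> dst_strd
@   r4 -> column counter (starts at wd)             r7 -> 4-pixel block counter
@   d22-d29 -> the eight sign-extended 16-bit filter coefficients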

    rsb         r9,r5,r6,lsl #2             @r9 = 4*dst_strd - wd (dst jump to next 4-row strip)
    rsb         r8,r5,r2,lsl #2             @r2 -> src_strd in bytes
    sub         r8,r8,r5                    @r8 = 4*src_strd - 2*wd bytes (src jump to next strip)
    mov         r3, r5, lsr #2              @r3 = wd/4
    mul         r7, r3                      @loop count r7 = ht * wd/4
    sub         r7, #4                      @reserve one iteration's worth for the epilog
    mov         r4,r5                       @r4 = column counter, starts at wd
    @mov            r2, r2, lsl #1

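@ the main loop is software pipelined: the prolog below primes the four row
@ accumulators q4-q7 and stores the first result, kernel_8 computes the next
@ four output rows while narrowing and storing the previous ones, and
@ epilog/epilog_end drain the last four results.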
prolog:

    add         r3,r0,r2                    @pi2_src_tmp += src_strd@
    vld1.16     {d1},[r3],r2                @src_tmp2 = vld1_s16(pi2_src_tmp)@
    vld1.16     {d0},[r0]!                  @src_tmp1 = vld1_s16(pi2_src_tmp)@
    subs        r4,r4,#4                    @4 columns consumed
    vld1.16     {d2},[r3],r2                @src_tmp3 = vld1_s16(pi2_src_tmp)@
    vmull.s16   q4,d1,d23                   @mul_res1 = vmull_s16(src_tmp2, coeffabs_1)@
    vld1.16     {d3},[r3],r2                @src_tmp4 = vld1_s16(pi2_src_tmp)@
    vmlal.s16   q4,d0,d22                   @mul_res1 = vmlal_s16(mul_res1, src_tmp1, coeffabs_0)@
    vld1.16     {d4},[r3],r2                @src_tmp1 = vld1_s16(pi2_src_tmp)@
    vmlal.s16   q4,d2,d24                   @mul_res1 = vmlal_s16(mul_res1, src_tmp3, coeffabs_2)@
    vld1.16     {d5},[r3],r2                @src_tmp2 = vld1_s16(pi2_src_tmp)@
    vmlal.s16   q4,d3,d25                   @mul_res1 = vmlal_s16(mul_res1, src_tmp4, coeffabs_3)@
    vld1.16     {d6},[r3],r2                @src_tmp3 = vld1_s16(pi2_src_tmp)@
    vmlal.s16   q4,d4,d26                   @mul_res1 = vmlal_s16(mul_res1, src_tmp1, coeffabs_4)@
    vld1.16     {d7},[r3],r2                @src_tmp4 = vld1_s16(pi2_src_tmp)@
    vmlal.s16   q4,d5,d27                   @mul_res1 = vmlal_s16(mul_res1, src_tmp2, coeffabs_5)@
    vmlal.s16   q4,d6,d28                   @mul_res1 = vmlal_s16(mul_res1, src_tmp3, coeffabs_6)@
    vmlal.s16   q4,d7,d29                   @mul_res1 = vmlal_s16(mul_res1, src_tmp4, coeffabs_7)@

    vld1.16     {d16},[r3],r2               @src_tmp1 = vld1_s16(pi2_src_tmp)@

    vmull.s16   q5,d2,d23                   @mul_res2 = vmull_s16(src_tmp3, coeffabs_1)@
    addle       r0,r0,r8,lsl #0             @src jump to next 4-row strip when columns are done
    vmlal.s16   q5,d1,d22                   @mul_res2 = vmlal_s16(mul_res2, src_tmp2, coeffabs_0)@
    movle       r4,r5                       @reset column counter to wd
    vmlal.s16   q5,d3,d24                   @mul_res2 = vmlal_s16(mul_res2, src_tmp4, coeffabs_2)@
    vld1.16     {d17},[r3],r2               @src_tmp2 = vld1_s16(pi2_src_tmp)@
    vmlal.s16   q5,d4,d25                   @mul_res2 = vmlal_s16(mul_res2, src_tmp1, coeffabs_3)@
    vld1.16     {d18},[r3],r2               @src_tmp3 = vld1_s16(pi2_src_tmp)@
    vmlal.s16   q5,d5,d26                   @mul_res2 = vmlal_s16(mul_res2, src_tmp2, coeffabs_4)@
    add         r3,r0,r2                    @pi2_src_tmp += src_strd@
    vmlal.s16   q5,d6,d27                   @mul_res2 = vmlal_s16(mul_res2, src_tmp3, coeffabs_5)@
    vmlal.s16   q5,d7,d28                   @mul_res2 = vmlal_s16(mul_res2, src_tmp4, coeffabs_6)@
    vmlal.s16   q5,d16,d29                  @mul_res2 = vmlal_s16(mul_res2, src_tmp1, coeffabs_7)@
    vqshrn.s32  d8, q4, #6                  @first stage shift, 32 -> 16 bit

    vld1.16     {d1},[r3],r2                @src_tmp3 = vld1_s16(pi2_src_tmp)@
    vmull.s16   q6,d3,d23
    vld1.16     {d0},[r0]!                  @src_tmp1 = vld1_s16(pi2_src_tmp)@
    vmlal.s16   q6,d2,d22
    vld1.16     {d2},[r3],r2                @src_tmp3 = vld1_s16(pi2_src_tmp)@
    vmlal.s16   q6,d4,d24
    vmlal.s16   q6,d5,d25
    vmlal.s16   q6,d6,d26
    vmlal.s16   q6,d7,d27
    vmlal.s16   q6,d16,d28
    vmlal.s16   q6,d17,d29
    add         r14,r1,r6                   @r14 -> pu1_dst row 2
    vqshrn.s32  d10, q5, #6
    vqrshrun.s16 d8,q4,#6                   @sto_res = vqmovun_s16(sto_res_tmp)@

    vmull.s16   q7,d4,d23
    vmlal.s16   q7,d3,d22
    vmlal.s16   q7,d5,d24
    vmlal.s16   q7,d6,d25
    vld1.16     {d3},[r3],r2                @src_tmp4 = vld1_s16(pi2_src_tmp)@
    vmlal.s16   q7,d7,d26
    vld1.16     {d4},[r3],r2                @src_tmp1 = vld1_s16(pi2_src_tmp)@
    vmlal.s16   q7,d16,d27
    vld1.16     {d5},[r3],r2                @src_tmp2 = vld1_s16(pi2_src_tmp)@
    vmlal.s16   q7,d17,d28
    vld1.16     {d6},[r3],r2                @src_tmp3 = vld1_s16(pi2_src_tmp)@
    vmlal.s16   q7,d18,d29
    vld1.16     {d7},[r3],r2                @src_tmp4 = vld1_s16(pi2_src_tmp)@

    vst1.32     {d8[0]},[r1]!               @vst1_u8(pu1_dst,sto_res)@
    vqshrn.s32  d12, q6, #6
    vqrshrun.s16 d10,q5,#6                  @sto_res = vqmovun_s16(sto_res_tmp)@
    addle       r1,r1,r9                    @dst jump to next 4-row strip

    subs        r7,r7,#4                    @4 result blocks done

    blt         epilog_end                  @jumps to epilog_end
    beq         epilog                      @jumps to epilog

kernel_8:

    vmull.s16   q4,d1,d23                   @mul_res1 = vmull_s16(src_tmp2, coeffabs_1)@
    subs        r4,r4,#4                    @4 columns consumed
    vmlal.s16   q4,d0,d22                   @mul_res1 = vmlal_s16(mul_res1, src_tmp1, coeffabs_0)@
    addle       r0,r0,r8,lsl #0             @src jump to next 4-row strip when columns are done
    vmlal.s16   q4,d2,d24                   @mul_res1 = vmlal_s16(mul_res1, src_tmp3, coeffabs_2)@
    vmlal.s16   q4,d3,d25                   @mul_res1 = vmlal_s16(mul_res1, src_tmp4, coeffabs_3)@
    vmlal.s16   q4,d4,d26                   @mul_res1 = vmlal_s16(mul_res1, src_tmp1, coeffabs_4)@
    vmlal.s16   q4,d5,d27                   @mul_res1 = vmlal_s16(mul_res1, src_tmp2, coeffabs_5)@
    vmlal.s16   q4,d6,d28                   @mul_res1 = vmlal_s16(mul_res1, src_tmp3, coeffabs_6)@
    vmlal.s16   q4,d7,d29                   @mul_res1 = vmlal_s16(mul_res1, src_tmp4, coeffabs_7)@
    vst1.32     {d10[0]},[r14],r6           @vst1_u8(pu1_dst_tmp,sto_res)@

    vqshrn.s32  d14, q7, #6
    vqrshrun.s16 d12,q6,#6
    vld1.16     {d16},[r3],r2               @src_tmp1 = vld1_s16(pi2_src_tmp)@

    vmull.s16   q5,d2,d23                   @mul_res2 = vmull_s16(src_tmp3, coeffabs_1)@
    vmlal.s16   q5,d1,d22                   @mul_res2 = vmlal_s16(mul_res2, src_tmp2, coeffabs_0)@
    vmlal.s16   q5,d3,d24                   @mul_res2 = vmlal_s16(mul_res2, src_tmp4, coeffabs_2)@
    vmlal.s16   q5,d4,d25                   @mul_res2 = vmlal_s16(mul_res2, src_tmp1, coeffabs_3)@
    vmlal.s16   q5,d5,d26                   @mul_res2 = vmlal_s16(mul_res2, src_tmp2, coeffabs_4)@
    vmlal.s16   q5,d6,d27                   @mul_res2 = vmlal_s16(mul_res2, src_tmp3, coeffabs_5)@
    vst1.32     {d12[0]},[r14],r6

    vmlal.s16   q5,d7,d28                   @mul_res2 = vmlal_s16(mul_res2, src_tmp4, coeffabs_6)@
    vld1.16     {d17},[r3],r2               @src_tmp2 = vld1_s16(pi2_src_tmp)@

    vmlal.s16   q5,d16,d29                  @mul_res2 = vmlal_s16(mul_res2, src_tmp1, coeffabs_7)@

    vqshrn.s32  d8, q4, #6
    vqrshrun.s16 d14,q7,#6

    vmull.s16   q6,d3,d23
    movle       r4,r5                       @reset column counter to wd

    vmlal.s16   q6,d2,d22
    vld1.16     {d18},[r3],r2               @src_tmp3 = vld1_s16(pi2_src_tmp)@

    vmlal.s16   q6,d4,d24
    add         r3,r0,r2                    @pi2_src_tmp += src_strd@

    vmlal.s16   q6,d5,d25

    vmlal.s16   q6,d6,d26
    vst1.32     {d14[0]},[r14],r6

    vmlal.s16   q6,d7,d27
    vld1.16     {d1},[r3],r2                @src_tmp2 = vld1_s16(pi2_src_tmp)@

    vmlal.s16   q6,d16,d28
    add         r14,r1,r6                   @r14 -> pu1_dst row 2

    vmlal.s16   q6,d17,d29
    vld1.16     {d0},[r0]!                  @src_tmp1 = vld1_s16(pi2_src_tmp)@

    vqshrn.s32  d10, q5, #6
    vqrshrun.s16 d8,q4,#6                   @sto_res = vqmovun_s16(sto_res_tmp)@
    vld1.16     {d2},[r3],r2                @src_tmp3 = vld1_s16(pi2_src_tmp)@

    vmull.s16   q7,d4,d23
    vmlal.s16   q7,d3,d22
    vmlal.s16   q7,d5,d24
    vld1.16     {d3},[r3],r2                @src_tmp4 = vld1_s16(pi2_src_tmp)@

    vmlal.s16   q7,d6,d25
    vld1.16     {d4},[r3],r2                @src_tmp1 = vld1_s16(pi2_src_tmp)@
    vmlal.s16   q7,d7,d26
    vld1.16     {d5},[r3],r2                @src_tmp2 = vld1_s16(pi2_src_tmp)@
    vmlal.s16   q7,d16,d27
    vld1.16     {d6},[r3],r2                @src_tmp3 = vld1_s16(pi2_src_tmp)@
    vmlal.s16   q7,d17,d28
    vld1.16     {d7},[r3],r2                @src_tmp4 = vld1_s16(pi2_src_tmp)@
    vmlal.s16   q7,d18,d29
    vst1.32     {d8[0]},[r1]!               @vst1_u8(pu1_dst,sto_res)@

    vqshrn.s32  d12, q6, #6
    addle       r1,r1,r9                    @dst jump to next 4-row strip

    vqrshrun.s16 d10,q5,#6                  @sto_res = vqmovun_s16(sto_res_tmp)@
    subs        r7,r7,#4                    @4 result blocks done

    bgt         kernel_8                    @jumps to kernel_8

epilog:

    vmull.s16   q4,d1,d23                   @mul_res1 = vmull_s16(src_tmp2, coeffabs_1)@
    vmlal.s16   q4,d0,d22                   @mul_res1 = vmlal_s16(mul_res1, src_tmp1, coeffabs_0)@
    vmlal.s16   q4,d2,d24                   @mul_res1 = vmlal_s16(mul_res1, src_tmp3, coeffabs_2)@
    vmlal.s16   q4,d3,d25                   @mul_res1 = vmlal_s16(mul_res1, src_tmp4, coeffabs_3)@
    vmlal.s16   q4,d4,d26                   @mul_res1 = vmlal_s16(mul_res1, src_tmp1, coeffabs_4)@
    vmlal.s16   q4,d5,d27                   @mul_res1 = vmlal_s16(mul_res1, src_tmp2, coeffabs_5)@
    vmlal.s16   q4,d6,d28                   @mul_res1 = vmlal_s16(mul_res1, src_tmp3, coeffabs_6)@
    vmlal.s16   q4,d7,d29                   @mul_res1 = vmlal_s16(mul_res1, src_tmp4, coeffabs_7)@
    vst1.32     {d10[0]},[r14],r6

    vqshrn.s32  d14, q7, #6
    vqrshrun.s16 d12,q6,#6

    vld1.16     {d16},[r3],r2               @src_tmp1 = vld1_s16(pi2_src_tmp)@
    vmull.s16   q5,d2,d23                   @mul_res2 = vmull_s16(src_tmp3, coeffabs_1)@
    vmlal.s16   q5,d1,d22                   @mul_res2 = vmlal_s16(mul_res2, src_tmp2, coeffabs_0)@
    vmlal.s16   q5,d3,d24                   @mul_res2 = vmlal_s16(mul_res2, src_tmp4, coeffabs_2)@
    vmlal.s16   q5,d4,d25                   @mul_res2 = vmlal_s16(mul_res2, src_tmp1, coeffabs_3)@
    vmlal.s16   q5,d5,d26                   @mul_res2 = vmlal_s16(mul_res2, src_tmp2, coeffabs_4)@
    vmlal.s16   q5,d6,d27                   @mul_res2 = vmlal_s16(mul_res2, src_tmp3, coeffabs_5)@
    vmlal.s16   q5,d7,d28                   @mul_res2 = vmlal_s16(mul_res2, src_tmp4, coeffabs_6)@
    vmlal.s16   q5,d16,d29                  @mul_res2 = vmlal_s16(mul_res2, src_tmp1, coeffabs_7)@
    vst1.32     {d12[0]},[r14],r6

    vqshrn.s32  d8, q4, #6
    vqrshrun.s16 d14,q7,#6

    vld1.16     {d17},[r3],r2               @src_tmp2 = vld1_s16(pi2_src_tmp)@
    vmull.s16   q6,d3,d23
    vmlal.s16   q6,d2,d22
    vmlal.s16   q6,d4,d24
    vmlal.s16   q6,d5,d25
    vmlal.s16   q6,d6,d26
    vmlal.s16   q6,d7,d27
    vmlal.s16   q6,d16,d28
    vmlal.s16   q6,d17,d29
    vst1.32     {d14[0]},[r14],r6
    vqshrn.s32  d10, q5, #6
    vqrshrun.s16 d8,q4,#6                   @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.16     {d18},[r3],r2               @src_tmp3 = vld1_s16(pi2_src_tmp)@
    vmull.s16   q7,d4,d23
    vmlal.s16   q7,d3,d22
    vmlal.s16   q7,d5,d24
    vmlal.s16   q7,d6,d25
    vmlal.s16   q7,d7,d26
    vmlal.s16   q7,d16,d27
    vmlal.s16   q7,d17,d28
    vmlal.s16   q7,d18,d29
    vqshrn.s32  d12, q6, #6
    vqrshrun.s16 d10,q5,#6                  @sto_res = vqmovun_s16(sto_res_tmp)@

    add         r14,r1,r6                   @r14 -> pu1_dst row 2
    vst1.32     {d8[0]},[r1]!               @vst1_u8(pu1_dst,sto_res)@

epilog_end:
    vst1.32     {d10[0]},[r14],r6           @vst1_u8(pu1_dst_tmp,sto_res)@
    vqrshrun.s16 d12,q6,#6

    vst1.32     {d12[0]},[r14],r6
    vqshrn.s32  d14, q7, #6
    vqrshrun.s16 d14,q7,#6

    vst1.32     {d14[0]},[r14],r6


end_loops:

    vpop        {d8 - d15}
    ldmfd       sp!,{r4-r12,r15}            @restore registers and return (pc popped from the saved lr slot)