@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@******************************************************************************
@* @file
@*  ihevc_inter_pred_filters_luma_vert_w16inp.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@*  functions are coded using neon intrinsics and can be compiled using
@*  rvct
@*
@* @author
@*  yogeswaran rs
@*
@* @par list of functions:
@*
@*  - ihevc_inter_pred_luma_vert_w16inp_w16out()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@/* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
@/* include reconstruction */
@

@/**
@*******************************************************************************
@*
@* @brief
@*    luma vertical filter for 16bit input.
@*
@* @par description:
@*     applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
@*     the elements pointed by 'pi2_src' and writes to the location pointed by
@*     'pu1_dst'. input is 16 bits. the filter output is downshifted by 6 and
@*     an offset of 8192 is subtracted before the 16 bit result is stored
@*     (no clipping). assumptions: the function is optimized assuming that
@*     width is a multiple of 4 and height is a multiple of 2.
@*
@* @param[in] pi2_src
@*  word16 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_luma_vert_w16inp(word16 *pi2_src,
@                                    uword8 *pu1_dst,
@                                    word32 src_strd,
@                                    word32 dst_strd,
@                                    word8 *pi1_coeff,
@                                    word32 ht,
@                                    word32 wd   )
@**************variables vs registers*****************************************
@   r0 => *pi2_src
@   r1 => *pu1_dst
@   r2 =>  src_strd
@   r3 =>  dst_strd
@   r4 => *pi1_coeff
@   r5 =>  ht
@   r6 =>  wd

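@/* for reference, a minimal scalar C sketch of what the neon code below
@   computes. this is an approximation based on the comments above and on the
@   0x80000 offset / #6 shift used in the code, not a verbatim copy of the
@   reference in ihevc_inter_pred_filters.c; pi2_dst is a hypothetical 16 bit
@   view of pu1_dst:
@
@   for(row = 0; row < ht; row++)
@   {
@       for(col = 0; col < wd; col++)
@       {
@           word32 sum = 0;
@           for(i = 0; i < 8; i++)          // 8-tap vertical filter
@               sum += pi2_src[col + (i - 3) * src_strd] * pi1_coeff[i];
@           // (sum - 0x80000) >> 6  ==  (sum >> 6) - 8192
@           pi2_dst[col] = (word16)((sum - 0x80000) >> 6);
@       }
@       pi2_src += src_strd;
@       pi2_dst += dst_strd;
@   }
@*/
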
.text
.align 4




.globl ihevc_inter_pred_luma_vert_w16inp_w16out_a9q

.type ihevc_inter_pred_luma_vert_w16inp_w16out_a9q, %function

ihevc_inter_pred_luma_vert_w16inp_w16out_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments

    ldr         r12,[sp,#40]                @load pi1_coeff
    mov         r6,r3,lsl #1                @r6 = 2*dst_strd (stride in bytes for 16-bit output)
    ldr         r5,[sp,#48]                 @load wd
    vld1.8      {d0},[r12]                  @coeff = vld1_s8(pi1_coeff)
    mov         r2, r2, lsl #1              @r2 = 2*src_strd (stride in bytes for 16-bit input)
    sub         r12,r2,r2,lsl #2            @r12 = -3*src_strd (in bytes)
    @vabs.s8    d0,d0               @vabs_s8(coeff)
    add         r0,r0,r12                   @pi2_src -= 3*src_strd (point to first filter tap row)
    ldr         r3,[sp,#44]                 @load ht
    subs        r7,r3,#0                    @r3->ht
    @ble        end_loops           @end loop jump
    vmovl.s8    q0,d0
    vdup.16     d22,d0[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@
    vdup.16     d23,d0[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)@
    vdup.16     d24,d0[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)@
    vdup.16     d25,d0[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)@
    vdup.16     d26,d1[0]                   @coeffabs_4 = vdup_lane_u8(coeffabs, 4)@
    vdup.16     d27,d1[1]                   @coeffabs_5 = vdup_lane_u8(coeffabs, 5)@
    vdup.16     d28,d1[2]                   @coeffabs_6 = vdup_lane_u8(coeffabs, 6)@
    vdup.16     d29,d1[3]                   @coeffabs_7 = vdup_lane_u8(coeffabs, 7)@
    vmov.i32    q15,#0x80000                @offset: (x - 0x80000) >> 6 == (x >> 6) - 8192

    rsb         r9,r5,r6,lsl #2             @r6->dst_strd   r5->wd
    rsb         r8,r5,r2,lsl #2             @r2->src_strd
    sub         r8,r8,r5
    sub         r9,r9,r5
    mov         r3, r5, lsr #2              @divide wd by 4
    mul         r7, r3                      @r7 = ht * (wd/4), number of 4-wide output groups
    sub         r7, #4                      @reserve one iteration (4 output groups) for the epilog
    mov         r4,r5                       @r5 ->wd
    @mov            r2, r2, lsl #1

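@/* prolog: load the source rows needed for the first four output rows of the
@   current 4-wide column and start their accumulators q4-q7 (one 4-sample
@   output row each), so that the kernel_8 loop below can overlap loads,
@   multiply-accumulates and stores */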
prolog:

    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
    vld1.16     {d1},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vld1.16     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
    subs        r4,r4,#4
    vld1.16     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.s16   q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vld1.16     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q4,d0,d22                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)@
    vld1.16     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q4,d2,d24                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)@
    vld1.16     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vld1.16     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vld1.16     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q4,d5,d27                   @mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)@
    vmlal.s16   q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vmlal.s16   q4,d7,d29                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)@

    vld1.16     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@

    vmull.s16   q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
    addle       r0,r0,r8,lsl #0
    vmlal.s16   q5,d1,d22                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)@
    movle       r4,r5                       @r5 ->wd
    vmlal.s16   q5,d3,d24                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)@
    vld1.16     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    vld1.16     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
    vmlal.s16   q5,d6,d27                   @mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)@
    vmlal.s16   q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
    vmlal.s16   q5,d16,d29                  @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)@
    vsub.s32    q4, q4, q15

    vld1.16     {d1},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.s16   q6,d3,d23
    vld1.16     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q6,d2,d22
    vld1.16     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q6,d4,d24
    vmlal.s16   q6,d5,d25
    vmlal.s16   q6,d6,d26
    vmlal.s16   q6,d7,d27
    vmlal.s16   q6,d16,d28
    vmlal.s16   q6,d17,d29
    add         r14,r1,r6
    vsub.s32    q5, q5, q15
    vshrn.s32   d8, q4, #6
    @vqrshrun.s16 d8,q4,#6          @sto_res = vqmovun_s16(sto_res_tmp)@

    vmull.s16   q7,d4,d23
    vmlal.s16   q7,d3,d22
    vmlal.s16   q7,d5,d24
    vmlal.s16   q7,d6,d25
    vld1.16     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d7,d26
    vld1.16     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d16,d27
    vld1.16     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d17,d28
    vld1.16     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d18,d29
    vld1.16     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@

    vst1.32     {d8},[r1]!                  @vst1_u8(pu1_dst,sto_res)@
    vsub.s32    q6, q6, q15
    vshrn.s32   d10, q5, #6
    @vqrshrun.s16 d10,q5,#6         @sto_res = vqmovun_s16(sto_res_tmp)@
    addle       r1,r1,r9

    subs        r7,r7,#4


    blt         epilog_end                  @jumps to epilog_end
    beq         epilog                      @jumps to epilog

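@/* kernel_8: software-pipelined main loop. each iteration stores four finished
@   4-sample result rows while loading new source rows and starting the
@   accumulators q4-q7 for the next four output rows */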
kernel_8:

    vmull.s16   q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    subs        r4,r4,#4
    vmlal.s16   q4,d0,d22                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)@
    addle       r0,r0,r8,lsl #0
    vmlal.s16   q4,d2,d24                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)@
    vmlal.s16   q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vmlal.s16   q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vmlal.s16   q4,d5,d27                   @mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)@
    vmlal.s16   q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vmlal.s16   q4,d7,d29                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)@
    vst1.32     {d10},[r14],r6              @vst1_u8(pu1_dst_tmp,sto_res)@

    vsub.s32    q7, q7, q15
    vshrn.s32   d12, q6, #6
    @vqrshrun.s16 d12,q6,#6
    vld1.16     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@

    vmull.s16   q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
    vmlal.s16   q5,d1,d22                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)@
    vmlal.s16   q5,d3,d24                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)@
    vmlal.s16   q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    vmlal.s16   q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    vmlal.s16   q5,d6,d27                   @mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)@
    vst1.32     {d12},[r14],r6

    vmlal.s16   q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
    vld1.16     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@

    vmlal.s16   q5,d16,d29                  @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)@

    vsub.s32    q4, q4, q15
    vshrn.s32   d14, q7, #6
    @vqrshrun.s16 d14,q7,#6

    vmull.s16   q6,d3,d23
    movle       r4,r5                       @r5 ->wd

    vmlal.s16   q6,d2,d22
    vld1.16     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@

    vmlal.s16   q6,d4,d24
    add         r3,r0,r2                    @pu1_src_tmp += src_strd@

    vmlal.s16   q6,d5,d25

    vmlal.s16   q6,d6,d26
    vst1.32     {d14},[r14],r6

    vmlal.s16   q6,d7,d27
    vld1.16     {d1},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@

    vmlal.s16   q6,d16,d28
    add         r14,r1,r6

    vmlal.s16   q6,d17,d29
    vld1.16     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@

    vsub.s32    q5, q5, q15
    vshrn.s32   d8, q4, #6
    @vqrshrun.s16 d8,q4,#6          @sto_res = vqmovun_s16(sto_res_tmp)@
    vld1.16     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@

    vmull.s16   q7,d4,d23
    vmlal.s16   q7,d3,d22
    vmlal.s16   q7,d5,d24
    vld1.16     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@

    vmlal.s16   q7,d6,d25
    vld1.16     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d7,d26
    vld1.16     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d16,d27
    vld1.16     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d17,d28
    vld1.16     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d18,d29
    vst1.32     {d8},[r1]!                  @vst1_u8(pu1_dst,sto_res)@

    vsub.s32    q6, q6, q15
    vshrn.s32   d10, q5, #6
    addle       r1,r1,r9

    @vqrshrun.s16 d10,q5,#6         @sto_res = vqmovun_s16(sto_res_tmp)@
    subs        r7,r7,#4

    bgt         kernel_8                    @jumps to kernel_8

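@/* epilog: drain the software pipeline. the accumulators started in the last
@   kernel_8 iteration are completed and stored; only the few remaining source
@   rows still needed are loaded */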
epilog:

    vmull.s16   q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vmlal.s16   q4,d0,d22                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)@
    vmlal.s16   q4,d2,d24                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)@
    vmlal.s16   q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vmlal.s16   q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vmlal.s16   q4,d5,d27                   @mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)@
    vmlal.s16   q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vmlal.s16   q4,d7,d29                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)@
    vst1.32     {d10},[r14],r6

    vsub.s32    q7, q7, q15
    vshrn.s32   d12, q6, #6
    @vqrshrun.s16 d12,q6,#6

    vld1.16     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmull.s16   q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
    vmlal.s16   q5,d1,d22                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)@
    vmlal.s16   q5,d3,d24                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)@
    vmlal.s16   q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    vmlal.s16   q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    vmlal.s16   q5,d6,d27                   @mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)@
    vmlal.s16   q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
    vmlal.s16   q5,d16,d29                  @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)@
    vst1.32     {d12},[r14],r6

    vsub.s32    q4, q4, q15
    vshrn.s32   d14, q7, #6
    @vqrshrun.s16 d14,q7,#6

    vld1.16     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmull.s16   q6,d3,d23
    vmlal.s16   q6,d2,d22
    vmlal.s16   q6,d4,d24
    vmlal.s16   q6,d5,d25
    vmlal.s16   q6,d6,d26
    vmlal.s16   q6,d7,d27
    vmlal.s16   q6,d16,d28
    vmlal.s16   q6,d17,d29
    vst1.32     {d14},[r14],r6
    vsub.s32    q5, q5, q15
    vshrn.s32   d8, q4, #6
    @vqrshrun.s16 d8,q4,#6          @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.16     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.s16   q7,d4,d23
    vmlal.s16   q7,d3,d22
    vmlal.s16   q7,d5,d24
    vmlal.s16   q7,d6,d25
    vmlal.s16   q7,d7,d26
    vmlal.s16   q7,d16,d27
    vmlal.s16   q7,d17,d28
    vmlal.s16   q7,d18,d29
    vsub.s32    q6, q6, q15
    vshrn.s32   d10, q5, #6
    @vqrshrun.s16 d10,q5,#6         @sto_res = vqmovun_s16(sto_res_tmp)@

    add         r14,r1,r6
    vst1.32     {d8},[r1]!                  @vst1_u8(pu1_dst,sto_res)@

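@/* epilog_end: narrow and store the result rows still pending in q5, q6 and q7 */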
epilog_end:
    vst1.32     {d10},[r14],r6              @vst1_u8(pu1_dst_tmp,sto_res)@
    vshrn.s32   d12, q6, #6
    @vqrshrun.s16 d12,q6,#6

    vst1.32     {d12},[r14],r6
    vsub.s32    q7, q7, q15
    vshrn.s32   d14, q7, #6
    @vqrshrun.s16 d14,q7,#6

    vst1.32     {d14},[r14],r6


end_loops:

    ldmfd       sp!,{r4-r12,r15}            @restore the registers and return (pc popped from stack)
    405