@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@******************************************************************************
@* @file
@*  ihevc_inter_pred_filters_luma_vert_w16inp.s
@*
@* @brief
@*  Contains function definitions for inter prediction interpolation.
@*  Functions are coded in ARM NEON assembly and can be compiled using RVCT.
@*
@* @author
@*  yogeswaran rs
@*
@* @par list of functions:
@*
@*  - ihevc_inter_pred_luma_vert_w16inp_w16out_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@/* All the functions here are replicated from ihevc_inter_pred_filters.c */
@/* and modified to include reconstruction */
@

@/**
@*******************************************************************************
@*
@* @brief
@*    Luma vertical filter for 16-bit input and 16-bit output.
@*
@* @par description:
@*    Applies a vertical filter with coefficients pointed to by 'pi1_coeff'
@*    to the elements pointed to by 'pi2_src' and writes to the location
@*    pointed to by 'pu1_dst'. Input is 16 bits. The filter output has the
@*    intermediate offset (0x80000) removed, is downshifted by 6, and is
@*    stored as 16-bit values. Assumptions: the function is optimized
@*    assuming that width is a multiple of 4 and height is a multiple of 2.
@*
@* @param[in] pi2_src
@*  word16 pointer to the source
@*
@* @param[out] pu1_dst
@*  pointer to the destination (16-bit values are stored)
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
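
@/* For reference, a minimal, hypothetical C sketch of what this routine
@ * computes (ref_luma_vert_w16inp_w16out and pi2_dst are illustrative names,
@ * not library API; the 8-tap structure, the 0x80000 bias removal, and the
@ * >>6 shift are taken from the assembly below):
@ *
@ * void ref_luma_vert_w16inp_w16out(int16_t *pi2_src, int16_t *pi2_dst,
@ *                                  int32_t src_strd, int32_t dst_strd,
@ *                                  int8_t *pi1_coeff, int32_t ht, int32_t wd)
@ * {
@ *     for (int row = 0; row < ht; row++)
@ *     {
@ *         for (int col = 0; col < wd; col++)
@ *         {
@ *             int32_t sum = 0;
@ *             for (int i = 0; i < 8; i++)   // 8 filter taps, rows -3..+4
@ *                 sum += pi1_coeff[i] * pi2_src[col + (i - 3) * src_strd];
@ *             pi2_dst[col] = (int16_t)((sum - 0x80000) >> 6);
@ *         }
@ *         pi2_src += src_strd;
@ *         pi2_dst += dst_strd;
@ *     }
@ * }
@ */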

@void ihevc_inter_pred_luma_vert_w16inp_w16out(word16 *pi2_src,
@                                    uword8 *pu1_dst,
@                                    word32 src_strd,
@                                    word32 dst_strd,
@                                    word8 *pi1_coeff,
@                                    word32 ht,
@                                    word32 wd   )
@**************variables vs registers*****************************************
@   r0 => *pi2_src
@   r1 => *pu1_dst
@   r2 =>  src_strd
@   r3 =>  dst_strd
@   r12 => *pi1_coeff (loaded from the stack)
@   r5 =>  wd (loaded from the stack)
@   r7 =>  ht (loaded from the stack via r3)

.equ    coeff_offset,   104
.equ    ht_offset,      108
.equ    wd_offset,      112
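@ The stack offsets above account for the register saves at function entry:
@ stmfd {r4-r12, r14} pushes 10 words (40 bytes) and vpush {d8-d15} pushes
@ 64 bytes, so the first stack argument (pi1_coeff) lands at sp + 104.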

.text
.align 4

.globl ihevc_inter_pred_luma_vert_w16inp_w16out_a9q

.type ihevc_inter_pred_luma_vert_w16inp_w16out_a9q, %function

ihevc_inter_pred_luma_vert_w16inp_w16out_a9q:

    stmfd       sp!, {r4-r12, r14}          @push callee-saved registers
    vpush       {d8 - d15}

    ldr         r12,[sp,#coeff_offset]      @load pi1_coeff
    mov         r6,r3,lsl #1                @dst_strd in bytes (16-bit output)
    ldr         r5,[sp,#wd_offset]          @load wd
    vld1.8      {d0},[r12]                  @coeff = vld1_s8(pi1_coeff)
    mov         r2, r2, lsl #1              @src_strd in bytes (16-bit input)
    sub         r12,r2,r2,lsl #2            @r12 = -3*src_strd
    @vabs.s8    d0,d0                       @vabs_s8(coeff)
    add         r0,r0,r12                   @pi2_src -= 3*src_strd (first filter tap row)
    ldr         r3,[sp,#ht_offset]          @load ht
    subs        r7,r3,#0                    @r7 = ht
    @ble        end_loops                   @end loop jump
    vmovl.s8    q0,d0                       @widen the coefficients to 16 bits
    vdup.16     d22,d0[0]                   @coeff_0 = vdup_lane_s16(coeff, 0)
    vdup.16     d23,d0[1]                   @coeff_1 = vdup_lane_s16(coeff, 1)
    vdup.16     d24,d0[2]                   @coeff_2 = vdup_lane_s16(coeff, 2)
    vdup.16     d25,d0[3]                   @coeff_3 = vdup_lane_s16(coeff, 3)
    vdup.16     d26,d1[0]                   @coeff_4 = vdup_lane_s16(coeff, 4)
    vdup.16     d27,d1[1]                   @coeff_5 = vdup_lane_s16(coeff, 5)
    vdup.16     d28,d1[2]                   @coeff_6 = vdup_lane_s16(coeff, 6)
    vdup.16     d29,d1[3]                   @coeff_7 = vdup_lane_s16(coeff, 7)
    vmov.i32    q15,#0x80000
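@ q15 holds 0x80000: it is subtracted from every 32-bit accumulator before
@ the vshrn #6 narrowing below, so each output is (sum - 0x80000) >> 6.
@ The constant presumably cancels the bias carried by the 16-bit
@ intermediates produced by the companion horizontal w16out pass.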

    rsb         r9,r5,r6,lsl #2             @r9 = 4 output rows in bytes - wd
    rsb         r8,r5,r2,lsl #2             @r8 = 4 source rows in bytes - wd
    sub         r8,r8,r5                    @r8 -= wd: src jump to the next 4-row strip
    sub         r9,r9,r5                    @r9 -= wd: dst jump to the next 4-row strip
    mov         r3, r5, lsr #2              @wd/4: number of 4-column strips
    mul         r7, r3                      @loop count r7 = ht * (wd/4)
    sub         r7, #4                      @reserve one iteration (count of 4) for the epilog
    mov         r4,r5                       @r4 = wd (per-strip column counter)
    @mov        r2, r2, lsl #1

prolog:

    add         r3,r0,r2                    @pi2_src_tmp += src_strd
    vld1.16     {d1},[r3],r2                @src_tmp2 = vld1_s16(pi2_src_tmp)
    vld1.16     {d0},[r0]!                  @src_tmp1 = vld1_s16(pi2_src)
    subs        r4,r4,#4
    vld1.16     {d2},[r3],r2                @src_tmp3 = vld1_s16(pi2_src_tmp)
    vmull.s16   q4,d1,d23                   @mul_res1 = vmull_s16(src_tmp2, coeff_1)
    vld1.16     {d3},[r3],r2                @src_tmp4 = vld1_s16(pi2_src_tmp)
    vmlal.s16   q4,d0,d22                   @mul_res1 = vmlal_s16(mul_res1, src_tmp1, coeff_0)
    vld1.16     {d4},[r3],r2                @src_tmp1 = vld1_s16(pi2_src_tmp)
    vmlal.s16   q4,d2,d24                   @mul_res1 = vmlal_s16(mul_res1, src_tmp3, coeff_2)
    vld1.16     {d5},[r3],r2                @src_tmp2 = vld1_s16(pi2_src_tmp)
    vmlal.s16   q4,d3,d25                   @mul_res1 = vmlal_s16(mul_res1, src_tmp4, coeff_3)
    vld1.16     {d6},[r3],r2                @src_tmp3 = vld1_s16(pi2_src_tmp)
    vmlal.s16   q4,d4,d26                   @mul_res1 = vmlal_s16(mul_res1, src_tmp1, coeff_4)
    vld1.16     {d7},[r3],r2                @src_tmp4 = vld1_s16(pi2_src_tmp)
    vmlal.s16   q4,d5,d27                   @mul_res1 = vmlal_s16(mul_res1, src_tmp2, coeff_5)
    vmlal.s16   q4,d6,d28                   @mul_res1 = vmlal_s16(mul_res1, src_tmp3, coeff_6)
    vmlal.s16   q4,d7,d29                   @mul_res1 = vmlal_s16(mul_res1, src_tmp4, coeff_7)

    vld1.16     {d16},[r3],r2               @src_tmp1 = vld1_s16(pi2_src_tmp)

    vmull.s16   q5,d2,d23                   @mul_res2 = vmull_s16(src_tmp3, coeff_1)
    addle       r0,r0,r8,lsl #0             @strip done: advance src to the next 4-row strip
    vmlal.s16   q5,d1,d22                   @mul_res2 = vmlal_s16(mul_res2, src_tmp2, coeff_0)
    movle       r4,r5                       @reset column counter to wd
    vmlal.s16   q5,d3,d24                   @mul_res2 = vmlal_s16(mul_res2, src_tmp4, coeff_2)
    vld1.16     {d17},[r3],r2               @src_tmp2 = vld1_s16(pi2_src_tmp)
    vmlal.s16   q5,d4,d25                   @mul_res2 = vmlal_s16(mul_res2, src_tmp1, coeff_3)
    vld1.16     {d18},[r3],r2               @src_tmp3 = vld1_s16(pi2_src_tmp)
    vmlal.s16   q5,d5,d26                   @mul_res2 = vmlal_s16(mul_res2, src_tmp2, coeff_4)
    add         r3,r0,r2                    @pi2_src_tmp += src_strd
    vmlal.s16   q5,d6,d27                   @mul_res2 = vmlal_s16(mul_res2, src_tmp3, coeff_5)
    vmlal.s16   q5,d7,d28                   @mul_res2 = vmlal_s16(mul_res2, src_tmp4, coeff_6)
    vmlal.s16   q5,d16,d29                  @mul_res2 = vmlal_s16(mul_res2, src_tmp1, coeff_7)
    vsub.s32    q4, q4, q15                 @remove the 0x80000 offset

    vld1.16     {d1},[r3],r2                @src_tmp3 = vld1_s16(pi2_src_tmp)
    vmull.s16   q6,d3,d23
    vld1.16     {d0},[r0]!                  @src_tmp1 = vld1_s16(pi2_src)
    vmlal.s16   q6,d2,d22
    vld1.16     {d2},[r3],r2                @src_tmp3 = vld1_s16(pi2_src_tmp)
    vmlal.s16   q6,d4,d24
    vmlal.s16   q6,d5,d25
    vmlal.s16   q6,d6,d26
    vmlal.s16   q6,d7,d27
    vmlal.s16   q6,d16,d28
    vmlal.s16   q6,d17,d29
    add         r14,r1,r6                   @pu1_dst_tmp = pu1_dst + dst_strd
    vsub.s32    q5, q5, q15                 @remove the 0x80000 offset
    vshrn.s32   d8, q4, #6                  @narrow to 16 bits with >>6
    @vqrshrun.s16 d8,q4,#6                  @sto_res = vqmovun_s16(sto_res_tmp)

    vmull.s16   q7,d4,d23
    vmlal.s16   q7,d3,d22
    vmlal.s16   q7,d5,d24
    vmlal.s16   q7,d6,d25
    vld1.16     {d3},[r3],r2                @src_tmp4 = vld1_s16(pi2_src_tmp)
    vmlal.s16   q7,d7,d26
    vld1.16     {d4},[r3],r2                @src_tmp1 = vld1_s16(pi2_src_tmp)
    vmlal.s16   q7,d16,d27
    vld1.16     {d5},[r3],r2                @src_tmp2 = vld1_s16(pi2_src_tmp)
    vmlal.s16   q7,d17,d28
    vld1.16     {d6},[r3],r2                @src_tmp3 = vld1_s16(pi2_src_tmp)
    vmlal.s16   q7,d18,d29
    vld1.16     {d7},[r3],r2                @src_tmp4 = vld1_s16(pi2_src_tmp)

    vst1.32     {d8},[r1]!                  @vst1_s16(pu1_dst, sto_res)
    vsub.s32    q6, q6, q15                 @remove the 0x80000 offset
    vshrn.s32   d10, q5, #6                 @narrow to 16 bits with >>6
    @vqrshrun.s16 d10,q5,#6                 @sto_res = vqmovun_s16(sto_res_tmp)
    addle       r1,r1,r9                    @strip done: advance dst to the next 4-row strip

    subs        r7,r7,#4                    @decrement the loop counter

    blt         epilog_end                  @fewer than one full iteration left
    beq         epilog                      @exactly the epilog iteration left

kernel_8:

    vmull.s16   q4,d1,d23                   @mul_res1 = vmull_s16(src_tmp2, coeff_1)
    subs        r4,r4,#4
    vmlal.s16   q4,d0,d22                   @mul_res1 = vmlal_s16(mul_res1, src_tmp1, coeff_0)
    addle       r0,r0,r8,lsl #0             @strip done: advance src to the next 4-row strip
    vmlal.s16   q4,d2,d24                   @mul_res1 = vmlal_s16(mul_res1, src_tmp3, coeff_2)
    vmlal.s16   q4,d3,d25                   @mul_res1 = vmlal_s16(mul_res1, src_tmp4, coeff_3)
    vmlal.s16   q4,d4,d26                   @mul_res1 = vmlal_s16(mul_res1, src_tmp1, coeff_4)
    vmlal.s16   q4,d5,d27                   @mul_res1 = vmlal_s16(mul_res1, src_tmp2, coeff_5)
    vmlal.s16   q4,d6,d28                   @mul_res1 = vmlal_s16(mul_res1, src_tmp3, coeff_6)
    vmlal.s16   q4,d7,d29                   @mul_res1 = vmlal_s16(mul_res1, src_tmp4, coeff_7)
    vst1.32     {d10},[r14],r6              @vst1_s16(pu1_dst_tmp, sto_res)

    vsub.s32    q7, q7, q15                 @remove the 0x80000 offset
    vshrn.s32   d12, q6, #6                 @narrow to 16 bits with >>6
    @vqrshrun.s16 d12,q6,#6
    vld1.16     {d16},[r3],r2               @src_tmp1 = vld1_s16(pi2_src_tmp)

    vmull.s16   q5,d2,d23                   @mul_res2 = vmull_s16(src_tmp3, coeff_1)
    vmlal.s16   q5,d1,d22                   @mul_res2 = vmlal_s16(mul_res2, src_tmp2, coeff_0)
    vmlal.s16   q5,d3,d24                   @mul_res2 = vmlal_s16(mul_res2, src_tmp4, coeff_2)
    vmlal.s16   q5,d4,d25                   @mul_res2 = vmlal_s16(mul_res2, src_tmp1, coeff_3)
    vmlal.s16   q5,d5,d26                   @mul_res2 = vmlal_s16(mul_res2, src_tmp2, coeff_4)
    vmlal.s16   q5,d6,d27                   @mul_res2 = vmlal_s16(mul_res2, src_tmp3, coeff_5)
    vst1.32     {d12},[r14],r6

    vmlal.s16   q5,d7,d28                   @mul_res2 = vmlal_s16(mul_res2, src_tmp4, coeff_6)
    vld1.16     {d17},[r3],r2               @src_tmp2 = vld1_s16(pi2_src_tmp)

    vmlal.s16   q5,d16,d29                  @mul_res2 = vmlal_s16(mul_res2, src_tmp1, coeff_7)

    vsub.s32    q4, q4, q15                 @remove the 0x80000 offset
    vshrn.s32   d14, q7, #6                 @narrow to 16 bits with >>6
    @vqrshrun.s16 d14,q7,#6

    vmull.s16   q6,d3,d23
    movle       r4,r5                       @reset column counter to wd

    vmlal.s16   q6,d2,d22
    vld1.16     {d18},[r3],r2               @src_tmp3 = vld1_s16(pi2_src_tmp)

    vmlal.s16   q6,d4,d24
    add         r3,r0,r2                    @pi2_src_tmp += src_strd

    vmlal.s16   q6,d5,d25

    vmlal.s16   q6,d6,d26
    vst1.32     {d14},[r14],r6

    vmlal.s16   q6,d7,d27
    vld1.16     {d1},[r3],r2                @src_tmp2 = vld1_s16(pi2_src_tmp)

    vmlal.s16   q6,d16,d28
    add         r14,r1,r6                   @pu1_dst_tmp = pu1_dst + dst_strd

    vmlal.s16   q6,d17,d29
    vld1.16     {d0},[r0]!                  @src_tmp1 = vld1_s16(pi2_src)

    vsub.s32    q5, q5, q15                 @remove the 0x80000 offset
    vshrn.s32   d8, q4, #6                  @narrow to 16 bits with >>6
    @vqrshrun.s16 d8,q4,#6                  @sto_res = vqmovun_s16(sto_res_tmp)
    vld1.16     {d2},[r3],r2                @src_tmp3 = vld1_s16(pi2_src_tmp)

    vmull.s16   q7,d4,d23
    vmlal.s16   q7,d3,d22
    vmlal.s16   q7,d5,d24
    vld1.16     {d3},[r3],r2                @src_tmp4 = vld1_s16(pi2_src_tmp)

    vmlal.s16   q7,d6,d25
    vld1.16     {d4},[r3],r2                @src_tmp1 = vld1_s16(pi2_src_tmp)
    vmlal.s16   q7,d7,d26
    vld1.16     {d5},[r3],r2                @src_tmp2 = vld1_s16(pi2_src_tmp)
    vmlal.s16   q7,d16,d27
    vld1.16     {d6},[r3],r2                @src_tmp3 = vld1_s16(pi2_src_tmp)
    vmlal.s16   q7,d17,d28
    vld1.16     {d7},[r3],r2                @src_tmp4 = vld1_s16(pi2_src_tmp)
    vmlal.s16   q7,d18,d29
    vst1.32     {d8},[r1]!                  @vst1_s16(pu1_dst, sto_res)

    vsub.s32    q6, q6, q15                 @remove the 0x80000 offset
    vshrn.s32   d10, q5, #6                 @narrow to 16 bits with >>6
    addle       r1,r1,r9                    @strip done: advance dst to the next 4-row strip

    @vqrshrun.s16 d10,q5,#6                 @sto_res = vqmovun_s16(sto_res_tmp)
    subs        r7,r7,#4                    @decrement the loop counter

    bgt         kernel_8                    @loop while strips remain

epilog:

    vmull.s16   q4,d1,d23                   @mul_res1 = vmull_s16(src_tmp2, coeff_1)
    vmlal.s16   q4,d0,d22                   @mul_res1 = vmlal_s16(mul_res1, src_tmp1, coeff_0)
    vmlal.s16   q4,d2,d24                   @mul_res1 = vmlal_s16(mul_res1, src_tmp3, coeff_2)
    vmlal.s16   q4,d3,d25                   @mul_res1 = vmlal_s16(mul_res1, src_tmp4, coeff_3)
    vmlal.s16   q4,d4,d26                   @mul_res1 = vmlal_s16(mul_res1, src_tmp1, coeff_4)
    vmlal.s16   q4,d5,d27                   @mul_res1 = vmlal_s16(mul_res1, src_tmp2, coeff_5)
    vmlal.s16   q4,d6,d28                   @mul_res1 = vmlal_s16(mul_res1, src_tmp3, coeff_6)
    vmlal.s16   q4,d7,d29                   @mul_res1 = vmlal_s16(mul_res1, src_tmp4, coeff_7)
    vst1.32     {d10},[r14],r6

    vsub.s32    q7, q7, q15                 @remove the 0x80000 offset
    vshrn.s32   d12, q6, #6                 @narrow to 16 bits with >>6
    @vqrshrun.s16 d12,q6,#6

    vld1.16     {d16},[r3],r2               @src_tmp1 = vld1_s16(pi2_src_tmp)
    vmull.s16   q5,d2,d23                   @mul_res2 = vmull_s16(src_tmp3, coeff_1)
    vmlal.s16   q5,d1,d22                   @mul_res2 = vmlal_s16(mul_res2, src_tmp2, coeff_0)
    vmlal.s16   q5,d3,d24                   @mul_res2 = vmlal_s16(mul_res2, src_tmp4, coeff_2)
    vmlal.s16   q5,d4,d25                   @mul_res2 = vmlal_s16(mul_res2, src_tmp1, coeff_3)
    vmlal.s16   q5,d5,d26                   @mul_res2 = vmlal_s16(mul_res2, src_tmp2, coeff_4)
    vmlal.s16   q5,d6,d27                   @mul_res2 = vmlal_s16(mul_res2, src_tmp3, coeff_5)
    vmlal.s16   q5,d7,d28                   @mul_res2 = vmlal_s16(mul_res2, src_tmp4, coeff_6)
    vmlal.s16   q5,d16,d29                  @mul_res2 = vmlal_s16(mul_res2, src_tmp1, coeff_7)
    vst1.32     {d12},[r14],r6

    vsub.s32    q4, q4, q15                 @remove the 0x80000 offset
    vshrn.s32   d14, q7, #6                 @narrow to 16 bits with >>6
    @vqrshrun.s16 d14,q7,#6

    vld1.16     {d17},[r3],r2               @src_tmp2 = vld1_s16(pi2_src_tmp)
    vmull.s16   q6,d3,d23
    vmlal.s16   q6,d2,d22
    vmlal.s16   q6,d4,d24
    vmlal.s16   q6,d5,d25
    vmlal.s16   q6,d6,d26
    vmlal.s16   q6,d7,d27
    vmlal.s16   q6,d16,d28
    vmlal.s16   q6,d17,d29
    vst1.32     {d14},[r14],r6
    vsub.s32    q5, q5, q15                 @remove the 0x80000 offset
    vshrn.s32   d8, q4, #6                  @narrow to 16 bits with >>6
    @vqrshrun.s16 d8,q4,#6                  @sto_res = vqmovun_s16(sto_res_tmp)

    vld1.16     {d18},[r3],r2               @src_tmp3 = vld1_s16(pi2_src_tmp)
    vmull.s16   q7,d4,d23
    vmlal.s16   q7,d3,d22
    vmlal.s16   q7,d5,d24
    vmlal.s16   q7,d6,d25
    vmlal.s16   q7,d7,d26
    vmlal.s16   q7,d16,d27
    vmlal.s16   q7,d17,d28
    vmlal.s16   q7,d18,d29
    vsub.s32    q6, q6, q15                 @remove the 0x80000 offset
    vshrn.s32   d10, q5, #6                 @narrow to 16 bits with >>6
    @vqrshrun.s16 d10,q5,#6                 @sto_res = vqmovun_s16(sto_res_tmp)

    add         r14,r1,r6                   @pu1_dst_tmp = pu1_dst + dst_strd
    vst1.32     {d8},[r1]!                  @vst1_s16(pu1_dst, sto_res)

epilog_end:
    vst1.32     {d10},[r14],r6              @vst1_s16(pu1_dst_tmp, sto_res)
    vshrn.s32   d12, q6, #6                 @narrow to 16 bits with >>6
    @vqrshrun.s16 d12,q6,#6

    vst1.32     {d12},[r14],r6
    vsub.s32    q7, q7, q15                 @remove the 0x80000 offset
    vshrn.s32   d14, q7, #6                 @narrow to 16 bits with >>6
    @vqrshrun.s16 d14,q7,#6

    vst1.32     {d14},[r14],r6

end_loops:

    vpop        {d8 - d15}
    ldmfd       sp!,{r4-r12,r15}            @restore the registers; loading r15 returns to the caller
    411