Home | History | Annotate | Download | only in arm
      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 @*******************************************************************************
     20 @* @file
     21 @*  ihevc_inter_pred_chroma_copy_w16out_neon.s
     22 @*
     23 @* @brief
     24 @*  contains function definitions for inter prediction  interpolation.
     25 @* functions are coded using neon  intrinsics and can be compiled using
     26 @*
     27 @* rvct
     28 @*
     29 @* @author
     30 @*  yogeswaran rs
     31 @*
     32 @* @par list of functions:
     33 @*
     34 @*
     35 @* @remarks
     36 @*  none
     37 @*
     38 @*******************************************************************************
     39 @*/
     40 @/**
     41 @*******************************************************************************
     42 @*
     43 @* @brief
     44 @*   chroma inter prediction filter for copy
     45 @*
     46 @* @par description:
     47 @*    copies the array of width 'wd' and height 'ht' from the  location pointed
     48 @*    by 'src' to the location pointed by 'dst'
     49 @*
     50 @* @param[in] pu1_src
     51 @*  uword8 pointer to the source
     52 @*
     53 @* @param[out] pu1_dst
     54 @*  uword8 pointer to the destination
     55 @*
     56 @* @param[in] src_strd
     57 @*  integer source stride
     58 @*
     59 @* @param[in] dst_strd
     60 @*  integer destination stride
     61 @*
     62 @* @param[in] pi1_coeff
     63 @*  word8 pointer to the filter coefficients
     64 @*
     65 @* @param[in] ht
     66 @*  integer height of the array
     67 @*
     68 @* @param[in] wd
     69 @*  integer width of the array
     70 @*
     71 @* @returns
     72 @*
     73 @* @remarks
     74 @*  none
     75 @*
     76 @*******************************************************************************
     77 @*/
     78 
     79 @void ihevc_inter_pred_chroma_copy_w16out(uword8 *pu1_src,
     80 @                                           word16 *pi2_dst,
     81 @                                           word32 src_strd,
     82 @                                           word32 dst_strd,
     83 @                                           word8 *pi1_coeff,
     84 @                                           word32 ht,
     85 @                                           word32 wd)
     86 @**************variables vs registers*****************************************
     87 @r0 => *pu1_src
     88 @r1 => *pi2_dst
     89 @r2 =>  src_strd
     90 @r3 =>  dst_strd
     91 @r4 => *pi1_coeff
     92 @r5 =>  ht
     93 @r6 =>  wd
     94 
     95 .text
     96 .align 4
     97 
     98 
     99 
    100 
    101 .globl ihevc_inter_pred_chroma_copy_w16out_a9q
    102 
    103 .type ihevc_inter_pred_chroma_copy_w16out_a9q, %function
    104 
    105 ihevc_inter_pred_chroma_copy_w16out_a9q:
    106 
    107     stmfd       sp!, {r4-r12, r14}          @save callee-saved regs + lr (10 words; stack args now start at sp+40)
    108     ldr         r12,[sp,#48]                @load wd (7th arg: old sp+8 -> sp + 40 + 8)
    109     lsl         r12,r12,#1                  @r12 = 2*wd (chroma rows are cb/cr interleaved: 2 bytes per column)
    110     ldr         r7,[sp,#44]                 @load ht (6th arg: old sp+4 -> sp + 40 + 4)
    111     cmp         r7,#0                       @ht <= 0 ?
    112     ble         end_loops                   @nothing to copy
    113     and         r8,r7,#3                    @r8 = ht % 4 (rows left over after full 4-row blocks)
    114     sub         r9,r7,r8                    @r9 = ht rounded down to a multiple of 4
    115     and         r11,r7,#6                   @keep bits 1..2 of ht
    116     cmp         r11,#6                      @true when ht % 8 is 6 or 7
    117     beq         loop_ht_6                   @those heights always take the 4-wide path
    118     tst         r12,#7                      @is 2*wd a multiple of 8 ?
    119     beq         core_loop_wd_8              @yes: use the 8-byte-wide pipelined loop
    120 
    121 loop_ht_6:
    122     sub         r11,r12,#4                  @r11 = 2*wd - 4 (rewind applied when a row block ends)
    123     lsls        r6,r3,#1                    @r6 = dst_strd * 2 (dst is 16-bit, so stride in bytes)
    124     cmp         r9,#0                       @any full 4-row blocks ?
    125     beq         outer_loop_wd_4_ht_2        @no: only the short-height loop is needed
    126 
    127 outer_loop_wd_4:
    128     subs        r4,r12,#0                   @r4 = 2*wd, column counter for this 4-row block
    129     ble         end_inner_loop_wd_4         @non-positive width: skip inner loop
    130 
    131 inner_loop_wd_4:
    132     vld1.8      {d0},[r0]                   @row 0: load 8 src bytes (only the low 4 results get stored)
    133     add         r5,r0,r2                    @r5 = pu1_src + src_strd (row 1 pointer)
    134     vmovl.u8    q0,d0                       @row 0: zero-extend u8 -> u16
    135     add         r10,r1,r6                   @r10 = dst row 1 (r6 = dst stride in bytes)
    136     subs        r4,r4,#4                    @4 columns consumed
    137     vshl.i64    q0,q0,#6                    @row 0: <<6 (64-bit lanes; safe: top 8 bits of each widened u16 are 0, so nothing crosses element boundaries)
    138     vld1.8      {d22},[r5],r2               @row 1: load, r5 advances to row 2
    139     add         r0,r0,#4                    @pu1_src += 4 (next 4-column group)
    140     vst1.64     {d0},[r1]                   @row 0: store 4 s16 results
    141     add         r1,r1,#8                    @pi2_dst += 4 elements (8 bytes)
    142     vmovl.u8    q11,d22                     @row 1: u8 -> u16
    143     vld1.8      {d24},[r5],r2               @row 2: load, r5 advances to row 3
    144     vshl.i64    q11,q11,#6                  @row 1: <<6
    145     vmovl.u8    q12,d24                     @row 2: u8 -> u16
    146     vst1.64     {d22},[r10],r6              @row 1: store, dst += dst_strd
    147     vshl.i64    q12,q12,#6                  @row 2: <<6
    148     vld1.8      {d26},[r5],r2               @row 3: load
    149     vst1.64     {d24},[r10],r6              @row 2: store, dst += dst_strd
    150     vmovl.u8    q13,d26                     @row 3: u8 -> u16
    151     vshl.i64    q13,q13,#6                  @row 3: <<6
    152     vst1.64     {d26},[r10],r6              @row 3: store
    153     bgt         inner_loop_wd_4             @more columns in this 4-row block
    154 
    155 end_inner_loop_wd_4:
    156     subs        r9,r9,#4                    @4 rows done
    157     sub         r0,r5,r11                   @src: undo column advance, move down 4 rows
    158     sub         r1,r10,r11,lsl #1           @dst: same rewind, scaled for 16-bit elements
    159     bgt         outer_loop_wd_4             @next 4-row block
    160     cmp         r8,#0                       @leftover (ht % 4) rows ?
    161     bgt         outer_loop_wd_4_ht_2        @finish them in the 2-row loop
    162 
    163 
    164 end_loops:
    165     ldmfd       sp!,{r4-r12,r15}            @restore regs; popping saved lr into pc returns to caller
    166 
    167 
    168 outer_loop_wd_4_ht_2:
    169     subs        r4,r12,#0                   @r4 = 2*wd, column counter
    170     ble         end_inner_loop_wd_4         @non-positive width: fall through block epilogue
    171 
    172 inner_loop_wd_4_ht_2:
    173     vld1.8      {d0},[r0]                   @row 0: load 8 src bytes
    174     add         r5,r0,r2                    @r5 = pu1_src + src_strd (row 1)
    175     vmovl.u8    q0,d0                       @row 0: u8 -> u16
    176     add         r10,r1,r6                   @r10 = dst row 1
    177     subs        r4,r4,#4                    @4 columns consumed
    178     vshl.i64    q0,q0,#6                    @row 0: <<6 (same element-safety argument as above)
    179     vld1.8      {d22},[r5],r2               @row 1: load
    180     add         r0,r0,#4                    @pu1_src += 4
    181     vst1.64     {d0},[r1]                   @row 0: store 4 s16 results
    182     add         r1,r1,#8                    @pi2_dst += 4 elements
    183     vmovl.u8    q11,d22                     @row 1: u8 -> u16
    184     vld1.8      {d24},[r5],r2               @NOTE(review): row 2 is loaded here but its result is never stored in this loop
    185     vshl.i64    q11,q11,#6                  @row 1: <<6
    186     vmovl.u8    q12,d24                     @widened but unused in this loop
    187     vst1.64     {d22},[r10],r6              @row 1: store
    188     bgt         inner_loop_wd_4_ht_2        @more columns
    189     b           end_loops
    190 
    191 
    192 core_loop_wd_8:
    193     @sub            r11,r12,#8
    194     lsls        r5,r3,#1                    @r5 = dst_strd * 2 (bytes per dst row)
    195     rsb         r11,r12,r3, lsl #2          @r11 = 4*dst_strd - 2*wd (dst step per 4-row block, in s16 elements)
    196     rsb         r8,r12,r2,lsl #2            @r8  = 4*src_strd - 2*wd (src step per 4-row block, in bytes)
    197     mov         r4,r12, lsr #3              @r4 = number of 8-byte column groups per row
    198     mov         r7,r9                       @r7 = rounded height
    199     mul         r7, r4                      @r7 = total group-rows to process (groups per row * rows)
    200     sub         r4,r12,#0                   @r4 = 2*wd, column counter
    201     sub         r7,r7,#4                    @reserve one iteration (4 group-rows) for the epilog
    202     cmp         r9,#0                       @no full 4-row blocks ?
    203     beq         core_loop_wd_8_ht_2         @then only the 2-row loop runs
    204 
    205 prolog:
    206     add         r6,r0,r2                    @r6 -> row 1 of this block
    207     add         r10,r1,r5                   @r10 -> dst row 1
    208     vld1.8      {d8},[r0]!                  @row 0: load 8 bytes, post-increment src
    209     vld1.8      {d10},[r6],r2               @row 1: load
    210     vld1.8      {d12},[r6],r2               @row 2: load
    211     vld1.8      {d14},[r6],r2               @row 3: load
    212     vmovl.u8    q8,d8                       @row 0: u8 -> u16
    213     vmovl.u8    q9,d10                      @row 1: u8 -> u16
    214     vmovl.u8    q10,d12                     @row 2: u8 -> u16
    215     vmovl.u8    q11,d14                     @row 3: u8 -> u16
    216     subs        r4,r4,#8                    @8 columns consumed
    217     vshl.i16    q0,q8,#6                    @row 0: <<6 per 16-bit element
    218     vshl.i16    q1,q9,#6                    @row 1: <<6
    219     vshl.i16    q2,q10,#6                   @row 2: <<6
    220     vshl.i16    q3,q11,#6                   @row 3: <<6
    221     addle       r0,r0,r8                    @row of groups finished: advance src to the next 4 rows
    222     add         r6,r0,r2                    @r6 -> row 1 of the next group (software-pipeline preload)
    223     vld1.8      {d8},[r0]!                  @preload next group's row 0
    224     vld1.8      {d10},[r6],r2               @preload row 1
    225     vld1.8      {d12},[r6],r2               @preload row 2
    226     vld1.8      {d14},[r6],r2               @preload row 3
    227 
    228     vst1.16     {d0,d1},[r1]!               @row 0: store 8 s16 results
    229     addle       r1,r1,r11,lsl #1            @row of groups finished: advance dst to the next 4 rows
    230     suble       r4,r12,#0                   @and reset the column counter
    231 
    232     subs        r7,r7,#4                    @4 group-rows accounted for
    233 
    234     blt         epilog_end                  @only the already-shifted data remains: drain the stores
    235     beq         epilog                      @exactly one preloaded group remains: run the epilog
    236 
    237 
    238 
    239 outer_loop_wd_8:
    240 
    241     vst1.16     {d2,d3},[r10],r5            @previous group row 1: store
    242     vmovl.u8    q8,d8                       @current group row 0: u8 -> u16
    243 
    244     vst1.16     {d4,d5},[r10],r5            @previous group row 2: store
    245     vmovl.u8    q9,d10                      @row 1: u8 -> u16
    246 
    247     vst1.16     {d6,d7},[r10],r5            @previous group row 3: store
    248     vmovl.u8    q10,d12                     @row 2: u8 -> u16
    249 
    250     vmovl.u8    q11,d14                     @row 3: u8 -> u16
    251 
    252     subs        r4,r4,#8                    @8 columns consumed
    253     addle       r0,r0,r8                    @row of groups finished: advance src to the next 4 rows
    254 
    255     add         r6,r0,r2                    @r6 -> row 1 of the next group
    256 
    257     vld1.8      {d8},[r0]!                  @preload next group's row 0
    258     vshl.i16    q0,q8,#6                    @row 0: <<6
    259 
    260     vld1.8      {d10},[r6],r2               @preload row 1
    261     vshl.i16    q1,q9,#6                    @row 1: <<6
    262 
    263     vld1.8      {d12},[r6],r2               @preload row 2
    264     vshl.i16    q2,q10,#6                   @row 2: <<6
    265 
    266     vld1.8      {d14},[r6],r2               @preload row 3
    267     add         r10,r1,r5                   @r10 -> dst row 1 for next pass's stores
    268 
    269     vshl.i16    q3,q11,#6                   @row 3: <<6
    270 
    271     vst1.16     {d0,d1},[r1]!               @row 0: store 8 s16 results
    272 
    273     addle       r1,r1,r11,lsl #1            @row of groups finished: advance dst to the next 4 rows
    274     suble       r4,r12,#0                   @and reset the column counter
    275 
    276     subs        r7,r7,#4                    @4 group-rows accounted for
    277     bgt         outer_loop_wd_8
    278 
    279 epilog:
    280     vst1.16     {d2,d3},[r10],r5            @previous group row 1: store
    281     vmovl.u8    q8,d8                       @final preloaded group row 0: u8 -> u16
    282 
    283     vst1.16     {d4,d5},[r10],r5            @previous group row 2: store
    284     vmovl.u8    q9,d10                      @row 1: u8 -> u16
    285 
    286     vst1.16     {d6,d7},[r10],r5            @previous group row 3: store
    287     vmovl.u8    q10,d12                     @row 2: u8 -> u16
    288 
    289     vmovl.u8    q11,d14                     @row 3: u8 -> u16 (no further preload: pipeline is draining)
    290     @add        r6,r0,r2                @pu1_src_tmp += src_strd
    291 
    292     vshl.i16    q0,q8,#6                    @row 0: <<6
    293     vshl.i16    q1,q9,#6                    @row 1: <<6
    294     vshl.i16    q2,q10,#6                   @row 2: <<6
    295     add         r10,r1,r5                   @r10 -> dst row 1 for the final stores
    296     vshl.i16    q3,q11,#6                   @row 3: <<6
    297 
    298     vst1.16     {d0,d1},[r1]!               @row 0: store
    299 epilog_end:
    300     vst1.16     {d2,d3},[r10],r5            @row 1: store
    301     vst1.16     {d4,d5},[r10],r5            @row 2: store
    302     vst1.16     {d6,d7},[r10],r5            @row 3: store
    303     b           end_loops
    304 
    305 core_loop_wd_8_ht_2:
    306     add         r6,r0,r2                    @r6 -> row 1
    307     add         r10,r1,r5                   @r10 -> dst row 1
    308     vld1.8      {d8},[r0]!                  @row 0: load 8 bytes
    309     vld1.8      {d10},[r6],r2               @row 1: load
    310     vmovl.u8    q8,d8                       @row 0: u8 -> u16
    311     vmovl.u8    q9,d10                      @row 1: u8 -> u16
    312     subs        r12,r12,#8                  @8 columns consumed (r12 is free to destroy: function returns after this loop)
    313     vshl.i16    q0,q8,#6                    @row 0: <<6
    314     vshl.i16    q1,q9,#6                    @row 1: <<6
    315     vst1.16     {d0,d1},[r1]!               @row 0: store
    316     vst1.16     {d2,d3},[r10],r5            @row 1: store
    317     bgt         core_loop_wd_8_ht_2         @more columns
    318 
    319     ldmfd       sp!,{r4-r12,r15}            @restore registers and return (pc <- saved lr)
    320 
    321 
    322 
    323 
    324 
    325 
    326