@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_inter_pred_chroma_copy_w16out_neon.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@* functions are coded using neon intrinsics and can be compiled using
@* rvct
@*
@* @author
@*  yogeswaran rs
@*
@* @par list of functions:
@*
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*   chroma inter prediction filter for copy
@*
@* @par description:
@*    copies the array of width 'wd' and height 'ht' from the location pointed
@*    to by 'src' to the location pointed to by 'dst', left-shifting each
@*    pixel by 6 (14 - bit depth for 8-bit input) and storing the result as a
@*    16-bit value
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pi2_dst
@*  word16 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_chroma_copy_w16out(uword8 *pu1_src,
@                                           word16 *pi2_dst,
@                                           word32 src_strd,
@                                           word32 dst_strd,
@                                           word8 *pi1_coeff,
@                                           word32 ht,
@                                           word32 wd)
@**************variables vs registers*****************************************
@r0 => *pu1_src
@r1 => *pi2_dst
@r2 =>  src_strd
@r3 =>  dst_strd
@r4 => *pi1_coeff
@r5 =>  ht
@r6 =>  wd
@(ht and wd are actually loaded from the stack into r7 and r12 below)
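
@a minimal c reference for what this routine computes (an illustrative
@sketch, not part of the library; the _ref name is hypothetical): each
@8-bit chroma sample is widened and shifted left by 6 into a 16-bit
@destination; cb and cr are interleaved, so each row covers 2*wd samples
@
@void ihevc_inter_pred_chroma_copy_w16out_ref(uword8 *pu1_src,
@                                             word16 *pi2_dst,
@                                             word32 src_strd,
@                                             word32 dst_strd,
@                                             word32 ht,
@                                             word32 wd)
@{
@    for(word32 row = 0; row < ht; row++)
@    {
@        for(word32 col = 0; col < 2 * wd; col++)
@            pi2_dst[col] = (word16)pu1_src[col] << 6;
@        pu1_src += src_strd;
@        pi2_dst += dst_strd;
@    }
@}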

.equ    coeff_offset,   104
.equ    ht_offset,      108
.equ    wd_offset,      112
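
@the offsets above follow from the prologue below: stmfd saves ten core
@registers (10 * 4 = 40 bytes) and vpush {d8 - d15} saves eight d registers
@(8 * 8 = 64 bytes), so the first stacked argument sits 40 + 64 = 104 bytes
@above sp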


.text
.align 4




.globl ihevc_inter_pred_chroma_copy_w16out_a9q

.type ihevc_inter_pred_chroma_copy_w16out_a9q, %function

ihevc_inter_pred_chroma_copy_w16out_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    vpush       {d8 - d15}

    ldr         r12,[sp,#wd_offset]         @loads wd
    lsl         r12,r12,#1                  @r12 = 2*wd (cb/cr interleaved samples per row)
    ldr         r7,[sp,#ht_offset]          @loads ht
    cmp         r7,#0                       @check if ht == 0
    ble         end_loops                   @nothing to do for ht <= 0
    and         r8,r7,#3                    @r8 = ht % 4 (leftover rows)
    sub         r9,r7,r8                    @r9 = ht rounded down to a multiple of 4
    and         r11,r7,#6                   @check whether ht has bits 1 and 2 set
    cmp         r11,#6                      @(ht % 8 is 6 or 7): such heights take the wd_4 path
    beq         loop_ht_6
    tst         r12,#7                      @check whether 2*wd is a multiple of 8
    beq         core_loop_wd_8
loop_ht_6:
    sub         r11,r12,#4                  @r11 = 2*wd - 4, used to rewind to the next row block
    lsls        r6,r3,#1                    @r6 = 2*dst_strd, dst row stride in bytes
    cmp         r9,#0                       @any full blocks of 4 rows?
    beq         outer_loop_wd_4_ht_2

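@each pass of inner_loop_wd_4 below covers a tile 4 samples wide and 4 rows
@tall: load 8 bytes, widen with vmovl.u8, shift left by 6, and store the low
@four 16-bit results per row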
outer_loop_wd_4:
    subs        r4,r12,#0                   @r4 = 2*wd, the column count for this row block
    ble         end_inner_loop_wd_4

inner_loop_wd_4:
    vld1.8      {d0},[r0]                   @vld1_u8(pu1_src_tmp)
    add         r5,r0,r2                    @pu1_src + src_strd
    vmovl.u8    q0,d0                       @vmovl_u8(vld1_u8(pu1_src_tmp))
    add         r10,r1,r6
    subs        r4,r4,#4                    @wd - 4
    vshl.i64    q0,q0,#6                    @vshlq_n_s64(temp, 6)
    vld1.8      {d22},[r5],r2               @vld1_u8(pu1_src_tmp)
    add         r0,r0,#4                    @pu1_src += 4
    vst1.64     {d0},[r1]                   @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    add         r1,r1,#8
    vmovl.u8    q11,d22                     @vmovl_u8(vld1_u8(pu1_src_tmp))
    vld1.8      {d24},[r5],r2               @vld1_u8(pu1_src_tmp)
    vshl.i64    q11,q11,#6                  @vshlq_n_s64(temp, 6)
    vmovl.u8    q12,d24                     @vmovl_u8(vld1_u8(pu1_src_tmp))
    vst1.64     {d22},[r10],r6              @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    vshl.i64    q12,q12,#6                  @vshlq_n_s64(temp, 6)
    vld1.8      {d26},[r5],r2               @vld1_u8(pu1_src_tmp)
    vst1.64     {d24},[r10],r6              @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    vmovl.u8    q13,d26                     @vmovl_u8(vld1_u8(pu1_src_tmp))
    vshl.i64    q13,q13,#6                  @vshlq_n_s64(temp, 6)
    vst1.64     {d26},[r10],r6              @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    bgt         inner_loop_wd_4

end_inner_loop_wd_4:
    subs        r9,r9,#4                    @ht - 4
    sub         r0,r5,r11                   @step pu1_src to the start of the next 4-row block
    sub         r1,r10,r11,lsl #1           @step pi2_dst likewise (16-bit elements, hence lsl #1)
    bgt         outer_loop_wd_4
    cmp         r8,#0                       @any leftover rows?
    bgt         outer_loop_wd_4_ht_2


end_loops:
    vpop        {d8 - d15}
    ldmfd       sp!,{r4-r12,r15}            @restore registers and return via pc


outer_loop_wd_4_ht_2:
    subs        r4,r12,#0                   @r4 = 2*wd, the column count for this row block
    ble         end_inner_loop_wd_4

inner_loop_wd_4_ht_2:
    vld1.8      {d0},[r0]                   @vld1_u8(pu1_src_tmp)
    add         r5,r0,r2                    @pu1_src + src_strd
    vmovl.u8    q0,d0                       @vmovl_u8(vld1_u8(pu1_src_tmp))
    add         r10,r1,r6
    subs        r4,r4,#4                    @wd - 4
    vshl.i64    q0,q0,#6                    @vshlq_n_s64(temp, 6)
    vld1.8      {d22},[r5],r2               @vld1_u8(pu1_src_tmp)
    add         r0,r0,#4                    @pu1_src += 4
    vst1.64     {d0},[r1]                   @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    add         r1,r1,#8
    vmovl.u8    q11,d22                     @vmovl_u8(vld1_u8(pu1_src_tmp))
    vld1.8      {d24},[r5],r2               @vld1_u8(pu1_src_tmp)
    vshl.i64    q11,q11,#6                  @vshlq_n_s64(temp, 6)
    vmovl.u8    q12,d24                     @vmovl_u8(vld1_u8(pu1_src_tmp))
    vst1.64     {d22},[r10],r6              @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    bgt         inner_loop_wd_4_ht_2
    b           end_loops


core_loop_wd_8:
    @sub            r11,r12,#8
    lsls        r5,r3,#1                    @r5 = 2*dst_strd, dst row stride in bytes
    rsb         r11,r12,r3, lsl #2          @r11 = (dst_strd * 4) - (2*wd), dst advance per 4-row block (in elements)
    rsb         r8,r12,r2,lsl #2            @r8 = (src_strd * 4) - (2*wd), src advance per 4-row block (in bytes)
    mov         r4,r12, lsr #3              @r4 = (2*wd) / 8, tiles per row
    mov         r7,r9
    mul         r7, r4                      @r7 = tiles per row * rounded height; counts down by 4 per tile
    sub         r4,r12,#0                   @r4 = 2*wd, column countdown
    sub         r7,r7,#4                    @reserve the last tile for the epilog
    cmp         r9,#0
    beq         core_loop_wd_8_ht_2

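@the wd_8 path below is software pipelined: prolog issues the first tile's
@loads and widening shifts, outer_loop_wd_8 overlaps the stores of tile n
@with the loads of tile n+1, and epilog/epilog_end drain the last tile
@without issuing further loads
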
prolog:
    add         r6,r0,r2                    @pu1_src_tmp += src_strd
    add         r10,r1,r5
    vld1.8      {d8},[r0]!                  @vld1_u8(pu1_src_tmp)
    vld1.8      {d10},[r6],r2               @vld1_u8(pu1_src_tmp)
    vld1.8      {d12},[r6],r2               @vld1_u8(pu1_src_tmp)
    vld1.8      {d14},[r6],r2               @vld1_u8(pu1_src_tmp)
    vmovl.u8    q8,d8                       @vmovl_u8(vld1_u8(pu1_src_tmp))
    vmovl.u8    q9,d10                      @vmovl_u8(vld1_u8(pu1_src_tmp))
    vmovl.u8    q10,d12                     @vmovl_u8(vld1_u8(pu1_src_tmp))
    vmovl.u8    q11,d14                     @vmovl_u8(vld1_u8(pu1_src_tmp))
    subs        r4,r4,#8                    @wd decrements by 8
    vshl.i16    q0,q8,#6                    @vshlq_n_s16(tmp, 6)
    vshl.i16    q1,q9,#6                    @vshlq_n_s16(tmp, 6)
    vshl.i16    q2,q10,#6                   @vshlq_n_s16(tmp, 6)
    vshl.i16    q3,q11,#6                   @vshlq_n_s16(tmp, 6)
    addle       r0,r0,r8                    @row block done: advance pu1_src to the next 4 rows
    add         r6,r0,r2                    @pu1_src_tmp += src_strd
    vld1.8      {d8},[r0]!                  @vld1_u8(pu1_src_tmp)
    vld1.8      {d10},[r6],r2               @vld1_u8(pu1_src_tmp)
    vld1.8      {d12},[r6],r2               @vld1_u8(pu1_src_tmp)
    vld1.8      {d14},[r6],r2               @vld1_u8(pu1_src_tmp)

    vst1.16     {d0,d1},[r1]!               @vst1q_s16(pi2_dst_tmp, tmp)
    addle       r1,r1,r11,lsl #1            @row block done: advance pi2_dst to the next 4 rows
    suble       r4,r12,#0                   @reset the column countdown to 2*wd

    subs        r7,r7,#4                    @decrement the tile count

    blt         epilog_end                  @no tiles left: skip the kernel and epilog
    beq         epilog                      @exactly one tile left: run the epilog



outer_loop_wd_8:

    vst1.16     {d2,d3},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q8,d8                       @vmovl_u8(vld1_u8(pu1_src_tmp))

    vst1.16     {d4,d5},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q9,d10                      @vmovl_u8(vld1_u8(pu1_src_tmp))

    vst1.16     {d6,d7},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q10,d12                     @vmovl_u8(vld1_u8(pu1_src_tmp))

    vmovl.u8    q11,d14                     @vmovl_u8(vld1_u8(pu1_src_tmp))

    subs        r4,r4,#8                    @wd decrements by 8
    addle       r0,r0,r8                    @row block done: advance pu1_src to the next 4 rows

    add         r6,r0,r2                    @pu1_src_tmp += src_strd

    vld1.8      {d8},[r0]!                  @vld1_u8(pu1_src_tmp)
    vshl.i16    q0,q8,#6                    @vshlq_n_s16(tmp, 6)

    vld1.8      {d10},[r6],r2               @vld1_u8(pu1_src_tmp)
    vshl.i16    q1,q9,#6                    @vshlq_n_s16(tmp, 6)

    vld1.8      {d12},[r6],r2               @vld1_u8(pu1_src_tmp)
    vshl.i16    q2,q10,#6                   @vshlq_n_s16(tmp, 6)

    vld1.8      {d14},[r6],r2               @vld1_u8(pu1_src_tmp)
    add         r10,r1,r5

    vshl.i16    q3,q11,#6                   @vshlq_n_s16(tmp, 6)

    vst1.16     {d0,d1},[r1]!               @vst1q_s16(pi2_dst_tmp, tmp)

    addle       r1,r1,r11,lsl #1            @row block done: advance pi2_dst to the next 4 rows
    suble       r4,r12,#0                   @reset the column countdown to 2*wd

    subs        r7,r7,#4                    @decrement the tile count
    bgt         outer_loop_wd_8

epilog:
    vst1.16     {d2,d3},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q8,d8                       @vmovl_u8(vld1_u8(pu1_src_tmp))

    vst1.16     {d4,d5},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q9,d10                      @vmovl_u8(vld1_u8(pu1_src_tmp))

    vst1.16     {d6,d7},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q10,d12                     @vmovl_u8(vld1_u8(pu1_src_tmp))

    vmovl.u8    q11,d14                     @vmovl_u8(vld1_u8(pu1_src_tmp))
    @add        r6,r0,r2                    @pu1_src_tmp += src_strd

    vshl.i16    q0,q8,#6                    @vshlq_n_s16(tmp, 6)
    vshl.i16    q1,q9,#6                    @vshlq_n_s16(tmp, 6)
    vshl.i16    q2,q10,#6                   @vshlq_n_s16(tmp, 6)
    add         r10,r1,r5
    vshl.i16    q3,q11,#6                   @vshlq_n_s16(tmp, 6)

    vst1.16     {d0,d1},[r1]!               @vst1q_s16(pi2_dst_tmp, tmp)
epilog_end:
    vst1.16     {d2,d3},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    vst1.16     {d4,d5},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    vst1.16     {d6,d7},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    b           end_loops

core_loop_wd_8_ht_2:
    add         r6,r0,r2                    @pu1_src_tmp += src_strd
    add         r10,r1,r5
    vld1.8      {d8},[r0]!                  @vld1_u8(pu1_src_tmp)
    vld1.8      {d10},[r6],r2               @vld1_u8(pu1_src_tmp)
    vmovl.u8    q8,d8                       @vmovl_u8(vld1_u8(pu1_src_tmp))
    vmovl.u8    q9,d10                      @vmovl_u8(vld1_u8(pu1_src_tmp))
    subs        r12,r12,#8                  @wd decrements by 8
    vshl.i16    q0,q8,#6                    @vshlq_n_s16(tmp, 6)
    vshl.i16    q1,q9,#6                    @vshlq_n_s16(tmp, 6)
    vst1.16     {d0,d1},[r1]!               @vst1q_s16(pi2_dst_tmp, tmp)
    vst1.16     {d2,d3},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    bgt         core_loop_wd_8_ht_2

    vpop        {d8 - d15}
    ldmfd       sp!,{r4-r12,r15}            @restore registers and return via pc
