@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@*
@* @brief
@*     inter-prediction luma copy function with 16-bit output
@*
@* @par description:
@*   copies the array of width 'wd' and height 'ht' from the location pointed
@*   to by 'src' to the location pointed to by 'dst'; each pixel is widened
@*   and left-shifted by 6 before being stored as a 16-bit value
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pi2_dst
@*  word16 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*  none
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_luma_copy_w16out (
@                                uword8 *pu1_src,
@                                word16 *pi2_dst,
@                                word32 src_strd,
@                                word32 dst_strd,
@                                word8 *pi1_coeff,
@                                word32 ht,
@                                word32 wd   )

@**************variables vs registers*****************************************
@   r0 => *pu1_src
@   r1 => *pi2_dst
@   r2 =>  src_strd
@   r3 =>  dst_strd
@   r7 =>  ht
@   r12 => wd

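@ Rough C equivalent of this routine, given for reference only (not part of
@ the original source; the uword8/word16/word32 typedefs are assumed from the
@ prototype above, and pi1_coeff is not read by this copy path):
@
@   void ihevc_inter_pred_luma_copy_w16out(uword8 *pu1_src, word16 *pi2_dst,
@                                          word32 src_strd, word32 dst_strd,
@                                          word8 *pi1_coeff, word32 ht,
@                                          word32 wd)
@   {
@       for(word32 row = 0; row < ht; row++)
@           for(word32 col = 0; col < wd; col++)
@               pi2_dst[row * dst_strd + col] =
@                   (word16)(pu1_src[row * src_strd + col] << 6);
@   }
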
.text
.align 4




.globl ihevc_inter_pred_luma_copy_w16out_a9q

.type ihevc_inter_pred_luma_copy_w16out_a9q, %function

ihevc_inter_pred_luma_copy_w16out_a9q:

    stmfd       sp!, {r4-r12, r14}          @save r4-r12 and lr on the stack
    ldr         r12,[sp,#48]                @loads wd
    ldr         r7,[sp,#44]                 @loads ht
    cmp         r7,#0                       @check if ht == 0
    ble         end_loops                   @nothing to do for ht <= 0
    tst         r12,#7                      @check whether wd is a multiple of 8
    beq         core_loop_wd_8
    sub         r11,r12,#4                  @r11 = wd - 4
    lsls        r6,r3,#1                    @r6 = dst_strd * 2 (byte stride of the 16-bit destination)

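@ Fallback path for widths that are not a multiple of 8: each inner-loop
@ iteration copies a 4x4 block, widening the pixels and shifting them left
@ by 6 before the 16-bit stores.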
outer_loop_wd_4:
    subs        r4,r12,#0                   @r4 = wd (sets flags)
    ble         end_inner_loop_wd_4

inner_loop_wd_4:
    vld1.8      {d0},[r0]                   @vld1_u8(pu1_src_tmp)
    add         r5,r0,r2                    @pu1_src + src_strd
    vmovl.u8    q0,d0                       @vmovl_u8(vld1_u8(pu1_src_tmp))
    add         r10,r1,r6
    subs        r4,r4,#4                    @wd - 4
    vshl.i64    q0,q0,#6                    @vshlq_n_s64(temp, 6)
    vld1.8      {d22},[r5],r2               @vld1_u8(pu1_src_tmp)
    add         r0,r0,#4                    @pu1_src += 4
    vst1.64     {d0},[r1]                   @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    add         r1,r1,#8
    vmovl.u8    q11,d22                     @vmovl_u8(vld1_u8(pu1_src_tmp))
    vld1.8      {d24},[r5],r2               @vld1_u8(pu1_src_tmp)
    vshl.i64    q11,q11,#6                  @vshlq_n_s64(temp, 6)
    vmovl.u8    q12,d24                     @vmovl_u8(vld1_u8(pu1_src_tmp))
    vst1.64     {d22},[r10],r6              @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    vshl.i64    q12,q12,#6                  @vshlq_n_s64(temp, 6)
    vld1.8      {d26},[r5],r2               @vld1_u8(pu1_src_tmp)
    vst1.64     {d24},[r10],r6              @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    vmovl.u8    q13,d26                     @vmovl_u8(vld1_u8(pu1_src_tmp))
    vshl.i64    q13,q13,#6                  @vshlq_n_s64(temp, 6)
    vst1.64     {d26},[r10],r6              @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    bgt         inner_loop_wd_4

end_inner_loop_wd_4:
    subs        r7,r7,#4                    @ht -= 4
    sub         r0,r5,r11                   @move pu1_src to the start of the next 4 rows
    sub         r1,r10,r11,lsl #1           @move pi2_dst to the start of the next 4 rows
    bgt         outer_loop_wd_4

end_loops:
    ldmfd       sp!,{r4-r12,r15}            @restore the registers and return (pops pc)


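@ Main path for widths that are a multiple of 8: the loop below is software
@ pipelined. The prolog loads and widens the first 8x4 block, the steady-state
@ loop stores the previous block while loading the next one, and the epilog
@ drains the last block.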
core_loop_wd_8:
    @sub            r11,r12,#8
    lsls        r5,r3,#1                    @r5 = dst_strd * 2 (byte stride of the 16-bit destination)
    rsb         r11,r12,r3, lsl #2          @r11 = (dst_strd * 4) - wd
    rsb         r8,r12,r2,lsl #2            @r8 = (src_strd * 4) - wd
    mov         r4,r12, lsr #3              @r4 = wd / 8
    mul         r7, r4                      @r7 = ht * (wd / 8)
    sub         r4,r12,#0                   @r4 = wd
    sub         r7,r7,#4                    @reserve one iteration (4 rows) for the epilog

prolog:
    add         r6,r0,r2                    @pu1_src_tmp += src_strd
    add         r10,r1,r5
    vld1.8      {d8},[r0]!                  @vld1_u8(pu1_src_tmp)
    vld1.8      {d10},[r6],r2               @vld1_u8(pu1_src_tmp)
    vld1.8      {d12},[r6],r2               @vld1_u8(pu1_src_tmp)
    vld1.8      {d14},[r6],r2               @vld1_u8(pu1_src_tmp)
    vmovl.u8    q8,d8                       @vmovl_u8(vld1_u8(pu1_src_tmp))
    vmovl.u8    q9,d10                      @vmovl_u8(vld1_u8(pu1_src_tmp))
    vmovl.u8    q10,d12                     @vmovl_u8(vld1_u8(pu1_src_tmp))
    vmovl.u8    q11,d14                     @vmovl_u8(vld1_u8(pu1_src_tmp))
    subs        r4,r4,#8                    @wd decrements by 8
    vshl.i16    q0,q8,#6                    @vshlq_n_s16(tmp, 6)
    vshl.i16    q1,q9,#6                    @vshlq_n_s16(tmp, 6)
    vshl.i16    q2,q10,#6                   @vshlq_n_s16(tmp, 6)
    vshl.i16    q3,q11,#6                   @vshlq_n_s16(tmp, 6)
    addle       r0,r0,r8                    @row of blocks done: move pu1_src to the next 4 rows
    add         r6,r0,r2                    @pu1_src_tmp += src_strd
    vld1.8      {d8},[r0]!                  @vld1_u8(pu1_src_tmp)
    vld1.8      {d10},[r6],r2               @vld1_u8(pu1_src_tmp)
    vld1.8      {d12},[r6],r2               @vld1_u8(pu1_src_tmp)
    vld1.8      {d14},[r6],r2               @vld1_u8(pu1_src_tmp)

    vst1.16     {d0,d1},[r1]!               @vst1q_s16(pi2_dst_tmp, tmp)
    addle       r1,r1,r11,lsl #1            @row of blocks done: move pi2_dst to the next 4 rows
    suble       r4,r12,#0                   @reset r4 = wd for the next row of blocks

    subs        r7,r7,#4                    @ht - 4

    blt         epilog_end                  @jumps to epilog_end
    beq         epilog                      @jumps to epilog


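@ Steady state: store the four widened rows produced in the previous
@ iteration while loading and widening the next 8x4 block.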
outer_loop_wd_8:

    vst1.16     {d2,d3},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q8,d8                       @vmovl_u8(vld1_u8(pu1_src_tmp))

    vst1.16     {d4,d5},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q9,d10                      @vmovl_u8(vld1_u8(pu1_src_tmp))

    vst1.16     {d6,d7},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q10,d12                     @vmovl_u8(vld1_u8(pu1_src_tmp))

    vmovl.u8    q11,d14                     @vmovl_u8(vld1_u8(pu1_src_tmp))

    subs        r4,r4,#8                    @wd decrements by 8
    addle       r0,r0,r8                    @row of blocks done: move pu1_src to the next 4 rows

    add         r6,r0,r2                    @pu1_src_tmp += src_strd

    vld1.8      {d8},[r0]!                  @vld1_u8(pu1_src_tmp)
    vshl.i16    q0,q8,#6                    @vshlq_n_s16(tmp, 6)

    vld1.8      {d10},[r6],r2               @vld1_u8(pu1_src_tmp)
    vshl.i16    q1,q9,#6                    @vshlq_n_s16(tmp, 6)

    vld1.8      {d12},[r6],r2               @vld1_u8(pu1_src_tmp)
    vshl.i16    q2,q10,#6                   @vshlq_n_s16(tmp, 6)

    vld1.8      {d14},[r6],r2               @vld1_u8(pu1_src_tmp)
    add         r10,r1,r5

    vshl.i16    q3,q11,#6                   @vshlq_n_s16(tmp, 6)

    vst1.16     {d0,d1},[r1]!               @vst1q_s16(pi2_dst_tmp, tmp)

    addle       r1,r1,r11,lsl #1            @row of blocks done: move pi2_dst to the next 4 rows
    suble       r4,r12,#0                   @reset r4 = wd for the next row of blocks

    subs        r7,r7,#4                    @ht - 4
    bgt         outer_loop_wd_8

epilog:
    vst1.16     {d2,d3},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q8,d8                       @vmovl_u8(vld1_u8(pu1_src_tmp))

    vst1.16     {d4,d5},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q9,d10                      @vmovl_u8(vld1_u8(pu1_src_tmp))

    vst1.16     {d6,d7},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q10,d12                     @vmovl_u8(vld1_u8(pu1_src_tmp))

    vmovl.u8    q11,d14                     @vmovl_u8(vld1_u8(pu1_src_tmp))
    @add        r6,r0,r2                    @pu1_src_tmp += src_strd

    vshl.i16    q0,q8,#6                    @vshlq_n_s16(tmp, 6)
    vshl.i16    q1,q9,#6                    @vshlq_n_s16(tmp, 6)
    vshl.i16    q2,q10,#6                   @vshlq_n_s16(tmp, 6)
    add         r10,r1,r5
    vshl.i16    q3,q11,#6                   @vshlq_n_s16(tmp, 6)

    vst1.16     {d0,d1},[r1]!               @vst1q_s16(pi2_dst_tmp, tmp)
epilog_end:
    vst1.16     {d2,d3},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    vst1.16     {d4,d5},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)
    vst1.16     {d6,d7},[r10],r5            @vst1q_s16(pi2_dst_tmp, tmp)


    ldmfd       sp!,{r4-r12,r15}            @restore the registers and return (pops pc)

    250