@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@*
@* @brief
@*     inter prediction luma function for copy
@*
@* @par description:
@*     copies the array of width 'wd' and height 'ht' from the location pointed
@*     by 'src' to the location pointed by 'dst', upshifting each pixel by 6
@*     bits into the 16-bit destination
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pi2_dst
@*  word16 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride (in 16-bit elements)
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients (unused by this function)
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_luma_copy_w16out (
@                                uword8 *pu1_src,
@                                word16 *pi2_dst,
@                                word32 src_strd,
@                                word32 dst_strd,
@                                word8 *pi1_coeff,
@                                word32 ht,
@                                word32 wd )

@**************variables vs registers*****************************************
@   r0 => *pu1_src
@   r1 => *pi2_dst
@   r2 => src_strd
@   r3 => dst_strd
@   r7 => ht
@   r12 => wd

.text
.align 4
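
@/**
@* a minimal c reference sketch of the operation implemented below (for
@* illustration only; assumes the prototype above, with dst_strd counted in
@* 16-bit elements): each 8-bit source pixel is widened to 16 bits and
@* shifted left by 6, matching the vshl #6 used in both width paths.
@*
@*     for(row = 0; row < ht; row++)
@*     {
@*         for(col = 0; col < wd; col++)
@*             pi2_dst[col] = (word16)(pu1_src[col] << 6);
@*         pu1_src += src_strd;
@*         pi2_dst += dst_strd;
@*     }
@*/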

.globl ihevc_inter_pred_luma_copy_w16out_a9q

.type ihevc_inter_pred_luma_copy_w16out_a9q, %function

ihevc_inter_pred_luma_copy_w16out_a9q:

    stmfd sp!, {r4-r12, r14}        @stack stores the values of the arguments
    ldr r12,[sp,#48]                @loads wd
    ldr r7,[sp,#44]                 @loads ht
    cmp r7,#0                       @checks ht == 0
    ble end_loops                   @exit if ht <= 0
    tst r12,#7                      @checks whether wd is a multiple of 8
    beq core_loop_wd_8
    sub r11,r12,#4                  @r11 = wd - 4
    lsls r6,r3,#1                   @r6 = dst_strd * 2 (dst stride in bytes)

outer_loop_wd_4:
    subs r4,r12,#0                  @r4 = wd, checks wd == 0
    ble end_inner_loop_wd_4

inner_loop_wd_4:
    vld1.8 {d0},[r0]                @vld1_u8(pu1_src_tmp)
    add r5,r0,r2                    @pu1_src + src_strd
    vmovl.u8 q0,d0                  @vmovl_u8(vld1_u8(pu1_src_tmp))
    add r10,r1,r6
    subs r4,r4,#4                   @wd - 4
    vshl.i64 q0,q0,#6               @vshlq_n_s64(temp, 6)
    vld1.8 {d22},[r5],r2            @vld1_u8(pu1_src_tmp)
    add r0,r0,#4                    @pu1_src += 4
    vst1.64 {d0},[r1]               @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    add r1,r1,#8
    vmovl.u8 q11,d22                @vmovl_u8(vld1_u8(pu1_src_tmp))
    vld1.8 {d24},[r5],r2            @vld1_u8(pu1_src_tmp)
    vshl.i64 q11,q11,#6             @vshlq_n_s64(temp, 6)
    vmovl.u8 q12,d24                @vmovl_u8(vld1_u8(pu1_src_tmp))
    vst1.64 {d22},[r10],r6          @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    vshl.i64 q12,q12,#6             @vshlq_n_s64(temp, 6)
    vld1.8 {d26},[r5],r2            @vld1_u8(pu1_src_tmp)
    vst1.64 {d24},[r10],r6          @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    vmovl.u8 q13,d26                @vmovl_u8(vld1_u8(pu1_src_tmp))
    vshl.i64 q13,q13,#6             @vshlq_n_s64(temp, 6)
    vst1.64 {d26},[r10],r6          @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    bgt inner_loop_wd_4

end_inner_loop_wd_4:
    subs r7,r7,#4                   @ht -= 4
    sub r0,r5,r11                   @pu1_src advances to the next 4 rows
    sub r1,r10,r11,lsl #1           @pi2_dst advances to the next 4 rows
    bgt outer_loop_wd_4

end_loops:
    ldmfd sp!,{r4-r12,r15}          @reload the registers from sp
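
@/**
@* the wd-multiple-of-8 path below is software-pipelined: the prolog and each
@* pass of outer_loop_wd_8 issue the loads for the next 8x4 block while the
@* widen, shift and stores of the current block complete. a sketch of the
@* iteration counting, with illustrative names (only r7 exists in the code):
@*
@*     count = (wd >> 3) * ht;    /* r7; 4 counts per 8x4 block          */
@*     count -= 4;                /* last block is retired in the epilog */
@*     /* each pass of outer_loop_wd_8 subtracts 4 from r7 */
@*/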

core_loop_wd_8:
    @sub r11,r12,#8
    lsls r5,r3,#1                   @r5 = dst_strd * 2 (dst stride in bytes)
    rsb r11,r12,r3, lsl #2          @r11 = (dst_strd * 4) - wd
    rsb r8,r12,r2,lsl #2            @r8 = (src_strd * 4) - wd
    mov r4,r12, lsr #3              @r4 = wd / 8
    mul r7, r4                      @r7 = ht * (wd / 8)
    sub r4,r12,#0                   @r4 = wd
    sub r7,r7,#4                    @subtract one iteration (4 rows) for the epilog

prolog:
    add r6,r0,r2                    @pu1_src_tmp += src_strd
    add r10,r1,r5
    vld1.8 {d8},[r0]!               @vld1_u8(pu1_src_tmp)
    vld1.8 {d10},[r6],r2            @vld1_u8(pu1_src_tmp)
    vld1.8 {d12},[r6],r2            @vld1_u8(pu1_src_tmp)
    vld1.8 {d14},[r6],r2            @vld1_u8(pu1_src_tmp)
    vmovl.u8 q8,d8                  @vmovl_u8(vld1_u8(pu1_src_tmp))
    vmovl.u8 q9,d10                 @vmovl_u8(vld1_u8(pu1_src_tmp))
    vmovl.u8 q10,d12                @vmovl_u8(vld1_u8(pu1_src_tmp))
    vmovl.u8 q11,d14                @vmovl_u8(vld1_u8(pu1_src_tmp))
    subs r4,r4,#8                   @wd decrements by 8
    vshl.i16 q0,q8,#6               @vshlq_n_s16(tmp, 6)
    vshl.i16 q1,q9,#6               @vshlq_n_s16(tmp, 6)
    vshl.i16 q2,q10,#6              @vshlq_n_s16(tmp, 6)
    vshl.i16 q3,q11,#6              @vshlq_n_s16(tmp, 6)
    addle r0,r0,r8                  @row strip done: advance src to next 4 rows
    add r6,r0,r2                    @pu1_src_tmp += src_strd
    vld1.8 {d8},[r0]!               @vld1_u8(pu1_src_tmp)
    vld1.8 {d10},[r6],r2            @vld1_u8(pu1_src_tmp)
    vld1.8 {d12},[r6],r2            @vld1_u8(pu1_src_tmp)
    vld1.8 {d14},[r6],r2            @vld1_u8(pu1_src_tmp)

    vst1.16 {d0,d1},[r1]!           @vst1q_s16(pi2_dst_tmp, tmp)
    addle r1,r1,r11,lsl #1          @row strip done: advance dst to next 4 rows
    suble r4,r12,#0                 @reset r4 = wd

    subs r7,r7,#4                   @ht - 4

    blt epilog_end                  @jumps to epilog_end
    beq epilog                      @jumps to epilog

outer_loop_wd_8:

    vst1.16 {d2,d3},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8 q8,d8                  @vmovl_u8(vld1_u8(pu1_src_tmp))

    vst1.16 {d4,d5},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8 q9,d10                 @vmovl_u8(vld1_u8(pu1_src_tmp))

    vst1.16 {d6,d7},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8 q10,d12                @vmovl_u8(vld1_u8(pu1_src_tmp))

    vmovl.u8 q11,d14                @vmovl_u8(vld1_u8(pu1_src_tmp))

    subs r4,r4,#8                   @wd decrements by 8
    addle r0,r0,r8                  @row strip done: advance src to next 4 rows

    add r6,r0,r2                    @pu1_src_tmp += src_strd

    vld1.8 {d8},[r0]!               @vld1_u8(pu1_src_tmp)
    vshl.i16 q0,q8,#6               @vshlq_n_s16(tmp, 6)

    vld1.8 {d10},[r6],r2            @vld1_u8(pu1_src_tmp)
    vshl.i16 q1,q9,#6               @vshlq_n_s16(tmp, 6)

    vld1.8 {d12},[r6],r2            @vld1_u8(pu1_src_tmp)
    vshl.i16 q2,q10,#6              @vshlq_n_s16(tmp, 6)

    vld1.8 {d14},[r6],r2            @vld1_u8(pu1_src_tmp)
    add r10,r1,r5

    vshl.i16 q3,q11,#6              @vshlq_n_s16(tmp, 6)

    vst1.16 {d0,d1},[r1]!           @vst1q_s16(pi2_dst_tmp, tmp)

    addle r1,r1,r11,lsl #1          @row strip done: advance dst to next 4 rows
    suble r4,r12,#0                 @reset r4 = wd

    subs r7,r7,#4                   @ht - 4
    bgt outer_loop_wd_8

epilog:
    vst1.16 {d2,d3},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8 q8,d8                  @vmovl_u8(vld1_u8(pu1_src_tmp))

    vst1.16 {d4,d5},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8 q9,d10                 @vmovl_u8(vld1_u8(pu1_src_tmp))

    vst1.16 {d6,d7},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8 q10,d12                @vmovl_u8(vld1_u8(pu1_src_tmp))

    vmovl.u8 q11,d14                @vmovl_u8(vld1_u8(pu1_src_tmp))
    @add r6,r0,r2                   @pu1_src_tmp += src_strd

    vshl.i16 q0,q8,#6               @vshlq_n_s16(tmp, 6)
    vshl.i16 q1,q9,#6               @vshlq_n_s16(tmp, 6)
    vshl.i16 q2,q10,#6              @vshlq_n_s16(tmp, 6)
    add r10,r1,r5
    vshl.i16 q3,q11,#6              @vshlq_n_s16(tmp, 6)

    vst1.16 {d0,d1},[r1]!           @vst1q_s16(pi2_dst_tmp, tmp)
epilog_end:
    vst1.16 {d2,d3},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
    vst1.16 {d4,d5},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
    vst1.16 {d6,d7},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)

    ldmfd sp!,{r4-r12,r15}          @reload the registers from sp
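
@/**
@* a hypothetical call site, for illustration (variable names assumed): both
@* width paths step four rows at a time, so ht is expected to be a multiple
@* of 4 and wd a multiple of 4 or 8. pi1_coeff is never read by this copy
@* variant and may be passed as null.
@*
@*     ihevc_inter_pred_luma_copy_w16out_a9q(pu1_src, pi2_dst,
@*                                           src_strd, dst_strd,
@*                                           NULL, 16, 16);
@*/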