1 @/***************************************************************************** 2 @* 3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 4 @* 5 @* Licensed under the Apache License, Version 2.0 (the "License"); 6 @* you may not use this file except in compliance with the License. 7 @* You may obtain a copy of the License at: 8 @* 9 @* http://www.apache.org/licenses/LICENSE-2.0 10 @* 11 @* Unless required by applicable law or agreed to in writing, software 12 @* distributed under the License is distributed on an "AS IS" BASIS, 13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 @* See the License for the specific language governing permissions and 15 @* limitations under the License. 16 @* 17 @*****************************************************************************/ 18 @/** 19 @******************************************************************************* 20 @* @file 21 @* ihevc_inter_pred_chroma_copy_w16out_neon.s 22 @* 23 @* @brief 24 @* contains function definitions for inter prediction interpolation. 
@* rvct
@*
@* @author
@*  yogeswaran rs
@*
@* @par list of functions:
@*
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  chroma interprediction filter for copy
@*
@* @par description:
@*  copies the array of width 'wd' and height 'ht' from the location pointed
@*  by 'src' to the location pointed by 'dst'; each 8-bit source pixel is
@*  widened to 16 bits and left-shifted by 6 on output, i.e.
@*  pi2_dst[i] = pu1_src[i] << 6 (intermediate weighted-prediction format)
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients (unused by the copy kernel)
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array (in chroma pixel pairs; 2*wd bytes per row)
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_chroma_copy_w16out(uword8 *pu1_src,
@                                         word16 *pi2_dst,
@                                         word32 src_strd,
@                                         word32 dst_strd,
@                                         word8 *pi1_coeff,
@                                         word32 ht,
@                                         word32 wd)
@**************variables vs registers*****************************************
@r0 => *pu1_src
@r1 => *pi2_dst
@r2 => src_strd
@r3 => dst_strd
@r4 => *pi1_coeff
@r5 => ht
@r6 => wd

.text
.align 4




.globl ihevc_inter_pred_chroma_copy_w16out_a9q

.type ihevc_inter_pred_chroma_copy_w16out_a9q, %function

ihevc_inter_pred_chroma_copy_w16out_a9q:

    stmfd       sp!, {r4-r12, r14}      @stack stores the values of the arguments
                                        @10 registers pushed = 40 bytes, so the
                                        @stack args now sit at sp+40/44/48
    ldr         r12,[sp,#48]            @loads wd (7th argument)
    lsl         r12,r12,#1              @2*wd: interleaved cb/cr, bytes per row
    ldr         r7,[sp,#44]             @loads ht (6th argument)
    cmp         r7,#0                   @ht: condition(ht == 0)
    ble         end_loops               @nothing to copy, return
    and         r8,r7,#3                @r8 = ht % 4 (leftover rows)
    sub         r9,r7,r8                @r9 = ht rounded down to a multiple of 4
    and         r11,r7,#6               @bits 1..2 of ht
    cmp         r11,#6                  @true exactly when ht % 8 is 6 or 7
    beq         loop_ht_6               @those heights take the 4-wide path
    tst         r12,#7                  @conditional check for wd (multiples of 8 bytes)
    beq         core_loop_wd_8          @8-bytes-per-iteration path

loop_ht_6:
    sub         r11,r12,#4              @r11 = 2*wd - 4: rewind after a row strip
    lsls        r6,r3,#1                @r6 = 2*dst_strd: dst row stride in bytes (s16 output)
    cmp         r9,#0                   @no full 4-row group?
    beq         outer_loop_wd_4_ht_2    @then only the 2-row tail runs

outer_loop_wd_4:
    subs        r4,r12,#0               @wd conditional subtract: r4 = width counter (2*wd)
    ble         end_inner_loop_wd_4     @skip body when width <= 0

@ 4 columns x 4 rows per iteration.
@ NOTE: vshl.i64 on 16-bit data is safe here: after vmovl.u8 every halfword
@ is <= 0xff, so (x << 6) <= 0x3fc0 and no bit crosses a halfword boundary
@ inside the 64-bit lane.
inner_loop_wd_4:
    vld1.8      {d0},[r0]               @row 0: vld1_u8(pu1_src_tmp)
    add         r5,r0,r2                @pu1_src + src_strd (row 1 pointer)
    vmovl.u8    q0,d0                   @widen u8 -> u16
    add         r10,r1,r6               @row 1 destination pointer
    subs        r4,r4,#4                @wd - 4 (width counter)
    vshl.i64    q0,q0,#6                @vshlq_n_s64(temp, 6)
    vld1.8      {d22},[r5],r2           @row 1 load
    add         r0,r0,#4                @pu1_src += 4 (next column group)
    vst1.64     {d0},[r1]               @store row 0 (4 x s16)
    add         r1,r1,#8                @pi2_dst += 4 elements (8 bytes)
    vmovl.u8    q11,d22                 @row 1 widen
    vld1.8      {d24},[r5],r2           @row 2 load
    vshl.i64    q11,q11,#6              @row 1 << 6
    vmovl.u8    q12,d24                 @row 2 widen
    vst1.64     {d22},[r10],r6          @store row 1, step dst to row 2
    vshl.i64    q12,q12,#6              @row 2 << 6
    vld1.8      {d26},[r5],r2           @row 3 load
    vst1.64     {d24},[r10],r6          @store row 2, step dst to row 3
    vmovl.u8    q13,d26                 @row 3 widen
    vshl.i64    q13,q13,#6              @row 3 << 6
    vst1.64     {d26},[r10],r6          @store row 3
    bgt         inner_loop_wd_4         @more columns in this 4-row strip

end_inner_loop_wd_4:
    subs        r9,r9,#4                @ht - 4: one 4-row strip consumed
    sub         r0,r5,r11               @pu1_src -> start of next 4-row strip
    sub         r1,r10,r11,lsl #1       @pi2_dst -> start of next strip (x2: 16-bit elements)
    bgt         outer_loop_wd_4         @more full strips
    cmp         r8,#0                   @leftover rows (ht % 4)?
    bgt         outer_loop_wd_4_ht_2    @handle the final 2 rows


end_loops:
    ldmfd       sp!,{r4-r12,r15}        @reload the registers from sp; pop into pc returns


@ 2-row tail of the 4-wide path: same pattern as inner_loop_wd_4,
@ but only rows 0..1 are stored.
outer_loop_wd_4_ht_2:
    subs        r4,r12,#0               @wd conditional subtract (width counter)
    ble         end_inner_loop_wd_4     @width <= 0

inner_loop_wd_4_ht_2:
    vld1.8      {d0},[r0]               @row 0 load
    add         r5,r0,r2                @pu1_src + src_strd
    vmovl.u8    q0,d0                   @widen
    add         r10,r1,r6               @row 1 destination
    subs        r4,r4,#4                @wd - 4
    vshl.i64    q0,q0,#6                @<< 6 (see safety note above)
    vld1.8      {d22},[r5],r2           @row 1 load
    add         r0,r0,#4                @pu1_src += 4
    vst1.64     {d0},[r1]               @store row 0
    add         r1,r1,#8                @pi2_dst += 4 elements
    vmovl.u8    q11,d22                 @row 1 widen
    vld1.8      {d24},[r5],r2           @NOTE(review): loads a third row whose result
                                        @is never stored on this path — presumably a
                                        @harmless over-read; confirm buffer bounds
    vshl.i64    q11,q11,#6              @row 1 << 6
    vmovl.u8    q12,d24                 @widened but unused (see note above)
    vst1.64     {d22},[r10],r6          @store row 1
    bgt         inner_loop_wd_4_ht_2    @more columns
    b           end_loops


@ wide path: 8 columns x 4 rows per iteration, software pipelined —
@ loads for tile i+1 are issued while stores for tile i retire.
core_loop_wd_8:
    @sub r11,r12,#8
    lsls        r5,r3,#1                @r5 = 2*dst_strd: dst row stride in bytes
    rsb         r11,r12,r3, lsl #2      @r11 = (dst_strd * 4) - width (element units)
    rsb         r8,r12,r2,lsl #2        @r8 = 4*src_strd - 2*wd; r2->src_strd
    mov         r4,r12, lsr #3          @divide by 8: tiles per row
    mov         r7,r9
    mul         r7, r4                  @loop budget = rounded_ht * tiles_per_row
    sub         r4,r12,#0               @wd conditional check (width counter)
    sub         r7,r7,#4                @subtract one iteration for the epilog
    cmp         r9,#0                   @no full 4-row group?
    beq         core_loop_wd_8_ht_2     @only 2 rows remain

prolog:
    add         r6,r0,r2                @pu1_src_tmp += src_strd (row 1)
    add         r10,r1,r5               @row 1 destination
    vld1.8      {d8},[r0]!              @row 0: load 8 bytes, src += 8
    vld1.8      {d10},[r6],r2           @row 1
    vld1.8      {d12},[r6],r2           @row 2
    vld1.8      {d14},[r6],r2           @row 3
    vmovl.u8    q8,d8                   @widen rows 0..3 to u16
    vmovl.u8    q9,d10
    vmovl.u8    q10,d12
    vmovl.u8    q11,d14
    subs        r4,r4,#8                @wd decrements by 8
    vshl.i16    q0,q8,#6                @vshlq_n_s16(tmp, 6)
    vshl.i16    q1,q9,#6
    vshl.i16    q2,q10,#6
    vshl.i16    q3,q11,#6
    addle       r0,r0,r8                @row of tiles finished: src jumps down 4 rows
    add         r6,r0,r2                @start loading the next tile (pipeline fill)
    vld1.8      {d8},[r0]!
    vld1.8      {d10},[r6],r2
    vld1.8      {d12},[r6],r2
    vld1.8      {d14},[r6],r2

    vst1.16     {d0,d1},[r1]!           @store row 0 (8 x s16), dst += 16 bytes
    addle       r1,r1,r11,lsl #1        @row of tiles finished: dst jumps down 4 rows (x2 bytes)
    suble       r4,r12,#0               @and the width counter is reset

    subs        r7,r7,#4                @ht - 4 (loop budget)

    blt         epilog_end              @nothing left in flight but stores: drain only
    beq         epilog                  @exactly one pipelined tile remains



outer_loop_wd_8:

    vst1.16     {d2,d3},[r10],r5        @store previous tile, row 1
    vmovl.u8    q8,d8                   @widen current tile, row 0

    vst1.16     {d4,d5},[r10],r5        @store previous tile, row 2
    vmovl.u8    q9,d10

    vst1.16     {d6,d7},[r10],r5        @store previous tile, row 3
    vmovl.u8    q10,d12

    vmovl.u8    q11,d14

    subs        r4,r4,#8                @wd decrements by 8
    addle       r0,r0,r8                @row of tiles finished: src down 4 rows

    add         r6,r0,r2                @pu1_src_tmp += src_strd

    vld1.8      {d8},[r0]!              @load next tile while shifting the current one
    vshl.i16    q0,q8,#6

    vld1.8      {d10},[r6],r2
    vshl.i16    q1,q9,#6

    vld1.8      {d12},[r6],r2
    vshl.i16    q2,q10,#6

    vld1.8      {d14},[r6],r2
    add         r10,r1,r5               @row-1 dst used by the stores at loop top

    vshl.i16    q3,q11,#6

    vst1.16     {d0,d1},[r1]!           @store current row 0

    addle       r1,r1,r11,lsl #1        @row of tiles finished: dst down 4 rows
    suble       r4,r12,#0               @reset width counter

    subs        r7,r7,#4                @ht - 4
    bgt         outer_loop_wd_8

epilog:
    @ pipeline drain: one tile's data is still in registers
    vst1.16     {d2,d3},[r10],r5        @store previous tile, rows 1..3
    vmovl.u8    q8,d8

    vst1.16     {d4,d5},[r10],r5
    vmovl.u8    q9,d10

    vst1.16     {d6,d7},[r10],r5
    vmovl.u8    q10,d12

    vmovl.u8    q11,d14
    @add r6,r0,r2

    vshl.i16    q0,q8,#6                @shift the final tile
    vshl.i16    q1,q9,#6
    vshl.i16    q2,q10,#6
    add         r10,r1,r5
    vshl.i16    q3,q11,#6

    vst1.16     {d0,d1},[r1]!           @store final tile, row 0
epilog_end:
    vst1.16     {d2,d3},[r10],r5        @store final tile, rows 1..3
    vst1.16     {d4,d5},[r10],r5
    vst1.16     {d6,d7},[r10],r5
    b           end_loops

@ 2-row tail of the wide path (ht % 4 == 2 with 2*wd a multiple of 8):
@ 8 columns x 2 rows per iteration, no pipelining.
core_loop_wd_8_ht_2:
    add         r6,r0,r2                @pu1_src_tmp += src_strd (row 1)
    add         r10,r1,r5               @row 1 destination
    vld1.8      {d8},[r0]!              @row 0 load, src += 8
    vld1.8      {d10},[r6],r2           @row 1 load
    vmovl.u8    q8,d8                   @widen
    vmovl.u8    q9,d10
    subs        r12,r12,#8              @wd decrements by 8
    vshl.i16    q0,q8,#6                @<< 6
    vshl.i16    q1,q9,#6
    vst1.16     {d0,d1},[r1]!           @store row 0
    vst1.16     {d2,d3},[r10],r5        @store row 1
    bgt         core_loop_wd_8_ht_2

    ldmfd       sp!,{r4-r12,r15}        @reload the registers from sp; pop into pc returns