@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@******************************************************************************
@* @file
@*  ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s
@*
@* @brief
@*  Contains function definitions for inter prediction interpolation.
@*
@* @author
@*  Mohit
@*
@* @par List of Functions:
@*
@*  - ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*

@* All the functions here are replicated from ih264_inter_pred_filters.c
@

@*******************************************************************************
@*
@* @brief
@*  This function implements two six tap filters. It
@*  applies the six tap filter in the horizontal direction on the
@*  predictor values, then applies the same filter in the
@*  vertical direction on the predictor values. It then averages these
@*  two outputs to obtain quarter pel values in horizontal and vertical direction.
@*  The six tap filtering operation is described in sec 8.4.2.2.1 titled
@*  "Luma sample interpolation process"
@*
@* @par Description:
@*  This function is called to obtain pixels lying at the following
@*  locations: (1/4,1/4), (3/4,1/4), (1/4,3/4) or (3/4,3/4).
@*  Every output pixel is the rounded average of one horizontally filtered
@*  sample and one vertically filtered sample, each produced by the six tap
@*  filter (1, -5, 20, 20, -5, 1) with rounding shift by 5 and saturation
@*  to 8 bits.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @param[in] pu1_tmp
@*  temporary buffer (its stack slot is never read by this routine)
@*
@* @param[in] dydx
@*  x and y reference offset for qpel calculations:
@*  bits [1:0] hold the x offset, bits [3:2] hold the y offset
@*
@* @returns
@*  None (the prediction is stored through pu1_dst)
@*
@* @remarks
@*  wd == 4 and wd == 8 select dedicated paths, every other width runs the
@*  16 pixel wide path. Each path produces two rows per iteration and
@*  leaves only when the row counter reaches exactly zero.
@*
@*******************************************************************************

@void ih264_inter_pred_luma_horz_qpel_vert_qpel(UWORD8 *pu1_src,
@                                               UWORD8 *pu1_dst,
@                                               WORD32 src_strd,
@                                               WORD32 dst_strd,
@                                               WORD32 ht,
@                                               WORD32 wd,
@                                               UWORD8 *pu1_tmp,
@                                               UWORD32 dydx)

@**************Variables Vs Registers*****************************************
@   r0        => pu1_src
@   r1        => pu1_dst   (post-incremented by dst_strd on every row store)
@   r2        => src_strd
@   r3        => dst_strd
@   r4        => ht        (rows still to be produced)
@   r5        => wd
@   r6        => dydx on load, afterwards the horizontal-filter source pointer
@   r7        => vertical-filter source pointer
@   r11       => horizontal-filter source pointer + 8 (16 wide path only)
@   d30 / d31 => filter coefficients 20 / 5, replicated in every byte lane
@   q13       => vertical filter results (d26 = row n, d27 = row n+1 or cols 8-15)
@   q14       => horizontal filter results (d28 / d29, same layout as q13)

@ Stack layout once the prologue has run: 10 core registers (40 bytes) plus
@ d8-d15 (64 bytes) sit below the stacked C arguments.
    .equ          SAVED_BYTES, 104
    .equ          ARG_HT, SAVED_BYTES + 0
    .equ          ARG_WD, SAVED_BYTES + 4
    .equ          ARG_DYDX, SAVED_BYTES + 12  @ the slot at +8 holds pu1_tmp

@ Widening six tap accumulation on eight byte lanes:
@   acc = p0 + p5 + 20 * (p2 + p3) - 5 * (p1 + p4)
@ Expects d30 = 20 and d31 = 5 in every lane.
.macro sixtap_u8 acc, p0, p1, p2, p3, p4, p5
    vaddl.u8      \acc, \p0, \p5
    vmlal.u8      \acc, \p2, d30
    vmlal.u8      \acc, \p3, d30
    vmlsl.u8      \acc, \p1, d31
    vmlsl.u8      \acc, \p4, d31
.endm

.text
.p2align 2

    .global ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q

ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q:

    push          {r4-r12, lr}              @ save core registers and return address
    vpush         {d8-d15}                  @ save the NEON registers used below
    ldr           r4, [sp, #ARG_HT]         @ r4 = ht
    ldr           r5, [sp, #ARG_WD]         @ r5 = wd
    ldr           r6, [sp, #ARG_DYDX]       @ r6 = dydx

    and           r7, r6, #3                @ x_offset = dydx & 3
    add           r7, r0, r7, lsr #1        @ vert ptr = pu1_src + (x_offset >> 1)

    and           r6, r6, #12               @ keep the y_offset bits (dydx[3:2])
    lsr           r6, r6, #3                @ r6 = y_offset >> 1
    mla           r6, r2, r6, r0            @ horz ptr = pu1_src + (y_offset >> 1) * src_strd

    sub           r7, r7, r2, lsl #1        @ vertical taps start two rows above
    sub           r6, r6, #2                @ horizontal taps start two pixels to the left
    vmov.u8       d30, #20                  @ centre tap weight
    vmov.u8       d31, #5                   @ magnitude of the negative tap weight

    cmp           r5, #4                    @ wd == 4 ?
    beq           .Lrows_wd4
    cmp           r5, #8                    @ wd == 8 ?
    beq           .Lrows_wd8
                                            @ any other width: 16 wide path

@ ---------------------------------------------------------------------------
@ 16 pixels wide, two rows per pass
@ ---------------------------------------------------------------------------
.Lrows_wd16:
    vld1.32       {q0}, [r7], r2            @ vertical tap row 0
    vld1.32       {q1}, [r7], r2            @ vertical tap row 1
    vld1.32       {q2}, [r7], r2            @ vertical tap row 2
    vld1.32       {q3}, [r7], r2            @ vertical tap row 3
    vld1.32       {q4}, [r7], r2            @ vertical tap row 4
    add           r11, r6, #8               @ horizontal source for cols 8-15
    vld1.32       {q5}, [r7], r2            @ vertical tap row 5
    vld1.32       {q9}, [r6], r2            @ horizontal source, out row 0, cols 0-7

    sixtap_u8     q12, d0, d2, d4, d6, d8, d10      @ vert, out row 0, cols 0-7
    vext.8        d23, d18, d19, #5         @ build the five shifted tap windows
    vext.8        d20, d18, d19, #2
    vext.8        d21, d18, d19, #3
    vext.8        d22, d18, d19, #4
    vext.8        d19, d18, d19, #1
    vqrshrun.s16  d26, q12, #5              @ d26 = vert, out row 0, cols 0-7
    sixtap_u8     q14, d18, d19, d20, d21, d22, d23 @ horz, out row 0, cols 0-7
    vld1.32       {q9}, [r11], r2           @ horizontal source, out row 0, cols 8-15
    sixtap_u8     q12, d1, d3, d5, d7, d9, d11      @ vert, out row 0, cols 8-15
    vqrshrun.s16  d28, q14, #5              @ d28 = horz, out row 0, cols 0-7
    vext.8        d23, d18, d19, #5
    vext.8        d20, d18, d19, #2
    vext.8        d21, d18, d19, #3
    vext.8        d22, d18, d19, #4
    vext.8        d19, d18, d19, #1
    vqrshrun.s16  d27, q12, #5              @ d27 = vert, out row 0, cols 8-15
    vld1.32       {q6}, [r7], r2            @ vertical tap row 6 (needed by out row 1)

    sixtap_u8     q12, d18, d19, d20, d21, d22, d23 @ horz, out row 0, cols 8-15

    sixtap_u8     q8, d2, d4, d6, d8, d10, d12      @ vert, out row 1, cols 0-7
    vqrshrun.s16  d29, q12, #5              @ d29 = horz, out row 0, cols 8-15
    vld1.32       {q9}, [r6], r2            @ horizontal source, out row 1, cols 0-7

    sixtap_u8     q12, d3, d5, d7, d9, d11, d13     @ vert, out row 1, cols 8-15
    vrhadd.u8     q14, q14, q13             @ out row 0 = rounded average of horz and vert
    vqrshrun.s16  d26, q8, #5               @ d26 = vert, out row 1, cols 0-7
    vext.8        d23, d18, d19, #5
    vext.8        d20, d18, d19, #2
    vext.8        d21, d18, d19, #3
    vext.8        d22, d18, d19, #4
    vst1.32       {q14}, [r1], r3           @ store out row 0
    vext.8        d19, d18, d19, #1
    vqrshrun.s16  d27, q12, #5              @ d27 = vert, out row 1, cols 8-15

    sixtap_u8     q14, d18, d19, d20, d21, d22, d23 @ horz, out row 1, cols 0-7

    vld1.32       {q9}, [r11], r2           @ horizontal source, out row 1, cols 8-15

    vext.8        d23, d18, d19, #5
    vext.8        d20, d18, d19, #2
    vext.8        d21, d18, d19, #3
    vext.8        d22, d18, d19, #4
    vext.8        d19, d18, d19, #1

    vqrshrun.s16  d28, q14, #5              @ d28 = horz, out row 1, cols 0-7
    sixtap_u8     q12, d18, d19, d20, d21, d22, d23 @ horz, out row 1, cols 8-15

    vqrshrun.s16  d29, q12, #5              @ d29 = horz, out row 1, cols 8-15
    vrhadd.u8     q14, q14, q13             @ out row 1 = rounded average of horz and vert
    vst1.32       {q14}, [r1], r3           @ store out row 1

    subs          r4, r4, #2                @ two more rows finished
    beq           .Ldone                    @ nothing left to produce
    sub           r7, r7, r2, lsl #2        @ seven rows were consumed; step back
    sub           r7, r7, r2                @ five so the window advances by two
    b             .Lrows_wd16

@ ---------------------------------------------------------------------------
@ 8 pixels wide, two rows per pass
@ ---------------------------------------------------------------------------
.Lrows_wd8:
    vld1.32       {d0}, [r7], r2            @ vertical tap row 0
    vld1.32       {d1}, [r7], r2            @ vertical tap row 1
    vld1.32       {d2}, [r7], r2            @ vertical tap row 2
    vld1.32       {d3}, [r7], r2            @ vertical tap row 3
    vld1.32       {d4}, [r7], r2            @ vertical tap row 4
    vld1.32       {d5}, [r7], r2            @ vertical tap row 5
    sixtap_u8     q5, d0, d1, d2, d3, d4, d5        @ vert, out row 0
    vld1.32       {q6}, [r6], r2            @ horizontal source, out row 0
    vext.8        d17, d12, d13, #5         @ build the five shifted tap windows
    vext.8        d14, d12, d13, #2
    vext.8        d15, d12, d13, #3
    vext.8        d16, d12, d13, #4
    vext.8        d13, d12, d13, #1
    vqrshrun.s16  d26, q5, #5               @ d26 = vert, out row 0
    vld1.32       {d6}, [r7], r2            @ vertical tap row 6 (needed by out row 1)
    sixtap_u8     q5, d12, d13, d14, d15, d16, d17  @ horz, out row 0
    vld1.32       {q6}, [r6], r2            @ horizontal source, out row 1
    sixtap_u8     q9, d1, d2, d3, d4, d5, d6        @ vert, out row 1
    vqrshrun.s16  d28, q5, #5               @ d28 = horz, out row 0
    vext.8        d17, d12, d13, #5
    vext.8        d14, d12, d13, #2
    vext.8        d15, d12, d13, #3
    vext.8        d16, d12, d13, #4
    vext.8        d13, d12, d13, #1
    vqrshrun.s16  d27, q9, #5               @ d27 = vert, out row 1
    sixtap_u8     q5, d12, d13, d14, d15, d16, d17  @ horz, out row 1
    vqrshrun.s16  d29, q5, #5               @ d29 = horz, out row 1
    vrhadd.u8     q13, q13, q14             @ both rows: rounded average of vert and horz
    vst1.32       {d26}, [r1], r3           @ store out row 0
    vst1.32       {d27}, [r1], r3           @ store out row 1

    subs          r4, r4, #2                @ two more rows finished
    beq           .Ldone                    @ nothing left to produce
    sub           r7, r7, r2, lsl #2        @ rewind five rows: the window
    sub           r7, r7, r2                @ advances by two per pass
    b             .Lrows_wd8

@ ---------------------------------------------------------------------------
@ 4 pixels wide, two rows per pass. Only lane 0 of each vertical tap row is
@ loaded and only lane 0 of each result is stored.
@ ---------------------------------------------------------------------------
.Lrows_wd4:
    vld1.32       {d0[0]}, [r7], r2         @ vertical tap row 0
    vld1.32       {d1[0]}, [r7], r2         @ vertical tap row 1
    vld1.32       {d2[0]}, [r7], r2         @ vertical tap row 2
    vld1.32       {d3[0]}, [r7], r2         @ vertical tap row 3
    vld1.32       {d4[0]}, [r7], r2         @ vertical tap row 4
    vld1.32       {d5[0]}, [r7], r2         @ vertical tap row 5
    sixtap_u8     q5, d0, d1, d2, d3, d4, d5        @ vert, out row 0
    vld1.32       {q6}, [r6], r2            @ horizontal source, out row 0
    vext.8        d17, d12, d13, #5         @ build the five shifted tap windows
    vext.8        d14, d12, d13, #2
    vext.8        d15, d12, d13, #3
    vext.8        d16, d12, d13, #4
    vext.8        d13, d12, d13, #1
    vqrshrun.s16  d26, q5, #5               @ d26 = vert, out row 0
    vld1.32       {d6[0]}, [r7], r2         @ vertical tap row 6 (needed by out row 1)
    sixtap_u8     q5, d12, d13, d14, d15, d16, d17  @ horz, out row 0
    vld1.32       {q6}, [r6], r2            @ horizontal source, out row 1
    sixtap_u8     q9, d1, d2, d3, d4, d5, d6        @ vert, out row 1
    vqrshrun.s16  d28, q5, #5               @ d28 = horz, out row 0
    vext.8        d17, d12, d13, #5
    vext.8        d14, d12, d13, #2
    vext.8        d15, d12, d13, #3
    vext.8        d16, d12, d13, #4
    vext.8        d13, d12, d13, #1
    vqrshrun.s16  d27, q9, #5               @ d27 = vert, out row 1
    sixtap_u8     q5, d12, d13, d14, d15, d16, d17  @ horz, out row 1
    vqrshrun.s16  d29, q5, #5               @ d29 = horz, out row 1
    vrhadd.u8     q13, q13, q14             @ both rows: rounded average of vert and horz
    vst1.32       {d26[0]}, [r1], r3        @ store 4 pixels of out row 0
    vst1.32       {d27[0]}, [r1], r3        @ store 4 pixels of out row 1

    subs          r4, r4, #2                @ two more rows finished
    subne         r7, r7, r2, lsl #2        @ rows remain: rewind five rows so
    subne         r7, r7, r2                @ the window advances by two
    bne           .Lrows_wd4                @ otherwise fall through to the exit

.Ldone:
    vpop          {d8-d15}                  @ restore the saved NEON registers
    pop           {r4-r12, pc}              @ restore core registers and return

