@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@******************************************************************************
@* @file
@*  ih264_inter_pred_luma_bilinear_a9q.s
@*
@* @brief
@*  Contains function definitions for inter prediction interpolation.
@*
@* @author
@*  Ittiam
@*
@* @par List of Functions:
@*
@*  - ih264_inter_pred_luma_bilinear_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*

@* All the functions here are replicated from ih264_inter_pred_filters.c
@

@**
@**
@**
@ *******************************************************************************
@ *  function:ih264_inter_pred_luma_bilinear
@ *
@* @brief
@*    This routine applies the bilinear filter to the predictors.
@*    The filtering operation is described in
@*    sec 8.4.2.2.1 titled "Luma sample interpolation process"
@*
@* @par Description:
@*    This function is called to obtain pixels lying at the following
@*    locations: (1/4,1), (3/4,1), (1,1/4), (1,3/4), (1/4,1/2), (3/4,1/2),
@*    (1/2,1/4), (1/2,3/4), (3/4,1/4), (1/4,3/4), (3/4,3/4) and (1/4,1/4).
@*    Every output sample is the rounded average of the co-located samples
@*    of the two input arrays:
@*        dst[y][x] = (src1[y][x] + src2[y][x] + 1) >> 1
@*
@* @note
@*    The code is fully unrolled and only distinguishes these cases:
@*      - width == 4      : 4 rows if height == 4, otherwise 8 rows
@*      - width == 8      : 4 rows if height == 4, 8 rows if height == 8,
@*                          otherwise 16 rows
@*      - any other width : handled as width 16; 8 rows if height == 8,
@*                          otherwise 16 rows
@*
@* @param[in] pu1_src1
@*  UWORD8 pointer to the buffer containing the first input array.
@*
@* @param[in] pu1_src2
@*  UWORD8 pointer to the buffer containing the second input array.
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination where the output of the bilinear
@*  filter is stored.
@*
@* @param[in] src_strd1
@*  Stride of the first input buffer
@*
@* @param[in] src_strd2
@*  Stride of the second input buffer
@*
@* @param[in] dst_strd
@*  Integer destination stride of pu1_dst
@*
@* @param[in] height
@*  Integer height of the array
@*
@* @param[in] width
@*  Integer width of the array
@*
@* @returns
@*  None
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*

@void ih264_inter_pred_luma_bilinear(UWORD8 *pu1_src1,
@                                    UWORD8 *pu1_src2,
@                                    UWORD8 *pu1_dst,
@                                    WORD32 src_strd1,
@                                    WORD32 src_strd2,
@                                    WORD32 dst_strd,
@                                    WORD32 height,
@                                    WORD32 width)
@
@**************Variables Vs Registers*****************************************
@   r0 => *pu1_src1     (post-incremented by src_strd1 after every row load)
@   r1 => *pu1_src2     (post-incremented by src_strd2 after every row load)
@   r2 => *pu1_dst      (post-incremented by dst_strd after every row store)
@   r3 => src_strd1
@   r4 => src_strd2     (loaded from the stack)
@   r5 => dst_strd      (loaded from the stack)
@   r6 => height        (loaded from the stack)
@   r7 => width         (loaded from the stack)
@
@   q0-q7   : source rows (q4-q7 = d8-d15 are saved/restored here)
@   q8-q13  : 16-bit sums src1 + src2
@   q14-q15 : narrowed 8-bit results waiting to be stored
@
@   Stack on entry to the body: 10 core registers (40 bytes) plus d8-d15
@   (64 bytes) are pushed, so the first stack argument sits at [sp, #104].
@
    .text
    .p2align 2

    .global ih264_inter_pred_luma_bilinear_a9q

ih264_inter_pred_luma_bilinear_a9q:

    stmfd         sp!, {r4-r12, r14}    @ save core registers (40 bytes)
    vstmdb        sp!, {d8-d15}         @ save NEON registers d8-d15 (64 bytes)
    ldr           r4, [sp, #104]        @ r4 = src_strd2
    ldr           r5, [sp, #108]        @ r5 = dst_strd
    ldr           r6, [sp, #112]        @ r6 = height
    ldr           r7, [sp, #116]        @ r7 = width

    cmp           r7, #4                @ width == 4 -> loop_4
    beq           loop_4
    cmp           r7, #8                @ width == 8 -> loop_8
    beq           loop_8
                                        @ anything else is handled as width 16

@ ---------------------------------------------------------------------------
@ width 16: one q register per row; low/high halves are summed separately
@ ---------------------------------------------------------------------------
loop_16:

    @ rows 0-3
    vld1.8        {q0}, [r0], r3        @ load row0, src1
    vld1.8        {q2}, [r1], r4        @ load row0, src2
    vld1.8        {q1}, [r0], r3        @ load row1, src1
    vaddl.u8      q10, d0, d4           @ row0 low half : src1 + src2
    vld1.8        {q3}, [r1], r4        @ load row1, src2
    vaddl.u8      q11, d1, d5           @ row0 high half
    vld1.8        {q4}, [r0], r3        @ load row2, src1
    vaddl.u8      q12, d2, d6           @ row1 low half
    vld1.8        {q5}, [r0], r3        @ load row3, src1
    vaddl.u8      q13, d3, d7           @ row1 high half
    vld1.8        {q6}, [r1], r4        @ load row2, src2
    vaddl.u8      q8, d8, d12           @ row2 low half
    vld1.8        {q7}, [r1], r4        @ load row3, src2
    vaddl.u8      q9, d9, d13           @ row2 high half
    vqrshrun.s16  d28, q10, #1          @ (sum + 1) >> 1, narrowed to 8 bit
    vqrshrun.s16  d29, q11, #1
    vaddl.u8      q10, d10, d14         @ row3 low half
    vqrshrun.s16  d30, q12, #1
    vqrshrun.s16  d31, q13, #1
    vst1.8        {q14}, [r2], r5       @ store dest row0
    vaddl.u8      q11, d11, d15         @ row3 high half
    vst1.8        {q15}, [r2], r5       @ store dest row1
    vqrshrun.s16  d28, q8, #1

    @ rows 4-7 (loads interleaved with the tail of rows 2-3)
    vld1.8        {q0}, [r0], r3        @ load row4, src1
    vqrshrun.s16  d29, q9, #1
    vld1.8        {q1}, [r0], r3        @ load row5, src1
    vqrshrun.s16  d30, q10, #1
    vld1.8        {q2}, [r1], r4        @ load row4, src2
    vqrshrun.s16  d31, q11, #1
    vld1.8        {q3}, [r1], r4        @ load row5, src2
    vaddl.u8      q10, d0, d4           @ row4 low half
    vst1.8        {q14}, [r2], r5       @ store dest row2
    vaddl.u8      q13, d3, d7           @ row5 high half
    vst1.8        {q15}, [r2], r5       @ store dest row3
    vaddl.u8      q11, d1, d5           @ row4 high half
    vld1.8        {q4}, [r0], r3        @ load row6, src1
    vaddl.u8      q12, d2, d6           @ row5 low half
    vld1.8        {q5}, [r0], r3        @ load row7, src1
    vqrshrun.s16  d28, q10, #1
    vld1.8        {q6}, [r1], r4        @ load row6, src2
    vqrshrun.s16  d29, q11, #1
    vld1.8        {q7}, [r1], r4        @ load row7, src2
    vaddl.u8      q8, d8, d12           @ row6 low half
    vaddl.u8      q9, d9, d13           @ row6 high half
    vaddl.u8      q10, d10, d14         @ row7 low half
    vqrshrun.s16  d30, q12, #1
    vqrshrun.s16  d31, q13, #1
    vst1.8        {q14}, [r2], r5       @ store dest row4
    vaddl.u8      q11, d11, d15         @ row7 high half
    vst1.8        {q15}, [r2], r5       @ store dest row5
    vqrshrun.s16  d28, q8, #1
    vqrshrun.s16  d30, q10, #1
    vqrshrun.s16  d29, q9, #1
    vqrshrun.s16  d31, q11, #1
    vst1.8        {q14}, [r2], r5       @ store dest row6
    cmp           r6, #8                @ NEON stores below leave the flags intact
    vst1.8        {q15}, [r2], r5       @ store dest row7

    beq           end_func              @ done if height == 8

    @ rows 8-11
    @ The row8 src2 load is issued only after the height check so that a
    @ 16x8 block never reads a row beyond the eight rows it processes.
    vld1.8        {q2}, [r1], r4        @ load row8, src2
    vld1.8        {q0}, [r0], r3        @ load row8, src1
    vaddl.u8      q10, d0, d4           @ row8 low half
    vld1.8        {q1}, [r0], r3        @ load row9, src1
    vaddl.u8      q11, d1, d5           @ row8 high half
    vld1.8        {q3}, [r1], r4        @ load row9, src2
    vqrshrun.s16  d28, q10, #1
    vld1.8        {q4}, [r0], r3        @ load row10, src1
    vqrshrun.s16  d29, q11, #1
    vld1.8        {q5}, [r0], r3        @ load row11, src1
    vaddl.u8      q12, d2, d6           @ row9 low half
    vld1.8        {q6}, [r1], r4        @ load row10, src2
    vaddl.u8      q13, d3, d7           @ row9 high half
    vld1.8        {q7}, [r1], r4        @ load row11, src2
    vaddl.u8      q8, d8, d12           @ row10 low half
    vaddl.u8      q9, d9, d13           @ row10 high half
    vaddl.u8      q10, d10, d14         @ row11 low half
    vqrshrun.s16  d30, q12, #1
    vst1.8        {q14}, [r2], r5       @ store dest row8
    vqrshrun.s16  d31, q13, #1
    vst1.8        {q15}, [r2], r5       @ store dest row9
    vqrshrun.s16  d28, q8, #1

    @ rows 12-15
    vld1.8        {q0}, [r0], r3        @ load row12, src1
    vaddl.u8      q11, d11, d15         @ row11 high half
    vld1.8        {q1}, [r0], r3        @ load row13, src1
    vqrshrun.s16  d29, q9, #1
    vld1.8        {q2}, [r1], r4        @ load row12, src2
    vqrshrun.s16  d30, q10, #1
    vld1.8        {q3}, [r1], r4        @ load row13, src2
    vqrshrun.s16  d31, q11, #1
    vst1.8        {q14}, [r2], r5       @ store dest row10
    vaddl.u8      q10, d0, d4           @ row12 low half
    vst1.8        {q15}, [r2], r5       @ store dest row11
    vaddl.u8      q11, d1, d5           @ row12 high half
    vld1.8        {q4}, [r0], r3        @ load row14, src1
    vaddl.u8      q13, d3, d7           @ row13 high half
    vld1.8        {q5}, [r0], r3        @ load row15, src1
    vaddl.u8      q12, d2, d6           @ row13 low half
    vld1.8        {q6}, [r1], r4        @ load row14, src2
    vaddl.u8      q8, d8, d12           @ row14 low half
    vld1.8        {q7}, [r1], r4        @ load row15, src2
    vaddl.u8      q9, d9, d13           @ row14 high half
    vqrshrun.s16  d28, q10, #1
    vqrshrun.s16  d29, q11, #1
    vaddl.u8      q10, d10, d14         @ row15 low half
    vst1.8        {q14}, [r2], r5       @ store dest row12
    vqrshrun.s16  d30, q12, #1
    vqrshrun.s16  d31, q13, #1
    vaddl.u8      q11, d11, d15         @ row15 high half
    vst1.8        {q15}, [r2], r5       @ store dest row13
    vqrshrun.s16  d28, q8, #1
    vqrshrun.s16  d29, q9, #1
    vqrshrun.s16  d30, q10, #1
    vst1.8        {q14}, [r2], r5       @ store dest row14
    vqrshrun.s16  d31, q11, #1
    vst1.8        {q15}, [r2], r5       @ store dest row15
    b             end_func

@ ---------------------------------------------------------------------------
@ width 8: one d register per row
@ ---------------------------------------------------------------------------
loop_8:

    @ rows 0-3
    vld1.8        {d0}, [r0], r3        @ load row0, src1
    vld1.8        {d4}, [r1], r4        @ load row0, src2
    vld1.8        {d1}, [r0], r3        @ load row1, src1
    vaddl.u8      q10, d0, d4           @ row0 : src1 + src2
    vld1.8        {d5}, [r1], r4        @ load row1, src2
    vld1.8        {d2}, [r0], r3        @ load row2, src1
    vqrshrun.s16  d28, q10, #1          @ (sum + 1) >> 1, narrowed to 8 bit
    vld1.8        {d6}, [r1], r4        @ load row2, src2
    vaddl.u8      q11, d1, d5           @ row1
    vld1.8        {d3}, [r0], r3        @ load row3, src1
    vaddl.u8      q12, d2, d6           @ row2
    vst1.8        {d28}, [r2], r5       @ store dest row0
    vqrshrun.s16  d29, q11, #1
    vld1.8        {d7}, [r1], r4        @ load row3, src2
    vqrshrun.s16  d30, q12, #1
    vst1.8        {d29}, [r2], r5       @ store dest row1
    vaddl.u8      q13, d3, d7           @ row3
    vst1.8        {d30}, [r2], r5       @ store dest row2
    vqrshrun.s16  d31, q13, #1
    cmp           r6, #4
    vst1.8        {d31}, [r2], r5       @ store dest row3
    beq           end_func              @ done if height == 4

    @ rows 4-7
    vld1.8        {d12}, [r1], r4       @ load row4, src2
    vld1.8        {d8}, [r0], r3        @ load row4, src1
    vld1.8        {d9}, [r0], r3        @ load row5, src1
    vaddl.u8      q8, d8, d12           @ row4
    vld1.8        {d13}, [r1], r4       @ load row5, src2
    vld1.8        {d10}, [r0], r3       @ load row6, src1
    vaddl.u8      q9, d9, d13           @ row5
    vld1.8        {d14}, [r1], r4       @ load row6, src2
    vqrshrun.s16  d28, q8, #1
    vld1.8        {d11}, [r0], r3       @ load row7, src1
    vqrshrun.s16  d29, q9, #1
    vst1.8        {d28}, [r2], r5       @ store dest row4
    vaddl.u8      q10, d10, d14         @ row6
    vst1.8        {d29}, [r2], r5       @ store dest row5
    vqrshrun.s16  d30, q10, #1
    vld1.8        {d15}, [r1], r4       @ load row7, src2
    vaddl.u8      q11, d11, d15         @ row7
    vst1.8        {d30}, [r2], r5       @ store dest row6
    vqrshrun.s16  d31, q11, #1
    cmp           r6, #8
    vst1.8        {d31}, [r2], r5       @ store dest row7
    beq           end_func              @ done if height == 8

    @ rows 8-15
    vld1.8        {d0}, [r0], r3        @ load row8, src1
    vld1.8        {d4}, [r1], r4        @ load row8, src2
    vld1.8        {d1}, [r0], r3        @ load row9, src1
    vaddl.u8      q10, d0, d4           @ row8
    vld1.8        {d5}, [r1], r4        @ load row9, src2
    vld1.8        {d2}, [r0], r3        @ load row10, src1
    vaddl.u8      q11, d1, d5           @ row9
    vld1.8        {d6}, [r1], r4        @ load row10, src2
    vqrshrun.s16  d28, q10, #1
    vld1.8        {d3}, [r0], r3        @ load row11, src1
    vaddl.u8      q12, d2, d6           @ row10
    vld1.8        {d7}, [r1], r4        @ load row11, src2
    vqrshrun.s16  d29, q11, #1
    vld1.8        {d8}, [r0], r3        @ load row12, src1
    vaddl.u8      q13, d3, d7           @ row11
    vst1.8        {d28}, [r2], r5       @ store dest row8
    vqrshrun.s16  d30, q12, #1
    vld1.8        {d12}, [r1], r4       @ load row12, src2
    vqrshrun.s16  d31, q13, #1
    vst1.8        {d29}, [r2], r5       @ store dest row9
    vaddl.u8      q8, d8, d12           @ row12
    vld1.8        {d9}, [r0], r3        @ load row13, src1
    vqrshrun.s16  d28, q8, #1
    vld1.8        {d13}, [r1], r4       @ load row13, src2
    vld1.8        {d10}, [r0], r3       @ load row14, src1
    vaddl.u8      q9, d9, d13           @ row13
    vld1.8        {d11}, [r0], r3       @ load row15, src1
    vld1.8        {d14}, [r1], r4       @ load row14, src2
    vqrshrun.s16  d29, q9, #1
    vld1.8        {d15}, [r1], r4       @ load row15, src2
    vaddl.u8      q10, d10, d14         @ row14
    vst1.8        {d30}, [r2], r5       @ store dest row10
    vaddl.u8      q11, d11, d15         @ row15
    vst1.8        {d31}, [r2], r5       @ store dest row11
    vqrshrun.s16  d30, q10, #1
    vst1.8        {d28}, [r2], r5       @ store dest row12
    vqrshrun.s16  d31, q11, #1
    vst1.8        {d29}, [r2], r5       @ store dest row13
    vst1.8        {d30}, [r2], r5       @ store dest row14
    vst1.8        {d31}, [r2], r5       @ store dest row15

    b             end_func

@ ---------------------------------------------------------------------------
@ width 4: only lane 0 (4 bytes) of each d register is loaded and stored.
@ The arithmetic runs on whole d registers; the other lanes are never stored.
@ ---------------------------------------------------------------------------
loop_4:

    @ rows 0-3
    vld1.32       d0[0], [r0], r3       @ load row0, src1
    vld1.32       d4[0], [r1], r4       @ load row0, src2
    vld1.32       d1[0], [r0], r3       @ load row1, src1
    vaddl.u8      q10, d0, d4           @ row0 : src1 + src2
    vld1.32       d5[0], [r1], r4       @ load row1, src2
    vld1.32       d2[0], [r0], r3       @ load row2, src1
    vqrshrun.s16  d28, q10, #1          @ (sum + 1) >> 1, narrowed to 8 bit
    vld1.32       d6[0], [r1], r4       @ load row2, src2
    vaddl.u8      q11, d1, d5           @ row1
    vld1.32       d3[0], [r0], r3       @ load row3, src1
    vaddl.u8      q12, d2, d6           @ row2
    vst1.32       d28[0], [r2], r5      @ store dest row0
    vqrshrun.s16  d29, q11, #1
    vld1.32       d7[0], [r1], r4       @ load row3, src2
    vqrshrun.s16  d30, q12, #1
    vst1.32       d29[0], [r2], r5      @ store dest row1
    vaddl.u8      q13, d3, d7           @ row3
    vst1.32       d30[0], [r2], r5      @ store dest row2
    vqrshrun.s16  d31, q13, #1
    cmp           r6, #4
    vst1.32       d31[0], [r2], r5      @ store dest row3
    beq           end_func              @ done if height == 4

    @ rows 4-7
    vld1.32       d12[0], [r1], r4      @ load row4, src2
    vld1.32       d8[0], [r0], r3       @ load row4, src1
    vld1.32       d9[0], [r0], r3       @ load row5, src1
    vaddl.u8      q8, d8, d12           @ row4
    vld1.32       d13[0], [r1], r4      @ load row5, src2
    vld1.32       d10[0], [r0], r3      @ load row6, src1
    vaddl.u8      q9, d9, d13           @ row5
    vld1.32       d14[0], [r1], r4      @ load row6, src2
    vqrshrun.s16  d28, q8, #1
    vld1.32       d11[0], [r0], r3      @ load row7, src1
    vqrshrun.s16  d29, q9, #1
    vst1.32       d28[0], [r2], r5      @ store dest row4
    vaddl.u8      q10, d10, d14         @ row6
    vst1.32       d29[0], [r2], r5      @ store dest row5
    vqrshrun.s16  d30, q10, #1
    vld1.32       d15[0], [r1], r4      @ load row7, src2
    vaddl.u8      q11, d11, d15         @ row7
    vst1.32       d30[0], [r2], r5      @ store dest row6
    vqrshrun.s16  d31, q11, #1
    vst1.32       d31[0], [r2], r5      @ store dest row7

end_func:

    vldmia        sp!, {d8-d15}         @ restore NEON registers d8-d15
    ldmfd         sp!, {r4-r12, pc}     @ restore core registers and return