@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@******************************************************************************
@* @file
@*  ih264_inter_pred_luma_horz_qpel_a9q.s
@*
@* @brief
@*  Contains function definitions for inter prediction horizontal quarter pel
@*  interpolation.
@*
@* @author
@*  Mohit
@*
@* @par List of Functions:
@*
@*  - ih264_inter_pred_luma_horz_qpel_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*

@* All the functions here are replicated from ih264_inter_pred_filters.c
@

@**
@**
@*******************************************************************************
@*
@* @brief
@*  Quarter pel interprediction luma filter for horizontal input
@*
@* @par Description:
@*  Applies a 6 tap horizontal filter. The output is clipped to 8 bits and then
@*  averaged (with rounding) against a full-pel source sample to give the
@*  quarter-pel value.
@*  sec 8.4.2.2.1 titled "Luma sample interpolation process"
@*
@*  For every output pixel x of a row, with a[k] = pu1_src[x - 2 + k]:
@*      half = clip_u8((a0 - 5*a1 + 20*a2 + 20*a3 - 5*a4 + a5 + 16) >> 5)
@*      dst  = (half + pu1_src[x + ((dydx & 3) >> 1)] + 1) >> 1
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] ht
@*  integer height of the array. Rows are produced two at a time, so ht is
@*  expected to be positive and even; ht <= 0 writes nothing, and an odd ht
@*  produces one row more than requested.
@*
@* @param[in] wd
@*  integer width of the array. 4 and 8 select dedicated loops; every other
@*  value is handled by the 16-wide loop.
@*
@* @param[in] pu1_tmp: temporary buffer: UNUSED in this function
@*
@* @param[in] dydx: x and y reference offset for qpel calculations.
@*  Only the two least significant bits (the x offset) are used here.
@*
@* @returns
@*  None
@*
@* @remarks
@*  Each row reads more source bytes than wd + 5:
@*   - wd == 16 path: 24 bytes from pu1_src - 2, 16 bytes from the averaging pointer
@*   - wd == 8 and wd == 4 paths: 16 bytes from pu1_src - 2, 8 bytes from the
@*     averaging pointer
@*
@*******************************************************************************
@*

@void ih264_inter_pred_luma_horz_qpel_a9q (
@                            UWORD8 *pu1_src,
@                            UWORD8 *pu1_dst,
@                            WORD32 src_strd,
@                            WORD32 dst_strd,
@                            WORD32 ht,
@                            WORD32 wd,
@                            UWORD8* pu1_tmp,
@                            UWORD32 dydx)

@**************Variables Vs Registers*****************************************
@   r0 => *pu1_src (rewound by 2 so that it points at a0 of the first pixel)
@   r1 => *pu1_dst
@   r2 => src_strd
@   r3 => dst_strd
@   r5 => ht (remaining rows)
@   r6 => wd
@   r7 => dydx on load, then pu1_src + (x_offset >> 1): full-pel averaging source
@   d0 => filter coefficient 5, d1 => filter coefficient 20

@ Stack layout after the prologue: 10 core registers (40 bytes) followed by
@ d8-d15 (64 bytes) sit between sp and the first stack-passed argument.
.equ SAVE_AREA_SIZE, (10 * 4 + 8 * 8)
.equ ARG_HT,         (SAVE_AREA_SIZE + 0)
.equ ARG_WD,         (SAVE_AREA_SIZE + 4)
.equ ARG_DYDX,       (SAVE_AREA_SIZE + 12)  @ pu1_tmp at +8 is never read

.text
.p2align 2


    .global ih264_inter_pred_luma_horz_qpel_a9q

ih264_inter_pred_luma_horz_qpel_a9q:

    stmfd         sp!, {r4-r12, r14}        @ store register values to stack
    vstmdb        sp!, {d8-d15}             @ push neon registers to stack (q4-q7 are used below)
    ldr           r5, [sp, #ARG_HT]         @ loads ht
    ldr           r6, [sp, #ARG_WD]         @ loads wd
    ldr           r7, [sp, #ARG_DYDX]       @ loads dydx
    cmp           r5, #0                    @ nothing to do for a non-positive height
    ble           end_func
    and           r7, r7, #3                @ x_offset = dydx & 3
    add           r7, r0, r7, lsr #1        @ averaging source = pu1_src + (x_offset >> 1)
    sub           r0, r0, #2                @ pu1_src - 2: first tap of the 6 tap filter
    vmov.i8       d0, #5                    @ filter coeff
    vmov.i8       d1, #20                   @ filter coeff

    cmp           r6, #8                    @ if wd == 8 branch to loop_8
    beq           loop_8

    cmp           r6, #4                    @ if wd == 4 branch to loop_4
    beq           loop_4

loop_16:                                    @ when wd == 16
    @ Processing row0 and row1 of this iteration.
    @ column1 = output pixels 0-7 (q4 / q7), column2 = output pixels 8-15 (q5 / q8)
    vld1.8        {d2, d3, d4}, [r0], r2    @ load row0
    vext.8        d31, d2, d3, #5           @ extract a[5]                         (column1,row0)
    vld1.8        {d5, d6, d7}, [r0], r2    @ load row1
    vext.8        d30, d3, d4, #5           @ extract a[5]                         (column2,row0)
    vaddl.u8      q4, d31, d2               @ a0 + a5                              (column1,row0)
    vext.8        d28, d5, d6, #5           @ extract a[5]                         (column1,row1)
    vaddl.u8      q5, d30, d3               @ a0 + a5                              (column2,row0)
    vext.8        d27, d6, d7, #5           @ extract a[5]                         (column2,row1)
    vaddl.u8      q7, d28, d5               @ a0 + a5                              (column1,row1)
    vext.8        d31, d2, d3, #2           @ extract a[2]                         (column1,row0)
    vaddl.u8      q8, d27, d6               @ a0 + a5                              (column2,row1)
    vext.8        d30, d3, d4, #2           @ extract a[2]                         (column2,row0)
    vmlal.u8      q4, d31, d1               @ a0 + a5 + 20a2                       (column1,row0)
    vext.8        d28, d5, d6, #2           @ extract a[2]                         (column1,row1)
    vmlal.u8      q5, d30, d1               @ a0 + a5 + 20a2                       (column2,row0)
    vext.8        d27, d6, d7, #2           @ extract a[2]                         (column2,row1)
    vmlal.u8      q7, d28, d1               @ a0 + a5 + 20a2                       (column1,row1)
    vext.8        d31, d2, d3, #3           @ extract a[3]                         (column1,row0)
    vmlal.u8      q8, d27, d1               @ a0 + a5 + 20a2                       (column2,row1)
    vext.8        d30, d3, d4, #3           @ extract a[3]                         (column2,row0)
    vmlal.u8      q4, d31, d1               @ a0 + a5 + 20a2 + 20a3                (column1,row0)
    vext.8        d28, d5, d6, #3           @ extract a[3]                         (column1,row1)
    vmlal.u8      q5, d30, d1               @ a0 + a5 + 20a2 + 20a3                (column2,row0)
    vext.8        d27, d6, d7, #3           @ extract a[3]                         (column2,row1)
    vmlal.u8      q7, d28, d1               @ a0 + a5 + 20a2 + 20a3                (column1,row1)
    vext.8        d31, d2, d3, #1           @ extract a[1]                         (column1,row0)
    vmlal.u8      q8, d27, d1               @ a0 + a5 + 20a2 + 20a3                (column2,row1)
    vext.8        d30, d3, d4, #1           @ extract a[1]                         (column2,row0)
    vmlsl.u8      q4, d31, d0               @ a0 + a5 + 20a2 + 20a3 - 5a1          (column1,row0)
    vext.8        d28, d5, d6, #1           @ extract a[1]                         (column1,row1)
    vmlsl.u8      q5, d30, d0               @ a0 + a5 + 20a2 + 20a3 - 5a1          (column2,row0)
    vext.8        d27, d6, d7, #1           @ extract a[1]                         (column2,row1)
    vmlsl.u8      q7, d28, d0               @ a0 + a5 + 20a2 + 20a3 - 5a1          (column1,row1)
    vext.8        d31, d2, d3, #4           @ extract a[4]                         (column1,row0)
    vmlsl.u8      q8, d27, d0               @ a0 + a5 + 20a2 + 20a3 - 5a1          (column2,row1)
    vext.8        d30, d3, d4, #4           @ extract a[4]                         (column2,row0)
    vmlsl.u8      q4, d31, d0               @ a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column1,row0)
    vext.8        d28, d5, d6, #4           @ extract a[4]                         (column1,row1)
    vmlsl.u8      q5, d30, d0               @ a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column2,row0)
    vext.8        d27, d6, d7, #4           @ extract a[4]                         (column2,row1)
    vmlsl.u8      q7, d28, d0               @ a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column1,row1)
    vmlsl.u8      q8, d27, d0               @ a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column2,row1)
    vld1.32       {d12, d13}, [r7], r2      @ load full-pel values for interpolation (row0)
    vqrshrun.s16  d20, q4, #5               @ clip_u8((sum + 16) >> 5)             (column1,row0)
    vqrshrun.s16  d21, q5, #5               @ clip_u8((sum + 16) >> 5)             (column2,row0)
    vrhadd.u8     q10, q6, q10              @ interpolation step for qpel calculation (row0)
    vqrshrun.s16  d18, q7, #5               @ clip_u8((sum + 16) >> 5)             (column1,row1)
    vst1.8        {d20, d21}, [r1], r3      @ store dest row0
    vqrshrun.s16  d19, q8, #5               @ clip_u8((sum + 16) >> 5)             (column2,row1)
    vld1.32       {d12, d13}, [r7], r2      @ load full-pel values for interpolation (row1)
    vrhadd.u8     q9, q6, q9                @ interpolation step for qpel calculation (row1)
    vst1.8        {d18, d19}, [r1], r3      @ store dest row1
    subs          r5, r5, #2                @ 2 rows done, decrement by 2

    bgt           loop_16                   @ keep going while rows remain
    b             end_func

loop_8:                                     @ when wd == 8
    @ Processing row0 (d5:d6 -> q7 -> d18) and row1 (d2:d3 -> q4 -> d19)

    vld1.8        {d5, d6}, [r0], r2        @ load row0
    vext.8        d28, d5, d6, #5           @ extract a[5]                         (row0)
    vld1.8        {d2, d3}, [r0], r2        @ load row1
    vext.8        d25, d5, d6, #2           @ extract a[2]                         (row0)
    vext.8        d31, d2, d3, #5           @ extract a[5]                         (row1)
    vext.8        d24, d5, d6, #3           @ extract a[3]                         (row0)
    vext.8        d23, d5, d6, #1           @ extract a[1]                         (row0)
    vext.8        d22, d5, d6, #4           @ extract a[4]                         (row0)
    vaddl.u8      q7, d28, d5               @ a0 + a5                              (row0)
    vext.8        d29, d2, d3, #3           @ extract a[3]                         (row1)
    vmlal.u8      q7, d25, d1               @ a0 + a5 + 20a2                       (row0)
    vmlal.u8      q7, d24, d1               @ a0 + a5 + 20a2 + 20a3                (row0)
    vmlsl.u8      q7, d23, d0               @ a0 + a5 + 20a2 + 20a3 - 5a1          (row0)
    vmlsl.u8      q7, d22, d0               @ a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row0)
    vext.8        d30, d2, d3, #2           @ extract a[2]                         (row1)
    vaddl.u8      q4, d31, d2               @ a0 + a5                              (row1)
    vext.8        d27, d2, d3, #1           @ extract a[1]                         (row1)
    vext.8        d26, d2, d3, #4           @ extract a[4]                         (row1)
    vmlal.u8      q4, d29, d1               @ a0 + a5 + 20a3                       (row1)
    vmlal.u8      q4, d30, d1               @ a0 + a5 + 20a3 + 20a2                (row1)
    vmlsl.u8      q4, d27, d0               @ a0 + a5 + 20a2 + 20a3 - 5a1          (row1)
    vmlsl.u8      q4, d26, d0               @ a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row1)
    vqrshrun.s16  d18, q7, #5               @ clip_u8((sum + 16) >> 5)             (row0)
    vld1.32       d12, [r7], r2             @ load full-pel values for interpolation (row0)
    vld1.32       d13, [r7], r2             @ load full-pel values for interpolation (row1)
    vqrshrun.s16  d19, q4, #5               @ clip_u8((sum + 16) >> 5)             (row1)
    vrhadd.u8     q9, q6, q9                @ interpolation step for qpel calculation (both rows)
    vst1.8        {d18}, [r1], r3           @ store dest row0
    vst1.8        {d19}, [r1], r3           @ store dest row1
    subs          r5, r5, #2                @ 2 rows done, decrement by 2

    bgt           loop_8                    @ keep going while rows remain
    b             end_func

loop_4:                                     @ when wd == 4
    @ Same computation as loop_8; only the low 4 bytes of each result are stored
    vld1.8        {d5, d6}, [r0], r2        @ load row0
    vext.8        d28, d5, d6, #5           @ extract a[5]                         (row0)
    vld1.8        {d2, d3}, [r0], r2        @ load row1
    vext.8        d25, d5, d6, #2           @ extract a[2]                         (row0)
    vext.8        d31, d2, d3, #5           @ extract a[5]                         (row1)
    vaddl.u8      q7, d28, d5               @ a0 + a5                              (row0)
    vext.8        d24, d5, d6, #3           @ extract a[3]                         (row0)
    vext.8        d23, d5, d6, #1           @ extract a[1]                         (row0)
    vext.8        d22, d5, d6, #4           @ extract a[4]                         (row0)
    vext.8        d29, d2, d3, #3           @ extract a[3]                         (row1)
    vmlal.u8      q7, d25, d1               @ a0 + a5 + 20a2                       (row0)
    vmlal.u8      q7, d24, d1               @ a0 + a5 + 20a2 + 20a3                (row0)
    vmlsl.u8      q7, d23, d0               @ a0 + a5 + 20a2 + 20a3 - 5a1          (row0)
    vmlsl.u8      q7, d22, d0               @ a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row0)
    vaddl.u8      q4, d31, d2               @ a0 + a5                              (row1)
    vext.8        d30, d2, d3, #2           @ extract a[2]                         (row1)
    vld1.32       d12, [r7], r2             @ load full-pel values for interpolation (row0)
    vld1.32       d13, [r7], r2             @ load full-pel values for interpolation (row1)
    vext.8        d27, d2, d3, #1           @ extract a[1]                         (row1)
    vext.8        d26, d2, d3, #4           @ extract a[4]                         (row1)
    vmlal.u8      q4, d29, d1               @ a0 + a5 + 20a3                       (row1)
    vmlal.u8      q4, d30, d1               @ a0 + a5 + 20a3 + 20a2                (row1)
    vmlsl.u8      q4, d27, d0               @ a0 + a5 + 20a2 + 20a3 - 5a1          (row1)
    vmlsl.u8      q4, d26, d0               @ a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (row1)
    vqrshrun.s16  d18, q7, #5               @ clip_u8((sum + 16) >> 5)             (row0)
    vqrshrun.s16  d19, q4, #5               @ clip_u8((sum + 16) >> 5)             (row1)
    vrhadd.u8     q9, q6, q9                @ interpolation step for qpel calculation (both rows)
    vst1.32       d18[0], [r1], r3          @ store dest row0 (4 pixels)
    vst1.32       d19[0], [r1], r3          @ store dest row1 (4 pixels)

    subs          r5, r5, #2                @ 2 rows done, decrement by 2
    bgt           loop_4                    @ keep going while rows remain; else fall into end_func

end_func:
    vldmia        sp!, {d8-d15}             @ restore neon registers that were saved
    ldmfd         sp!, {r4-r12, pc}         @ restore core registers and return

