@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@******************************************************************************
@* @file
@*  ih264_inter_pred_luma_vert_a9q.s
@*
@* @brief
@*  Contains function definitions for inter prediction interpolation.
@*
@* @author
@*  Ittiam
@*
@* @par List of Functions:
@*
@*  - ih264_inter_pred_luma_vert_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*

@* All the functions here are replicated from ih264_inter_pred_filters.c
@

@**
@**
@**
@ *******************************************************************************
@ *
@ * @brief
@ *  Inter-prediction luma filter for vertical input
@ *
@ * @par Description:
@ *  Applies a 6 tap vertical filter. The output is clipped to 8 bits
@ *  sec 8.4.2.2.1 titled "Luma sample interpolation process"
@ *
@ *  The 6-tap filter coefficients are {1, -5, 20, 20, -5, 1}; each output
@ *  pixel is CLIP_U8((sum + 16) >> 5).  In the code below this appears as:
@ *    outer taps (coeff  1): plain vaddl.u8
@ *    centre taps (coeff 20): vmla.u16 with q11 = 20
@ *    inner taps (coeff -5): vmls.u16 with q12 = 5
@ *    round+shift+clip:      vqrshrun.s16 #5
@ *
@ * @param[in] pu1_src
@ *  UWORD8 pointer to the source
@ *
@ * @param[out] pu1_dst
@ *  UWORD8 pointer to the destination
@ *
@ * @param[in] src_strd
@ *  integer source stride
@ *
@ * @param[in] dst_strd
@ *  integer destination stride
@ *
@ * @param[in] ht
@ *  integer height of the array
@ *
@ * @param[in] wd
@ *  integer width of the array
@ *
@ * @returns
@ *
@ * @remarks
@ *  None
@ *
@ *******************************************************************************

@void ih264_inter_pred_luma_vert (
@            UWORD8 *pu1_src,
@            UWORD8 *pu1_dst,
@            WORD32 src_strd,
@            WORD32 dst_strd,
@            WORD32 ht,
@            WORD32 wd )

@**************Variables Vs Registers*****************************************
@   r0 => *pu1_src
@   r1 => *pu1_dst
@   r2 =>  src_strd
@   r3 =>  dst_strd
@   r5 =>  ht
@   r6 =>  wd

.text
.p2align 2


.global ih264_inter_pred_luma_vert_a9q

ih264_inter_pred_luma_vert_a9q:

    stmfd         sp!, {r4-r12, r14}  @store register values to stack (10 regs = 40 bytes)
    vstmdb        sp!, {d8-d15}       @push callee-saved NEON registers (8 regs = 64 bytes)
    ldr           r5, [sp, #104]      @Loads ht  (5th arg: 40 + 64 = 104 bytes above sp)
    sub           r0, r0, r2, lsl #1  @pu1_src-2*src_strd: filter needs 2 rows above the block
    ldr           r6, [sp, #108]      @Loads wd  (6th arg, next stack slot)
    vmov.u16      q11, #20            @ Filter coeff 0x14 into Q11 (centre taps)

    subs          r12, r6, #8         @if wd=8 branch to loop_8
    vmov.u16      q12, #5             @ Filter coeff 0x5 into Q12 (inner taps, subtracted)
    beq           loop_8

    subs          r12, r6, #4         @if wd=4 branch to loop_4
    beq           loop_4

loop_16:                              @when wd=16: 4 output rows per iteration, 16 px wide
                                      @ q0-q5 hold source rows 0-5; q0/q1/q2 are re-used
                                      @ for rows 6/7/8 as they are loaded mid-pipeline.

    vld1.u32      {q0}, [r0], r2      @ Vector load from src[0_0]
    vld1.u32      {q1}, [r0], r2      @ Vector load from src[1_0]
    vld1.u32      {q2}, [r0], r2      @ Vector load from src[2_0]
    vld1.u32      {q3}, [r0], r2      @ Vector load from src[3_0]
    vld1.u32      {q4}, [r0], r2      @ Vector load from src[4_0]
    vaddl.u8      q6, d4, d6          @ temp1 = src[2_0] + src[3_0]  (row0 centre, low 8)
    vld1.u32      {q5}, [r0], r2      @ Vector load from src[5_0]

    vaddl.u8      q7, d0, d10         @ temp = src[0_0] + src[5_0]   (row0 outer, low 8)
    vaddl.u8      q8, d2, d8          @ temp2 = src[1_0] + src[4_0]  (row0 inner, low 8)
    vmla.u16      q7, q6, q11         @ temp += temp1 * 20
    vaddl.u8      q10, d1, d11        @ temp4 = src[0_8] + src[5_8]  (row0 outer, high 8)
    vaddl.u8      q9, d5, d7          @ temp3 = src[2_8] + src[3_8]  (row0 centre, high 8)
    vmla.u16      q10, q9, q11        @ temp4 += temp3 * 20
    vld1.u32      {q0}, [r0], r2      @ load src row 6 (overwrites row 0, no longer needed)
    vaddl.u8      q13, d3, d9         @ temp5 = src[1_8] + src[4_8]  (row0 inner, high 8)
    vaddl.u8      q6, d6, d8          @ row1 centre: src[3_0] + src[4_0]
    vmls.u16      q7, q8, q12         @ temp -= temp2 * 5
    vaddl.u8      q8, d2, d0          @ row1 outer: src[1_0] + src[6_0]
    vaddl.u8      q9, d4, d10         @ row1 inner: src[2_0] + src[5_0]
    vmla.u16      q8, q6, q11         @ row1 (low) += centre * 20
    vmls.u16      q10, q13, q12       @ temp4 -= temp5 * 5
    vaddl.u8      q13, d5, d11        @ row1 inner (high): src[2_8] + src[5_8]
    vaddl.u8      q6, d7, d9          @ row1 centre (high): src[3_8] + src[4_8]
    vqrshrun.s16  d30, q7, #5         @ dst[0_0] = CLIP_U8((temp +16) >> 5)
    vaddl.u8      q7, d3, d1          @ row1 outer (high): src[1_8] + src[6_8]
    vld1.u32      {q1}, [r0], r2      @ load src row 7 (overwrites row 1)
    vmla.u16      q7, q6, q11         @ row1 (high) += centre * 20
    vmls.u16      q8, q9, q12         @ row1 (low) -= inner * 5
    vqrshrun.s16  d31, q10, #5        @ dst[0_8] = CLIP_U8((temp4 +16) >> 5)
    vaddl.u8      q9, d4, d2          @ row2 outer: src[2_0] + src[7_0]
    vaddl.u8      q6, d8, d10         @ row2 centre: src[4_0] + src[5_0]

    vst1.u32      {q15}, [r1], r3     @ Vector store to dst[0_0] (row 0)
    vmla.u16      q9, q6, q11         @ row2 (low) += centre * 20
    vaddl.u8      q10, d6, d0         @ row2 inner: src[3_0] + src[6_0]
    vmls.u16      q7, q13, q12        @ row1 (high) -= inner * 5
    vqrshrun.s16  d30, q8, #5         @ dst[1_0]
    vaddl.u8      q6, d9, d11         @ row2 centre (high): src[4_8] + src[5_8]
    vaddl.u8      q8, d5, d3          @ row2 outer (high): src[2_8] + src[7_8]
    vaddl.u8      q13, d7, d1         @ row2 inner (high): src[3_8] + src[6_8]
    vmla.u16      q8, q6, q11         @ row2 (high) += centre * 20
    vmls.u16      q9, q10, q12        @ row2 (low) -= inner * 5
    vld1.u32      {q2}, [r0], r2      @ load src row 8 (overwrites row 2)

    vqrshrun.s16  d31, q7, #5         @ dst[1_8]
    vaddl.u8      q6, d10, d0         @ row3 centre: src[5_0] + src[6_0]
    vaddl.u8      q7, d6, d4          @ row3 outer: src[3_0] + src[8_0]
    vaddl.u8      q10, d8, d2         @ row3 inner: src[4_0] + src[7_0]
    vmla.u16      q7, q6, q11         @ row3 (low) += centre * 20
    vmls.u16      q8, q13, q12        @ row2 (high) -= inner * 5

    vst1.u32      {q15}, [r1], r3     @store row 1
    vqrshrun.s16  d30, q9, #5         @ dst[2_0]
    vaddl.u8      q9, d7, d5          @ row3 outer (high): src[3_8] + src[8_8]
    vaddl.u8      q6, d11, d1         @ row3 centre (high): src[5_8] + src[6_8]
    vmla.u16      q9, q6, q11         @ row3 (high) += centre * 20
    vaddl.u8      q13, d9, d3         @ row3 inner (high): src[4_8] + src[7_8]
    vmls.u16      q7, q10, q12        @ row3 (low) -= inner * 5

    vqrshrun.s16  d31, q8, #5         @ dst[2_8]
    vmls.u16      q9, q13, q12        @ row3 (high) -= inner * 5
    vaddl.u8      q6, d0, d2          @ temp1 = src[2_0] + src[3_0]
    vst1.u32      {q15}, [r1], r3     @store row 2
    vaddl.u8      q8, d10, d4         @ temp2 = src[1_0] + src[4_0]
    vaddl.u8      q10, d9, d7         @ temp4 = src[0_8] + src[5_8]
    vqrshrun.s16  d30, q7, #5         @ dst[3_0]
    vaddl.u8      q13, d5, d11        @ temp5 = src[1_8] + src[4_8]
    vaddl.u8      q7, d8, d6          @ temp = src[0_0] + src[5_0]
    vqrshrun.s16  d31, q9, #5         @ dst[3_8]
    vmla.u16      q7, q6, q11         @ temp += temp1 * 20
                                      @ NOTE(review): the adds/vmla interleaved above look
                                      @ like speculative work for the next 4-row group, but
                                      @ the branch below restarts loop_16 with fresh loads,
                                      @ so those results appear to be discarded — confirm.
    vaddl.u8      q9, d1, d3          @ temp3 = src[2_8] + src[3_8]
    vst1.u32      {q15}, [r1], r3     @store row 3
    subs          r5, r5, #4          @ 4 rows processed, decrement by 4
    subne         r0, r0 , r2, lsl #2 @ rewind src by 5 rows (9 loaded - 5 = net 4 advance)
    subne         r0, r0, r2
    beq           end_func            @ Branch if height==4

    b             loop_16             @ looping if height = 8 or 16

loop_8:                               @when wd=8: 4 output rows per iteration, 8 px wide
    @ Processing row0 and row1

    vld1.u32      d0, [r0], r2        @ Vector load from src[0_0]
    vld1.u32      d1, [r0], r2        @ Vector load from src[1_0]
    vld1.u32      d2, [r0], r2        @ Vector load from src[2_0]
    vld1.u32      d3, [r0], r2        @ Vector load from src[3_0]
    vld1.u32      d4, [r0], r2        @ Vector load from src[4_0]
    vld1.u32      d5, [r0], r2        @ Vector load from src[5_0]

    vaddl.u8      q3, d2, d3          @ temp1 = src[2_0] + src[3_0]  (row0 centre)
    vaddl.u8      q4, d0, d5          @ temp = src[0_0] + src[5_0]   (row0 outer)
    vaddl.u8      q5, d1, d4          @ temp2 = src[1_0] + src[4_0]  (row0 inner)
    vmla.u16      q4, q3, q11         @ temp += temp1 * 20
    vld1.u32      d6, [r0], r2        @ load src row 6
    vaddl.u8      q7, d3, d4          @ row1 centre: src[3_0] + src[4_0]
    vaddl.u8      q8, d1, d6          @ row1 outer: src[1_0] + src[6_0]
    vaddl.u8      q9, d2, d5          @ row1 inner: src[2_0] + src[5_0]
    vmls.u16      q4, q5, q12         @ temp -= temp2 * 5
    vmla.u16      q8, q7, q11         @ row1 += centre * 20
    vld1.u32      d7, [r0], r2        @ load src row 7
    vaddl.u8      q10, d4, d5         @ row2 centre: src[4_0] + src[5_0]
    vaddl.u8      q6, d2, d7          @ row2 outer: src[2_0] + src[7_0]
    vaddl.u8      q5, d3, d6          @ row2 inner: src[3_0] + src[6_0]
    vmls.u16      q8, q9, q12         @ row1 -= inner * 5
    vqrshrun.s16  d26, q4, #5         @ dst[0_0] = CLIP_U8( (temp + 16) >> 5)
    vmla.u16      q6, q10, q11        @ row2 += centre * 20
    vld1.u32      d0, [r0], r2        @ load src row 8 (overwrites row 0)
    vaddl.u8      q7, d5, d6          @ row3 centre: src[5_0] + src[6_0]
    vqrshrun.s16  d27, q8, #5         @ dst[1_0]
    vaddl.u8      q10, d3, d0         @ row3 outer: src[3_0] + src[8_0]
    vmls.u16      q6, q5, q12         @ row2 -= inner * 5
    vst1.u32      d26, [r1], r3       @ Vector store to dst[0_0] (row 0)
    vaddl.u8      q9, d4, d7          @ row3 inner: src[4_0] + src[7_0]
    vmla.u16      q10, q7, q11        @ row3 += centre * 20
    vst1.u32      d27, [r1], r3       @store row 1
    vqrshrun.s16  d28, q6, #5         @ dst[2_0]
    vst1.u32      d28, [r1], r3       @store row 2
    vmls.u16      q10, q9, q12        @ row3 -= inner * 5
    vqrshrun.s16  d29, q10, #5        @ dst[3_0]
    vst1.u32      d29, [r1], r3       @store row 3

    subs          r5, r5, #4          @ 4 rows processed, decrement by 4
    subne         r0, r0 , r2, lsl #2 @ rewind src by 5 rows (9 loaded - 5 = net 4 advance)
    subne         r0, r0, r2
    beq           end_func            @ Branch if height==4

    b             loop_8              @looping if height == 8 or 16

loop_4:                               @when wd=4: 4 output rows per iteration, one 32-bit
                                      @ lane per row.  Assumes ht is 4 or 8 (see the
                                      @ subs #8 test below) — TODO confirm against callers.
    @ Processing row0 and row1

    vld1.u32      d0[0], [r0], r2     @ Vector load from src[0_0]
    vld1.u32      d1[0], [r0], r2     @ Vector load from src[1_0]
    vld1.u32      d2[0], [r0], r2     @ Vector load from src[2_0]
    vld1.u32      d3[0], [r0], r2     @ Vector load from src[3_0]
    vld1.u32      d4[0], [r0], r2     @ Vector load from src[4_0]
    vld1.u32      d5[0], [r0], r2     @ Vector load from src[5_0]

    vaddl.u8      q3, d2, d3          @ temp1 = src[2_0] + src[3_0]  (row0 centre)
    vaddl.u8      q4, d0, d5          @ temp = src[0_0] + src[5_0]   (row0 outer)
    vaddl.u8      q5, d1, d4          @ temp2 = src[1_0] + src[4_0]  (row0 inner)
    vmla.u16      q4, q3, q11         @ temp += temp1 * 20
    vld1.u32      d6[0], [r0], r2     @ load src row 6
    vaddl.u8      q7, d3, d4          @ row1 centre: src[3_0] + src[4_0]
    vaddl.u8      q8, d1, d6          @ row1 outer: src[1_0] + src[6_0]
    vaddl.u8      q9, d2, d5          @ row1 inner: src[2_0] + src[5_0]
    vmls.u16      q4, q5, q12         @ temp -= temp2 * 5
    vld1.u32      d7[0], [r0], r2     @ load src row 7
    vmla.u16      q8, q7, q11         @ row1 += centre * 20
    vaddl.u8      q10, d4, d5         @ row2 centre: src[4_0] + src[5_0]
    vaddl.u8      q6, d2, d7          @ row2 outer: src[2_0] + src[7_0]
    vaddl.u8      q5, d3, d6          @ row2 inner: src[3_0] + src[6_0]
    vmls.u16      q8, q9, q12         @ row1 -= inner * 5
    vqrshrun.s16  d26, q4, #5         @ dst[0_0] = CLIP_U8( (temp + 16) >> 5)
    vmla.u16      q6, q10, q11        @ row2 += centre * 20
    vld1.u32      d0[0], [r0], r2     @ load src row 8 (overwrites row 0)
    vaddl.u8      q7, d5, d6          @ row3 centre: src[5_0] + src[6_0]
    vqrshrun.s16  d27, q8, #5         @ dst[1_0]
    vaddl.u8      q10, d3, d0         @ row3 outer: src[3_0] + src[8_0]
    vmls.u16      q6, q5, q12         @ row2 -= inner * 5
    vst1.u32      d26[0], [r1], r3    @ Vector store to dst[0_0] (row 0)
    vaddl.u8      q9, d4, d7          @ row3 inner: src[4_0] + src[7_0]
    vmla.u16      q10, q7, q11        @ row3 += centre * 20
    vst1.u32      d27[0], [r1], r3    @store row 1
    vqrshrun.s16  d28, q6, #5         @ dst[2_0]
    vst1.u32      d28[0], [r1], r3    @store row 2
    vmls.u16      q10, q9, q12        @ row3 -= inner * 5
    vqrshrun.s16  d29, q10, #5        @ dst[3_0]
    vst1.u32      d29[0], [r1], r3    @store row 3

    subs          r5, r5, #8          @ r5 == 0 only when ht was 8: one more 4-row pass
    subeq         r0, r0, r2, lsl #2  @ rewind src by 5 rows before the second pass
    subeq         r0, r0, r2
    beq           loop_4              @ Loop if height==8

end_func:
    vldmia        sp!, {d8-d15}       @ Restore neon registers that were saved
    ldmfd         sp!, {r4-r12, pc}   @Restoring registers from stack