@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@******************************************************************************
@* @file
@*  ih264_inter_pred_luma_horz_a9q.s
@*
@* @brief
@*  Contains function definitions for inter prediction interpolation.
@*
@* @author
@*  Ittiam
@*
@* @par List of Functions:
@*
@*  - ih264_inter_pred_luma_horz_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*

@* All the functions here are replicated from ih264_inter_pred_filters.c
@

@**
@**
@*******************************************************************************
@*
@* @brief
@*  Inter prediction luma filter for horizontal input
@*
@* @par Description:
@*  Applies a 6-tap horizontal filter to every output pixel:
@*      out = clip8((a0 - 5*a1 + 20*a2 + 20*a3 - 5*a4 + a5 + 16) >> 5)
@*  where a0..a5 are six consecutive source bytes starting at pu1_src - 2.
@*  See sec 8.4.2.2.1 titled "Luma sample interpolation process".
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] ht
@*  integer height of the array. Rows are processed two at a time, so ht
@*  is expected to be even; ht <= 0 returns without touching pu1_dst.
@*
@* @param[in] wd
@*  integer width of the array. 8 and 4 select the narrow paths; any other
@*  value is handled as 16.
@*
@* @returns
@*  None
@*
@* @remarks
@*  Each row reads 24 source bytes (wd = 16) or 16 source bytes (wd = 8, 4)
@*  starting at pu1_src - 2.
@*
@*******************************************************************************
@*

@void ih264_inter_pred_luma_horz (
@                            UWORD8 *pu1_src,
@                            UWORD8 *pu1_dst,
@                            WORD32 src_strd,
@                            WORD32 dst_strd,
@                            WORD32 ht,
@                            WORD32 wd   )

@**************Variables Vs Registers*****************************************
@   r0 => *pu1_src   (rewound by 2 on entry, advanced by src_strd per row)
@   r1 => *pu1_dst   (advanced by dst_strd per row)
@   r2 => src_strd
@   r3 => dst_strd
@   r5 => ht         (5th argument, read from the stack; rows remaining)
@   r6 => wd         (6th argument, read from the stack)
@   d0 => 8 x 5      (filter coefficient for a1 and a4)
@   d1 => 8 x 20     (filter coefficient for a2 and a3)

.text
.p2align 2


    .global ih264_inter_pred_luma_horz_a9q

ih264_inter_pred_luma_horz_a9q:

    stmfd         sp!, {r4-r12, r14}    @ save core registers (10 words = 40 bytes)
    vstmdb        sp!, {d8-d15}         @ save callee-saved NEON registers (64 bytes)
    ldr           r5, [sp, #104]        @ ht: first stack argument, above the 104 saved bytes
    ldr           r6, [sp, #108]        @ wd: second stack argument
    cmp           r5, #0                @ nothing to do for ht <= 0; the loops below
    ble           end_func              @ always process at least two rows
    sub           r0, r0, #2            @ pu1_src - 2, so that a0 is at offset 0
    vmov.i8       d0, #5                @ filter coeff 5
    vmov.i8       d1, #20               @ filter coeff 20
    cmp           r6, #8                @ wd == 8 ?
    beq           loop_8
    cmp           r6, #4                @ wd == 4 ?
    beq           loop_4

loop_16:                                @ wd == 16 path; two rows per iteration
    @ "column1" is output pixels 0-7, "column2" is output pixels 8-15
    vld1.8        {d2, d3, d4}, [r0], r2 @ load row0 (24 bytes)
    vext.8        d31, d2, d3, #5       @ a5 (column1,row0)
    vld1.8        {d5, d6, d7}, [r0], r2 @ load row1 (24 bytes)
    vext.8        d30, d3, d4, #5       @ a5 (column2,row0)
    vaddl.u8      q4, d31, d2           @ a0 + a5 (column1,row0)
    vext.8        d28, d5, d6, #5       @ a5 (column1,row1)
    vaddl.u8      q5, d30, d3           @ a0 + a5 (column2,row0)
    vext.8        d27, d6, d7, #5       @ a5 (column2,row1)
    vaddl.u8      q7, d28, d5           @ a0 + a5 (column1,row1)
    vext.8        d31, d2, d3, #2       @ a2 (column1,row0)
    vaddl.u8      q8, d27, d6           @ a0 + a5 (column2,row1)
    vext.8        d30, d3, d4, #2       @ a2 (column2,row0)
    vmlal.u8      q4, d31, d1           @ a0 + a5 + 20a2 (column1,row0)
    vext.8        d28, d5, d6, #2       @ a2 (column1,row1)
    vmlal.u8      q5, d30, d1           @ a0 + a5 + 20a2 (column2,row0)
    vext.8        d27, d6, d7, #2       @ a2 (column2,row1)
    vmlal.u8      q7, d28, d1           @ a0 + a5 + 20a2 (column1,row1)
    vext.8        d31, d2, d3, #3       @ a3 (column1,row0)
    vmlal.u8      q8, d27, d1           @ a0 + a5 + 20a2 (column2,row1)
    vext.8        d30, d3, d4, #3       @ a3 (column2,row0)
    vmlal.u8      q4, d31, d1           @ a0 + a5 + 20a2 + 20a3 (column1,row0)
    vext.8        d28, d5, d6, #3       @ a3 (column1,row1)
    vmlal.u8      q5, d30, d1           @ a0 + a5 + 20a2 + 20a3 (column2,row0)
    vext.8        d27, d6, d7, #3       @ a3 (column2,row1)
    vmlal.u8      q7, d28, d1           @ a0 + a5 + 20a2 + 20a3 (column1,row1)
    vext.8        d31, d2, d3, #1       @ a1 (column1,row0)
    vmlal.u8      q8, d27, d1           @ a0 + a5 + 20a2 + 20a3 (column2,row1)
    vext.8        d30, d3, d4, #1       @ a1 (column2,row0)
    vmlsl.u8      q4, d31, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
    vext.8        d28, d5, d6, #1       @ a1 (column1,row1)
    vmlsl.u8      q5, d30, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
    vext.8        d27, d6, d7, #1       @ a1 (column2,row1)
    vmlsl.u8      q7, d28, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
    vext.8        d31, d2, d3, #4       @ a4 (column1,row0)
    vmlsl.u8      q8, d27, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row1)
    vext.8        d30, d3, d4, #4       @ a4 (column2,row0)
    vmlsl.u8      q4, d31, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
    vext.8        d28, d5, d6, #4       @ a4 (column1,row1)
    vmlsl.u8      q5, d30, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
    vext.8        d27, d6, d7, #4       @ a4 (column2,row1)
    vmlsl.u8      q7, d28, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
    vmlsl.u8      q8, d27, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row1)
    vqrshrun.s16  d20, q4, #5           @ (sum + 16) >> 5, saturated to u8 (column1,row0)
    vqrshrun.s16  d21, q5, #5           @ (sum + 16) >> 5, saturated to u8 (column2,row0)
    vst1.8        {d20, d21}, [r1], r3  @ store dest row0
    vqrshrun.s16  d23, q7, #5           @ (sum + 16) >> 5, saturated to u8 (column1,row1)
    vqrshrun.s16  d24, q8, #5           @ (sum + 16) >> 5, saturated to u8 (column2,row1)
    vst1.8        {d23, d24}, [r1], r3  @ store dest row1
    subs          r5, r5, #2            @ 2 rows done
    bgt           loop_16               @ keep going while rows remain
    b             end_func

loop_8:                                 @ wd == 8 path; two rows per iteration
    @ d5:d6 hold the first row of the pair (row0), d2:d3 the second (row1)
    vld1.8        {d5, d6}, [r0], r2    @ load row0 (16 bytes)
    vext.8        d28, d5, d6, #5       @ a5 (row0)
    vld1.8        {d2, d3}, [r0], r2    @ load row1 (16 bytes)
    vext.8        d25, d5, d6, #2       @ a2 (row0)
    vext.8        d31, d2, d3, #5       @ a5 (row1)
    vext.8        d24, d5, d6, #3       @ a3 (row0)
    vext.8        d23, d5, d6, #1       @ a1 (row0)
    vext.8        d22, d5, d6, #4       @ a4 (row0)
    vaddl.u8      q7, d28, d5           @ a0 + a5 (row0)
    vext.8        d29, d2, d3, #3       @ a3 (row1)
    vmlal.u8      q7, d25, d1           @ a0 + a5 + 20a2 (row0)
    vmlal.u8      q7, d24, d1           @ a0 + a5 + 20a2 + 20a3 (row0)
    vmlsl.u8      q7, d23, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1 (row0)
    vmlsl.u8      q7, d22, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (row0)
    vext.8        d30, d2, d3, #2       @ a2 (row1)
    vaddl.u8      q4, d31, d2           @ a0 + a5 (row1)
    vext.8        d27, d2, d3, #1       @ a1 (row1)
    vext.8        d26, d2, d3, #4       @ a4 (row1)
    vmlal.u8      q4, d29, d1           @ a0 + a5 + 20a3 (row1)
    vmlal.u8      q4, d30, d1           @ a0 + a5 + 20a3 + 20a2 (row1)
    vmlsl.u8      q4, d27, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1 (row1)
    vmlsl.u8      q4, d26, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (row1)
    vqrshrun.s16  d23, q7, #5           @ (sum + 16) >> 5, saturated to u8 (row0)
    vst1.8        {d23}, [r1], r3       @ store dest row0
    vqrshrun.s16  d20, q4, #5           @ (sum + 16) >> 5, saturated to u8 (row1)
    vst1.8        {d20}, [r1], r3       @ store dest row1
    subs          r5, r5, #2            @ 2 rows done
    bgt           loop_8                @ keep going while rows remain
    b             end_func

loop_4:                                 @ wd == 4 path; two rows per iteration
    @ Same arithmetic as loop_8 on 8 lanes; only the low 4 result bytes are stored
    vld1.8        {d5, d6}, [r0], r2    @ load row0 (16 bytes)
    vext.8        d28, d5, d6, #5       @ a5 (row0)
    vld1.8        {d2, d3}, [r0], r2    @ load row1 (16 bytes)
    vext.8        d25, d5, d6, #2       @ a2 (row0)
    vext.8        d31, d2, d3, #5       @ a5 (row1)
    vaddl.u8      q7, d28, d5           @ a0 + a5 (row0)
    vext.8        d24, d5, d6, #3       @ a3 (row0)
    vext.8        d23, d5, d6, #1       @ a1 (row0)
    vext.8        d22, d5, d6, #4       @ a4 (row0)
    vext.8        d29, d2, d3, #3       @ a3 (row1)
    vmlal.u8      q7, d25, d1           @ a0 + a5 + 20a2 (row0)
    vmlal.u8      q7, d24, d1           @ a0 + a5 + 20a2 + 20a3 (row0)
    vmlsl.u8      q7, d23, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1 (row0)
    vmlsl.u8      q7, d22, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (row0)
    vaddl.u8      q4, d31, d2           @ a0 + a5 (row1)
    vext.8        d30, d2, d3, #2       @ a2 (row1)
    vext.8        d27, d2, d3, #1       @ a1 (row1)
    vext.8        d26, d2, d3, #4       @ a4 (row1)
    vmlal.u8      q4, d29, d1           @ a0 + a5 + 20a3 (row1)
    vmlal.u8      q4, d30, d1           @ a0 + a5 + 20a3 + 20a2 (row1)
    vmlsl.u8      q4, d27, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1 (row1)
    vmlsl.u8      q4, d26, d0           @ a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (row1)
    vqrshrun.s16  d23, q7, #5           @ (sum + 16) >> 5, saturated to u8 (row0)
    vst1.32       {d23[0]}, [r1], r3    @ store 4 bytes of dest row0
    vqrshrun.s16  d20, q4, #5           @ (sum + 16) >> 5, saturated to u8 (row1)
    vst1.32       {d20[0]}, [r1], r3    @ store 4 bytes of dest row1
    subs          r5, r5, #2            @ 2 rows done
    bgt           loop_4                @ keep going while rows remain; else fall through

end_func:
    vldmia        sp!, {d8-d15}         @ restore NEON registers saved on entry
    ldmfd         sp!, {r4-r12, pc}     @ restore core registers and return

