1 @/****************************************************************************** 2 @ * 3 @ * Copyright (C) 2015 The Android Open Source Project 4 @ * 5 @ * Licensed under the Apache License, Version 2.0 (the "License"); 6 @ * you may not use this file except in compliance with the License. 7 @ * You may obtain a copy of the License at: 8 @ * 9 @ * http://www.apache.org/licenses/LICENSE-2.0 10 @ * 11 @ * Unless required by applicable law or agreed to in writing, software 12 @ * distributed under the License is distributed on an "AS IS" BASIS, 13 @ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 @ * See the License for the specific language governing permissions and 15 @ * limitations under the License. 16 @ * 17 @ ***************************************************************************** 18 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore 19 @*/ 20 @** 21 @****************************************************************************** 22 @* @file 23 @* ih264_inter_pred_chroma_a9q.s 24 @* 25 @* @brief 26 @* Contains function definitions for inter prediction interpolation. 
@*
@* @author
@*  Ittiam
@*
@* @par List of Functions:
@*
@*  - ih264_inter_pred_chroma_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*

@* All the functions here are replicated from ih264_inter_pred_filters.c
@

@**
@*******************************************************************************
@*
@* @brief
@*  Inter prediction chroma filter
@*
@* @par Description:
@*  Applies the filtering described in sec 8.4.2.2.2, "chroma sample
@*  interpolation process", to a block of chroma samples. Every output byte
@*  is a weighted sum of four source bytes followed by a rounding right
@*  shift by 6 and a narrowing to 8 bits:
@*
@*      dst[x] = ( src[x]            * (8 - dx) * (8 - dy)
@*               + src[x + 2]        *      dx  * (8 - dy)
@*               + src[x + strd]     * (8 - dx) *      dy
@*               + src[x + strd + 2] *      dx  *      dy   ) >>(round) 6
@*
@*  The "+ 2" byte offset selects the neighbouring sample of the same plane,
@*  because the source holds alternate U and V samples.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source containing alternate U and V samples
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] u1_dx
@*  dx value where the sample is to be produced (refer sec 8.4.2.2.2)
@*
@* @param[in] u1_dy
@*  dy value where the sample is to be produced (refer sec 8.4.2.2.2)
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array; wd == 2 and wd == 4 have dedicated paths,
@*  every other value is handled by the 8-wide path
@*
@* @returns
@*  None
@*
@* @remarks
@*  - ht + 1 source rows are read (the filter needs the row below each
@*    output row).
@*  - Bytes loaded per source row: 24 (8-wide path), 16 (4-wide path),
@*    8 (2-wide path). Bytes stored per destination row: 16, 8 and 4.
@*  - The row counters only reach zero for ht >= 2 on the 8-wide and 4-wide
@*    paths and for even ht on the 2-wide path.
@*    NOTE(review): callers are presumably expected to guarantee this -
@*    confirm against the C reference.
@*
@*******************************************************************************
@*

@void ih264_inter_pred_chroma(UWORD8 *pu1_src,
@                             UWORD8 *pu1_dst,
@                             WORD32 src_strd,
@                             WORD32 dst_strd,
@                             UWORD8 u1_dx,
@                             UWORD8 u1_dy,
@                             WORD32 ht,
@                             WORD32 wd)
@
@************** Variables Vs Registers *****************************************
@   r0        => pu1_src   (post-incremented by src_strd on row loads)
@   r1        => pu1_dst   (post-incremented by dst_strd on every row store)
@   r2        => src_strd
@   r3        => dst_strd
@   r4        => u1_dx     (loaded from the stack)
@   r5        => u1_dy     (loaded from the stack)
@   r6        => ht        (loaded from the stack, used as the row counter)
@   r7        => wd        (loaded from the stack)
@   r8, r9    => 8 - u1_dx, 8 - u1_dy
@   r10, r11  => scratch for the weight products
@
@   d28       => (8 - dx) * (8 - dy)   weight of the co-located sample
@   d29       =>      dx  * (8 - dy)   weight of the sample to the right
@   d30       => (8 - dx) *      dy    weight of the sample below
@   d31       =>      dx  *      dy    weight of the sample below-right
@

@ Offsets of the stack-passed arguments once the prologue has pushed
@ 10 core registers (40 bytes) and 8 D registers (64 bytes).
    .equ    STACK_ARG_BASE, 104
    .equ    ARG_DX,         STACK_ARG_BASE + 0
    .equ    ARG_DY,         STACK_ARG_BASE + 4
    .equ    ARG_HT,         STACK_ARG_BASE + 8
    .equ    ARG_WD,         STACK_ARG_BASE + 12

.text
.p2align 2

    .global ih264_inter_pred_chroma_a9q

ih264_inter_pred_chroma_a9q:

    push            {r4-r12, lr}            @ save core registers
    vpush           {d8-d15}                @ save NEON registers d8-d15
    ldr             r4, [sp, #ARG_DX]       @ r4 = u1_dx
    ldr             r5, [sp, #ARG_DY]       @ r5 = u1_dy
    ldr             r6, [sp, #ARG_HT]       @ r6 = ht
    ldr             r7, [sp, #ARG_WD]       @ r7 = wd

    @ Build the four filter weights and splat each one across a D register.
    rsb             r8, r4, #8              @ r8 = 8 - dx
    rsb             r9, r5, #8              @ r9 = 8 - dy
    mul             r10, r8, r9             @ (8 - dx) * (8 - dy)
    mul             r11, r4, r9             @      dx  * (8 - dy)

    vdup.u8         d28, r10
    vdup.u8         d29, r11

    mul             r10, r8, r5             @ (8 - dx) * dy
    mul             r11, r4, r5             @      dx  * dy

    vdup.u8         d30, r10
    vdup.u8         d31, r11

    @ Select the code path from the block width.
    cmp             r7, #2
    beq             .Lwd2_row_pair          @ wd == 2
    cmp             r7, #4
    beq             .Lwd4_setup             @ wd == 4
                                            @ anything else: 8-wide path

@-------------------------------------------------------------------------------
@ 8-wide path: 16 output bytes per row.
@   d0,d1 / d3,d4 : upper row and upper row advanced by 2 bytes
@   d5,d6 / d8,d9 : lower row and lower row advanced by 2 bytes
@   d2, d7        : bytes 16..23 of each row, only feed the vext shifts
@   q5, q6        : 16-bit sums for output bytes 0..7 and 8..15
@   d14,d15       : narrowed output row
@ The loop is software pipelined: each pass finishes and stores one row
@ while loading the next source row and starting the following sum.
@-------------------------------------------------------------------------------
.Lwd8_setup:
    sub             r6, r6, #1              @ last row is produced after the loop
    vld1.8          {d0, d1, d2}, [r0], r2  @ source row 0
    vld1.8          {d5, d6, d7}, [r0], r2  @ source row 1
    vext.8          d3, d0, d1, #2          @ row 0, bytes 2..9
    vext.8          d8, d5, d6, #2          @ row 1, bytes 2..9

    vmull.u8        q5, d0, d28             @ start low-half sum of first row
    vmlal.u8        q5, d5, d30
    vmlal.u8        q5, d3, d29
    vmlal.u8        q5, d8, d31
    vext.8          d9, d6, d7, #2          @ row 1, bytes 10..17
    vext.8          d4, d1, d2, #2          @ row 0, bytes 10..17

.Lwd8_row_loop:
    vmull.u8        q6, d6, d30             @ high-half sum of the pending row
    vmlal.u8        q6, d1, d28
    vmlal.u8        q6, d9, d31
    vmlal.u8        q6, d4, d29
    vmov            d0, d5                  @ lower row becomes the upper row
    vmov            d3, d8

    vqrshrun.s16    d14, q5, #6             @ round, shift and narrow low half
    vmov            d1, d6
    vmov            d4, d9

    vld1.8          {d5, d6, d7}, [r0], r2  @ fetch the next lower row
    vqrshrun.s16    d15, q6, #6             @ round, shift and narrow high half

    vext.8          d8, d5, d6, #2
    subs            r6, r6, #1              @ flags are consumed by bne below
    vext.8          d9, d6, d7, #2
    vst1.8          {d14, d15}, [r1], r3    @ write one destination row

    vmull.u8        q5, d0, d28             @ start low-half sum of next row
    vmlal.u8        q5, d5, d30
    vmlal.u8        q5, d3, d29
    vmlal.u8        q5, d8, d31
    bne             .Lwd8_row_loop

    @ Drain the pipeline: finish and store the final row.
    vmull.u8        q6, d6, d30
    vmlal.u8        q6, d1, d28
    vmlal.u8        q6, d9, d31
    vmlal.u8        q6, d4, d29

    vqrshrun.s16    d14, q5, #6
    vqrshrun.s16    d15, q6, #6

    vst1.8          {d14, d15}, [r1], r3    @ write the last destination row

    b               .Lreturn

@-------------------------------------------------------------------------------
@ 4-wide path: 8 output bytes per row.
@   d0 / d1 : upper row and upper row advanced by 2 bytes
@   d2 / d3 : lower row and lower row advanced by 2 bytes
@   q2      : 16-bit sums, d6 : narrowed output row
@-------------------------------------------------------------------------------
.Lwd4_setup:
    sub             r6, r6, #1              @ last row is produced after the loop
    vld1.8          {d0, d1}, [r0], r2      @ source row 0
    vld1.8          {d2, d3}, [r0], r2      @ source row 1
    vext.8          d1, d0, d1, #2          @ row 0, bytes 2..9
    vext.8          d3, d2, d3, #2          @ row 1, bytes 2..9

    vmull.u8        q2, d2, d30             @ sum for the first row
    vmlal.u8        q2, d0, d28
    vmlal.u8        q2, d3, d31
    vmlal.u8        q2, d1, d29

.Lwd4_row_loop:
    subs            r6, r6, #1              @ flags are consumed by bne below
    vmov            d0, d2                  @ lower row becomes the upper row
    vmov            d1, d3

    vld1.8          {d2, d3}, [r0], r2      @ fetch the next lower row
    vqrshrun.s16    d6, q2, #6              @ round, shift and narrow

    vext.8          d3, d2, d3, #2
    vst1.8          {d6}, [r1], r3          @ write one destination row

    vmull.u8        q2, d0, d28             @ sum for the next row
    vmlal.u8        q2, d2, d30
    vmlal.u8        q2, d1, d29
    vmlal.u8        q2, d3, d31
    bne             .Lwd4_row_loop

    vqrshrun.s16    d6, q2, #6
    vst1.8          {d6}, [r1], r3          @ write the last destination row

    b               .Lreturn

@-------------------------------------------------------------------------------
@ 2-wide path: 4 output bytes per row, two rows per pass.
@ The third source row is loaded without advancing r0, so it is read again
@ as the first row of the next pass.
@-------------------------------------------------------------------------------
.Lwd2_row_pair:
    vld1.8          {d0}, [r0], r2          @ source row 0
    vext.8          d1, d0, d0, #2          @ row 0 advanced by 2 bytes
    vld1.8          {d2}, [r0], r2          @ source row 1
    vext.8          d3, d2, d2, #2          @ row 1 advanced by 2 bytes
    vmull.u8        q2, d0, d28             @ sum for output row 0
    vmlal.u8        q2, d1, d29
    vmlal.u8        q2, d2, d30
    vmlal.u8        q2, d3, d31
    vld1.8          {d6}, [r0]              @ source row 2, r0 not advanced
    vqrshrun.s16    d4, q2, #6
    vext.8          d7, d6, d6, #2          @ row 2 advanced by 2 bytes
    vst1.32         {d4[0]}, [r1], r3       @ write 4 bytes of output row 0
    vmull.u8        q4, d2, d28             @ sum for output row 1
    vmlal.u8        q4, d3, d29
    vmlal.u8        q4, d6, d30
    vmlal.u8        q4, d7, d31
    subs            r6, r6, #2              @ two rows done
    vqrshrun.s16    d8, q4, #6
    vst1.32         {d8[0]}, [r1], r3       @ write 4 bytes of output row 1
    bne             .Lwd2_row_pair          @ loop while rows remain

.Lreturn:
    vpop            {d8-d15}                @ restore NEON registers d8-d15
    pop             {r4-r12, pc}            @ restore core registers and return
