@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@******************************************************************************
@* @file
@*  ih264_default_weighted_pred_a9q.s
@*
@* @brief
@*  Contains function definitions for default weighted prediction.
@*
@* @author
@*  Kaushik Senthoor R
@*
@* @par List of Functions:
@*
@*  - ih264_default_weighted_pred_luma_a9q()
@*  - ih264_default_weighted_pred_chroma_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*
@*******************************************************************************
@* @function
@*  ih264_default_weighted_pred_luma_a9q()
@*
@* @brief
@*  This routine performs the default weighted prediction as described in sec
@* 8.4.2.3.1 titled "Default weighted sample prediction process" for luma.
@*
@* @par Description:
@*  This function gets two ht x wd blocks, calculates their rounded-average and
@* stores it in the destination block.
@*
@* @param[in] pu1_src1:
@*  UWORD8 Pointer to the buffer containing the first input block.
@*
@* @param[in] pu1_src2:
@*  UWORD8 Pointer to the buffer containing the second input block.
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination where the output block is stored.
@*
@* @param[in] src_strd1
@*  Stride of the first input buffer
@*
@* @param[in] src_strd2
@*  Stride of the second input buffer
@*
@* @param[in] dst_strd
@*  Stride of the destination buffer
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*  None
@*
@* @remarks
@*  (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
@*
@*******************************************************************************
@*
@void ih264_default_weighted_pred_luma_a9q(UWORD8 *pu1_src1,
@                                          UWORD8 *pu1_src2,
@                                          UWORD8 *pu1_dst,
@                                          WORD32 src_strd1,
@                                          WORD32 src_strd2,
@                                          WORD32 dst_strd,
@                                          WORD32 ht,
@                                          WORD32 wd)
@
@**************Variables Vs Registers*****************************************
@   r0      => pu1_src1
@   r1      => pu1_src2
@   r2      => pu1_dst
@   r3      => src_strd1
@   [sp]    => src_strd2 (r4)      (offsets are relative to sp at entry)
@   [sp+4]  => dst_strd  (r5)
@   [sp+8]  => ht        (r6)
@   [sp+12] => wd        (r7)
@
@ Syntax : GNU as, ARM (A32) + NEON. ABI: AAPCS (32-bit).
@ Output : every destination byte is vrhadd.u8 of the two source bytes,
@          i.e. (src1 + src2 + 1) >> 1.
@ Clobber: r0-r2 (post-incremented by their strides), flags, q0-q3, q8-q15.
@          r4-r7 are saved and restored. q4-q7 (d8-d15) are only used, and
@          only saved/restored, on the 16-wide path.
@ Notes  : - wd is dispatched as 16, 8, otherwise the 4-wide path is taken.
@          - Each loop body runs at least once and handles a fixed number of
@            rows (4, 4 and 8 for wd = 4, 8 and 16), so ht is expected to be a
@            positive multiple of that count (see @remarks above).
@          - NEON instructions do not modify the APSR flags, so the result of
@            "subs" is still valid at the "bgt" that closes each loop.
@
.text
.p2align 2

    .global ih264_default_weighted_pred_luma_a9q
    .type   ih264_default_weighted_pred_luma_a9q, %function

ih264_default_weighted_pred_luma_a9q:

    push        {r4-r7, lr}             @save callee-saved r4-r7 + return address (20 bytes)
    ldr         r7, [sp, #32]           @r7 = wd        (entry sp + 12)
    ldr         r4, [sp, #20]           @r4 = src_strd2 (entry sp + 0)
    ldr         r5, [sp, #24]           @r5 = dst_strd  (entry sp + 4)
    ldr         r6, [sp, #28]           @r6 = ht        (entry sp + 8)
    cmp         r7, #16
    beq         .Lluma_wd16             @wd == 16
    cmp         r7, #8
    beq         .Lluma_wd8              @wd == 8
                                        @anything else falls through to the 4-wide path

.Lluma_wd4:                             @4 bytes per row, four rows per iteration

    vld1.32     {d0[0]}, [r0], r3       @src1 row 1
    vld1.32     {d0[1]}, [r0], r3       @src1 row 2
    vld1.32     {d2[0]}, [r1], r4       @src2 row 1
    vld1.32     {d2[1]}, [r1], r4       @src2 row 2

    vld1.32     {d1[0]}, [r0], r3       @src1 row 3
    vld1.32     {d1[1]}, [r0], r3       @src1 row 4
    vrhadd.u8   d0, d0, d2              @rows 1-2: rounded average
    vld1.32     {d3[0]}, [r1], r4       @src2 row 3
    vld1.32     {d3[1]}, [r1], r4       @src2 row 4

    subs        r6, r6, #4              @ht -= 4
    vst1.32     {d0[0]}, [r2], r5       @store dst row 1
    vst1.32     {d0[1]}, [r2], r5       @store dst row 2
    vrhadd.u8   d1, d1, d3              @rows 3-4: rounded average
    vst1.32     {d1[0]}, [r2], r5       @store dst row 3
    vst1.32     {d1[1]}, [r2], r5       @store dst row 4

    bgt         .Lluma_wd4              @more rows left

    pop         {r4-r7, pc}             @restore r4-r7 and return

.Lluma_wd8:                             @8 bytes per row, four rows per iteration

    vld1.8      {d0}, [r0], r3          @src1 row 1
    vld1.8      {d4}, [r1], r4          @src2 row 1
    vld1.8      {d1}, [r0], r3          @src1 row 2
    vld1.8      {d5}, [r1], r4          @src2 row 2
    vld1.8      {d2}, [r0], r3          @src1 row 3
    vrhadd.u8   q0, q0, q2              @rows 1-2 at once (q0 = d0:d1, q2 = d4:d5)
    vld1.8      {d6}, [r1], r4          @src2 row 3
    vld1.8      {d3}, [r0], r3          @src1 row 4
    vrhadd.u8   d2, d2, d6              @row 3
    vld1.8      {d7}, [r1], r4          @src2 row 4

    subs        r6, r6, #4              @ht -= 4
    vst1.8      {d0}, [r2], r5          @store dst row 1
    vrhadd.u8   d3, d3, d7              @row 4
    vst1.8      {d1}, [r2], r5          @store dst row 2
    vst1.8      {d2}, [r2], r5          @store dst row 3
    vst1.8      {d3}, [r2], r5          @store dst row 4

    bgt         .Lluma_wd8              @more rows left

    pop         {r4-r7, pc}             @restore r4-r7 and return

.Lluma_wd16:                            @16 bytes per row, eight rows per iteration

    vpush       {d8-d15}                @q4-q7 are callee-saved and only this path uses them

.Lluma_wd16_rows:

    vld1.8      {q0}, [r0], r3          @src1 row 1
    vld1.8      {q8}, [r1], r4          @src2 row 1
    vld1.8      {q1}, [r0], r3          @src1 row 2
    vld1.8      {q9}, [r1], r4          @src2 row 2
    vrhadd.u8   q0, q0, q8              @row 1
    vld1.8      {q2}, [r0], r3          @src1 row 3
    vld1.8      {q10}, [r1], r4         @src2 row 3
    vrhadd.u8   q1, q1, q9              @row 2
    vld1.8      {q3}, [r0], r3          @src1 row 4
    vld1.8      {q11}, [r1], r4         @src2 row 4
    vrhadd.u8   q2, q2, q10             @row 3
    vld1.8      {q4}, [r0], r3          @src1 row 5
    vld1.8      {q12}, [r1], r4         @src2 row 5
    vrhadd.u8   q3, q3, q11             @row 4
    vld1.8      {q5}, [r0], r3          @src1 row 6
    vld1.8      {q13}, [r1], r4         @src2 row 6
    vrhadd.u8   q4, q4, q12             @row 5
    vld1.8      {q6}, [r0], r3          @src1 row 7
    vld1.8      {q14}, [r1], r4         @src2 row 7
    vrhadd.u8   q5, q5, q13             @row 6
    vld1.8      {q7}, [r0], r3          @src1 row 8
    vld1.8      {q15}, [r1], r4         @src2 row 8

    vrhadd.u8   q6, q6, q14             @row 7
    vst1.8      {q0}, [r2], r5          @store dst row 1
    vst1.8      {q1}, [r2], r5          @store dst row 2
    vrhadd.u8   q7, q7, q15             @row 8
    vst1.8      {q2}, [r2], r5          @store dst row 3
    vst1.8      {q3}, [r2], r5          @store dst row 4
    subs        r6, r6, #8              @ht -= 8
    vst1.8      {q4}, [r2], r5          @store dst row 5
    vst1.8      {q5}, [r2], r5          @store dst row 6
    vst1.8      {q6}, [r2], r5          @store dst row 7
    vst1.8      {q7}, [r2], r5          @store dst row 8

    bgt         .Lluma_wd16_rows        @more rows left

    vpop        {d8-d15}                @restore q4-q7
    pop         {r4-r7, pc}             @restore r4-r7 and return

    .size   ih264_default_weighted_pred_luma_a9q, . - ih264_default_weighted_pred_luma_a9q


@*******************************************************************************
@* @function
@*  ih264_default_weighted_pred_chroma_a9q()
@*
@* @brief
@*  This routine performs the default weighted prediction as described in sec
@* 8.4.2.3.1 titled "Default weighted sample prediction process" for chroma.
@*
@* @par Description:
@*  This function gets two ht x wd blocks, calculates their rounded-average and
@* stores it in the destination block for U and V.
@*
@* @param[in] pu1_src1:
@*  UWORD8 Pointer to the buffer containing the first input block.
@*
@* @param[in] pu1_src2:
@*  UWORD8 Pointer to the buffer containing the second input block.
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination where the output block is stored.
@*
@* @param[in] src_strd1
@*  Stride of the first input buffer
@*
@* @param[in] src_strd2
@*  Stride of the second input buffer
@*
@* @param[in] dst_strd
@*  Stride of the destination buffer
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*  None
@*
@* @remarks
@*  (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
@*
@*******************************************************************************
@*
@void ih264_default_weighted_pred_chroma_a9q(UWORD8 *pu1_src1,
@                                            UWORD8 *pu1_src2,
@                                            UWORD8 *pu1_dst,
@                                            WORD32 src_strd1,
@                                            WORD32 src_strd2,
@                                            WORD32 dst_strd,
@                                            WORD32 ht,
@                                            WORD32 wd)
@
@**************Variables Vs Registers*****************************************
@   r0      => pu1_src1
@   r1      => pu1_src2
@   r2      => pu1_dst
@   r3      => src_strd1
@   [sp]    => src_strd2 (r4)      (offsets are relative to sp at entry)
@   [sp+4]  => dst_strd  (r5)
@   [sp+8]  => ht        (r6)
@   [sp+12] => wd        (r7)
@
@ Syntax : GNU as, ARM (A32) + NEON. ABI: AAPCS (32-bit).
@ Output : every destination byte is vrhadd.u8 of the two source bytes,
@          i.e. (src1 + src2 + 1) >> 1.
@ Rows   : each row moved is 2 * wd bytes (4, 8 or 16 bytes for wd = 2, 4, 8),
@          presumably because U and V samples are interleaved - confirm
@          against the caller.
@ Clobber: r0-r2 (post-incremented by their strides), flags, q0-q3.
@          r4-r7 are saved and restored. q4-q7 (d8-d15) are only used, and
@          only saved/restored, on the wd == 8 path.
@ Notes  : - wd is dispatched as 8, 4, otherwise the wd == 2 path is taken.
@          - Each loop body runs at least once and handles a fixed number of
@            rows (2, 2 and 4 for wd = 2, 4 and 8), so ht is expected to be a
@            positive multiple of that count (see @remarks above).
@          - NEON instructions do not modify the APSR flags, so the result of
@            "subs" is still valid at the "bgt" that closes each loop.
@


    .global ih264_default_weighted_pred_chroma_a9q
    .type   ih264_default_weighted_pred_chroma_a9q, %function

ih264_default_weighted_pred_chroma_a9q:

    push        {r4-r7, lr}             @save callee-saved r4-r7 + return address (20 bytes)
    ldr         r7, [sp, #32]           @r7 = wd        (entry sp + 12)
    ldr         r4, [sp, #20]           @r4 = src_strd2 (entry sp + 0)
    ldr         r5, [sp, #24]           @r5 = dst_strd  (entry sp + 4)
    ldr         r6, [sp, #28]           @r6 = ht        (entry sp + 8)
    cmp         r7, #8
    beq         .Lchroma_wd8            @wd == 8
    cmp         r7, #4
    beq         .Lchroma_wd4            @wd == 4
                                        @anything else falls through to the wd == 2 path

.Lchroma_wd2:                           @4 bytes per row, two rows per iteration

    vld1.32     {d0[0]}, [r0], r3       @src1 row 1
    vld1.32     {d0[1]}, [r0], r3       @src1 row 2

    vld1.32     {d1[0]}, [r1], r4       @src2 row 1
    vld1.32     {d1[1]}, [r1], r4       @src2 row 2

    vrhadd.u8   d0, d0, d1              @rows 1-2: rounded average

    subs        r6, r6, #2              @ht -= 2
    vst1.32     {d0[0]}, [r2], r5       @store dst row 1
    vst1.32     {d0[1]}, [r2], r5       @store dst row 2

    bgt         .Lchroma_wd2            @more rows left

    pop         {r4-r7, pc}             @restore r4-r7 and return

.Lchroma_wd4:                           @8 bytes per row, two rows per iteration

    vld1.8      {d0}, [r0], r3          @src1 row 1
    vld1.8      {d2}, [r1], r4          @src2 row 1
    vld1.8      {d1}, [r0], r3          @src1 row 2
    vrhadd.u8   d0, d0, d2              @row 1
    vld1.8      {d3}, [r1], r4          @src2 row 2

    vrhadd.u8   d1, d1, d3              @row 2
    vst1.8      {d0}, [r2], r5          @store dst row 1
    subs        r6, r6, #2              @ht -= 2
    vst1.8      {d1}, [r2], r5          @store dst row 2

    bgt         .Lchroma_wd4            @more rows left

    pop         {r4-r7, pc}             @restore r4-r7 and return

.Lchroma_wd8:                           @16 bytes per row, four rows per iteration

    vpush       {d8-d15}                @q4-q7 are callee-saved and only this path uses them

.Lchroma_wd8_rows:

    vld1.8      {q0}, [r0], r3          @src1 row 1
    vld1.8      {q4}, [r1], r4          @src2 row 1
    vld1.8      {q1}, [r0], r3          @src1 row 2
    vrhadd.u8   q0, q0, q4              @row 1
    vld1.8      {q5}, [r1], r4          @src2 row 2
    vld1.8      {q2}, [r0], r3          @src1 row 3
    vrhadd.u8   q1, q1, q5              @row 2
    vld1.8      {q6}, [r1], r4          @src2 row 3
    vld1.8      {q3}, [r0], r3          @src1 row 4
    vrhadd.u8   q2, q2, q6              @row 3
    vld1.8      {q7}, [r1], r4          @src2 row 4

    vst1.8      {q0}, [r2], r5          @store dst row 1
    vrhadd.u8   q3, q3, q7              @row 4
    vst1.8      {q1}, [r2], r5          @store dst row 2
    subs        r6, r6, #4              @ht -= 4
    vst1.8      {q2}, [r2], r5          @store dst row 3
    vst1.8      {q3}, [r2], r5          @store dst row 4

    bgt         .Lchroma_wd8_rows       @more rows left

    vpop        {d8-d15}                @restore q4-q7
    pop         {r4-r7, pc}             @restore r4-r7 and return

    .size   ih264_default_weighted_pred_chroma_a9q, . - ih264_default_weighted_pred_chroma_a9q